Thursday, October 28, 2010

Simple counter for batch operations

Hello I am working with large transformations now. It is some like:
CREATE TABLE source_table(..);
CREATE TABLE destination_table(..);

/* transformation function */
CREATE OR REPLACE FUNCTION to_destination_table(source_table) RETURNS RECORD AS $$ ...

/* transformation */
INSERT INTO destination_table
   SELECT (r).*
  FROM (SELECT to_destionation_table(_source) r
           FROM source _source) x
The batch operation is relative long. so I had to wrote simple counter. It is simple function, that counts a call and returns input value without any change. My function has a three parameters, first is a value for forwarding, second is a frequency of notification, and third parameter specifies a printing input value:
nic=# select sum(counter(v, 20, false)) from generate_series(1,100) g(v);
NOTICE:  processed 20 rows
NOTICE:  processed 40 rows
NOTICE:  processed 60 rows
NOTICE:  processed 80 rows
NOTICE:  processed 100 rows
 sum  
------
 5050
(1 row)

nic=# select sum(counter(v, 20, true)) from generate_series(1,100) g(v);
NOTICE:  processed 20 rows, current value is '20'
NOTICE:  processed 40 rows, current value is '40'
NOTICE:  processed 60 rows, current value is '60'
NOTICE:  processed 80 rows, current value is '80'
NOTICE:  processed 100 rows, current value is '100'
 sum  
------
 5050
(1 row)
This function is very simple - see source code:
#include "funcapi.h"
#include "utils/lsyscache.h"

PG_FUNCTION_INFO_V1(pst_counter);

Datum pst_counter(PG_FUNCTION_ARGS);

typedef struct
{
 long int iterations;
 int freq;
 Oid  typoutput;
} counter_cache;

/*
 * raise notice every n call,
 * returns input without change
 */
Datum
pst_counter(PG_FUNCTION_ARGS)
{
 Datum value = PG_GETARG_DATUM(0);
 counter_cache *ptr = (counter_cache *) fcinfo->flinfo->fn_extra;

 if (ptr == NULL)
 {

  fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
          sizeof(counter_cache));
  ptr = (counter_cache *) fcinfo->flinfo->fn_extra;
  ptr->iterations = 0;
  ptr->typoutput = InvalidOid;

  if (PG_ARGISNULL(1))
   elog(ERROR, "second parameter (output frequency) must not be NULL");

  ptr->freq = PG_GETARG_INT32(1);

  if (!PG_ARGISNULL(2) && PG_GETARG_BOOL(2))
  {
   Oid valtype;
   Oid typoutput;
   bool typIsVarlena;

   valtype = get_fn_expr_argtype(fcinfo->flinfo, 0);
   getTypeOutputInfo(valtype, &typoutput, &typIsVarlena);
   ptr->typoutput = typoutput;
  }
 }
 
 if (++ptr->iterations % ptr->freq == 0)
 {
  if (!OidIsValid(ptr->typoutput))
  {
   elog(NOTICE, "processed %ld rows", ptr->iterations);
  }
  else
  {
   /* show a processed row, when it's requested */
   if (PG_ARGISNULL(0))
    elog(NOTICE, "processed %ld rows, current value is null", ptr->iterations);
   else
   {
    elog(NOTICE, "processed %ld rows, current value is '%s'", ptr->iterations,
              OidOutputFunctionCall(ptr->typoutput, value));
   }
  }
 }
 

 PG_RETURN_DATUM(value);
}


CREATE OR REPLACE FUNCTION counter(anyelement, int, bool)
RETURNS anyelement
AS 'MODULE_PATHNAME','pst_counter' LANGUAGE C;

And usage isn't complex too:
/* transformation */
INSERT INTO destination_table
   SELECT (r).*
  FROM (SELECT counter(to_destionation_table(_source), 1000, true) r
           FROM source _source) x
real output:
psql84:convert.sql:183: NOTICE:  processed 1230000 rows, current value is '(959674199,"2010-10-25 23:50:29.404568","2010-10-25 23:50:29.491842",,3,501,190000029,XXXXXX,f,9,21)'
psql84:convert.sql:183: NOTICE:  processed 1240000 rows, current value is '(959473959,"2010-10-25 19:20:52.498152","2010-10-25 19:20:52.564384",,3,501,189854563,XXXXXX,f,9,23)'
psql84:convert.sql:183: NOTICE:  processed 1250000 rows, current value is '(958884965,"2010-10-25 08:53:30.026877","2010-10-25 08:53:30.106039",,3,101,189849045,XXXXXX,f,13,48)'
psql84:convert.sql:183: NOTICE:  processed 1260000 rows, current value is '(959483959,"2010-10-25 19:32:02.136491","2010-10-25 19:32:04.070922",,3,500,189969079,XXXXXX,f,9,48)'
psql84:convert.sql:183: NOTICE:  processed 1270000 rows, current value is '(959488959,"2010-10-25 19:36:40.691078","2010-10-25 19:36:41.155615",,3,500,189037026,,f,9,)'
psql84:convert.sql:183: NOTICE:  processed 1280000 rows, current value is '(959493959,"2010-10-25 19:41:59.21508","2010-10-25 19:41:59.244398",,3,101,189971167,XXXXXX,f,13,42)'
psql84:convert.sql:183: NOTICE:  processed 1290000 rows, current value is '(959498959,"2010-10-25 19:49:22.494294","2010-10-25 19:49:22.516124",,3,1010,189971599,XXXXXX,f,9,53)'

Thursday, October 21, 2010

diff string for PostgreSQL

Hello, I am working on migration a large dataset from archive to a new format. This work is joined with searching a small differences between original and transformed data. For this work I implemented a small library for PostgreSQL. It contains a two functions: diff_string and lc_substring.These functions should to support multibyte encoding. I hope, so this can be useful for someone.
postgres=# select lc_substring('Hello World','ello');
 lc_substring 
──────────────
 ello
(1 row)

postgres=# select diff_string('Hello World','ello');
         diff_string         
─────────────────────────────
 <del>H</>ello<del> World</>
(1 row)
This library exists on pgFoundry http://pgfoundry.org/frs/?group_id=1000457

Wednesday, October 13, 2010

use a plpgsql well

Hello

I found a following code on net. It's good example of bad code (written by PHP coder):
01.CREATE OR REPLACE FUNCTION delete_data(IN data integer[]) RETURNS integer AS
02.--DECLARATION OF FUNCTION
03.$$
04.DECLARE
05.--DECLARATION OF LOCAL VARIABLES
06.sql varchar;
07.i integer;
08.BEGIN
09.--START OF THE PROCEDURE
10.i := 0;
11.loop
12.--LOOP THROUGH AOUR ARRAY OF DATA
13.if (data[i][1] IS NULL) then
14.return 1;
15.--IF WE LOOPED THROUG ALL OF ARRAY, IT'S DONE
16.exit;
17.end if;
18.sql := 'DELETE FROM data_table WHERE data_table.id='||data[i];
19.--THE SQL ITSELF, COMBINED WITH VARIABLE
20.execute(sql);
21.--THE EXECUTE FUNCTION BUILT IN PL/pgSQL, EXECUTES VARCHAR SQLs
22.i := i + 1;
23.end loop;
24.--END OF THE PROCEDURE
25. 
26.END;
27.--END OF FUNCTION'S LOGIC
28. 
29.$$
30.LANGUAGE 'plpgsql';
31.--DECLARATION OF LANGUAGE USED
what is wrong:
  • using a dynamic SQL - really, there cannot be a dynamic SQL
  • this code is cryptographic - It needs big fantasy to see iteration over array in this code
  • inconsistency - keywords, useless braces in IF statement (PL/pgSQL isn't PHP)
  • useless return value - not handled case, when input array is empty
why the coder didn't write just?:
01.CREATE OR REPLACE FUNCTION delete_data(IN data integer[]) 
02.RETURNS void AS $$
03.BEGIN
04.  FOR i IN array_lower(data,1)..array_upper(data,1)
05.  LOOP
06.    DELETE FROM data_table WHERE data_table.id = data[i];
07.  END LOOP;
08.END;
09.$$ LANGUAGE plpgsql; 
or better (if you like procedures and dislike SQL in your app. code) (inpiration by Thermic):
  CREATE OR REPLACE FUNCTION delete_data(data integer[])
  RETURNS void AS $$
    DELETE FROM data_table WHERE data_table.id = ANY($1)
  $$ LANGUAGE sql;
note: you can use a plpgsql or sql. Both environments has a few advantages and disadvantages. For using in web environments I little bit prefer SQL language (when it's called once per session, when size of input array can be significantly different).