Patch: regexp_matches variant returning an array of matching positions

Started by Björn Harrtellalmost 12 years ago6 messages

bjorn.harrtell@gmail.com

almost 12 years ago

1 attachment(s)

I've written a variant of regexp_matches called regexp_matches_positions
which instead of returning matching substrings will return matching
positions. I found use of this when processing OCR scanned text and wanted
to prioritize matches based on their position.

The patch is for discussion. I'd also appriciate general suggestions as
this is my first experience with the postgresql code base.

The patch is against the master branch and includes a simple regression
test.

Attachments:

regexp_matches_positions_v1.difftext/plain; charset=US-ASCII; name=regexp_matches_positions_v1.diffDownload

*** /tmp/DQoMjJ_regexp.c	2014-01-28 19:59:37.470271459 +0100
--- src/backend/utils/adt/regexp.c	2014-01-28 19:44:47.298288383 +0100
***************
*** 113,118 ****
--- 113,119 ----
  					 bool ignore_degenerate);
  static void cleanup_regexp_matches(regexp_matches_ctx *matchctx);
  static ArrayType *build_regexp_matches_result(regexp_matches_ctx *matchctx);
+ static ArrayType *build_regexp_matches_positions_result(regexp_matches_ctx *matchctx);
  static Datum build_regexp_split_result(regexp_matches_ctx *splitctx);
  
  
***************
*** 833,838 ****
--- 834,898 ----
  	return regexp_matches(fcinfo);
  }
  
+ 
+ /*
+  * regexp_matches_positions()
+  *		Return a table of matched locations of a pattern within a string.
+  */
+ Datum
+ regexp_matches_positions(PG_FUNCTION_ARGS)
+ {
+ 	FuncCallContext *funcctx;
+ 	regexp_matches_ctx *matchctx;
+ 
+ 	if (SRF_IS_FIRSTCALL())
+ 	{
+ 		text	   *pattern = PG_GETARG_TEXT_PP(1);
+ 		text	   *flags = PG_GETARG_TEXT_PP_IF_EXISTS(2);
+ 		MemoryContext oldcontext;
+ 
+ 		funcctx = SRF_FIRSTCALL_INIT();
+ 		oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
+ 
+ 		/* be sure to copy the input string into the multi-call ctx */
+ 		matchctx = setup_regexp_matches(PG_GETARG_TEXT_P_COPY(0), pattern,
+ 										flags,
+ 										PG_GET_COLLATION(),
+ 										false, true, false);
+ 
+ 		/* Pre-create workspace that build_regexp_matches_positions_result needs */
+ 		matchctx->elems = (Datum *) palloc(sizeof(Datum) * matchctx->npatterns);
+ 		matchctx->nulls = (bool *) palloc(sizeof(bool) * matchctx->npatterns);
+ 
+ 		MemoryContextSwitchTo(oldcontext);
+ 		funcctx->user_fctx = (void *) matchctx;
+ 	}
+ 
+ 	funcctx = SRF_PERCALL_SETUP();
+ 	matchctx = (regexp_matches_ctx *) funcctx->user_fctx;
+ 
+ 	if (matchctx->next_match < matchctx->nmatches)
+ 	{
+ 		ArrayType  *result_ary;
+ 
+ 		result_ary = build_regexp_matches_positions_result(matchctx);
+ 		matchctx->next_match++;
+ 		SRF_RETURN_NEXT(funcctx, PointerGetDatum(result_ary));
+ 	}
+ 
+ 	/* release space in multi-call ctx to avoid intraquery memory leak */
+ 	cleanup_regexp_matches(matchctx);
+ 
+ 	SRF_RETURN_DONE(funcctx);
+ }
+ 
+ /* This is separate to keep the opr_sanity regression test from complaining */
+ Datum
+ regexp_matches_positions_no_flags(PG_FUNCTION_ARGS)
+ {
+ 	return regexp_matches_positions(fcinfo);
+ }
+ 
  /*
   * setup_regexp_matches --- do the initial matching for regexp_matches()
   *		or regexp_split()
***************
*** 1035,1040 ****
--- 1095,1140 ----
  }
  
  /*
+  * build_regexp_matches_positions_result - build output array for current match
+  */
+ static ArrayType *
+ build_regexp_matches_positions_result(regexp_matches_ctx *matchctx)
+ {
+ 	Datum	   *elems = matchctx->elems;
+ 	bool	   *nulls = matchctx->nulls;
+ 	int			dims[1];
+ 	int			lbs[1];
+ 	int			loc;
+ 	int			i;
+ 
+ 	/* Extract matching substrings from the original string */
+ 	loc = matchctx->next_match * matchctx->npatterns * 2;
+ 	for (i = 0; i < matchctx->npatterns; i++)
+ 	{
+ 		int			so = matchctx->match_locs[loc++];
+ 		int			eo = matchctx->match_locs[loc++];
+ 
+ 		if (so < 0 || eo < 0)
+ 		{
+ 			elems[i] = (Datum) 0;
+ 			nulls[i] = true;
+ 		}
+ 		else
+ 		{
+ 			elems[i] = Int32GetDatum(so)+1;
+ 			nulls[i] = false;
+ 		}
+ 	}
+ 
+ 	/* And form an array */
+ 	dims[0] = matchctx->npatterns;
+ 	lbs[0] = 1;
+ 	/* XXX: this hardcodes assumptions about the int4 type */
+ 	return construct_md_array(elems, nulls, 1, dims, lbs,
+ 							  INT4OID, 4, true, 'i');
+ }
+ 
+ /*
   * regexp_split_to_table()
   *		Split the string at matches of the pattern, returning the
   *		split-out substrings as a table.
*** /tmp/xPfd4G_pg_proc.h	2014-01-28 19:59:37.478271459 +0100
--- src/include/catalog/pg_proc.h	2014-01-28 19:44:47.298288383 +0100
***************
*** 1899,1904 ****
--- 1899,1908 ----
  DESCR("find all match groups for regexp");
  DATA(insert OID =  2764 ( regexp_matches   PGNSP PGUID 12 1 10 0 0 f f f f t t i 3 0 1009 "25 25 25" _null_ _null_ _null_ _null_ regexp_matches _null_ _null_ _null_ ));
  DESCR("find all match groups for regexp");
+ DATA(insert OID =  7769 ( regexp_matches_positions   PGNSP PGUID 12 1 1 0 0 f f f f t t i 2 0 1009 "25 25" _null_ _null_ _null_ _null_ regexp_matches_positions_no_flags _null_ _null_ _null_ ));
+ DESCR("find all match positions for regexp");
+ DATA(insert OID =  7770 ( regexp_matches_positions   PGNSP PGUID 12 1 10 0 0 f f f f t t i 3 0 1009 "25 25 25" _null_ _null_ _null_ _null_ regexp_matches_positions _null_ _null_ _null_ ));
+ DESCR("find all match positions for regexp");
  DATA(insert OID =  2088 ( split_part   PGNSP PGUID 12 1 0 0 0 f f f f t f i 3 0 25 "25 25 23" _null_ _null_ _null_ _null_	split_text _null_ _null_ _null_ ));
  DESCR("split string by field_sep and return field_num");
  DATA(insert OID =  2765 ( regexp_split_to_table PGNSP PGUID 12 1 1000 0 0 f f f f t t i 2 0 25 "25 25" _null_ _null_ _null_ _null_	regexp_split_to_table_no_flags _null_ _null_ _null_ ));
*** /tmp/lXOnOH_builtins.h	2014-01-28 19:59:37.490271458 +0100
--- src/include/utils/builtins.h	2014-01-28 19:44:47.302288383 +0100
***************
*** 587,592 ****
--- 587,594 ----
  extern Datum similar_escape(PG_FUNCTION_ARGS);
  extern Datum regexp_matches(PG_FUNCTION_ARGS);
  extern Datum regexp_matches_no_flags(PG_FUNCTION_ARGS);
+ extern Datum regexp_matches_positions(PG_FUNCTION_ARGS);
+ extern Datum regexp_matches_no_flags_positions(PG_FUNCTION_ARGS);
  extern Datum regexp_split_to_table(PG_FUNCTION_ARGS);
  extern Datum regexp_split_to_table_no_flags(PG_FUNCTION_ARGS);
  extern Datum regexp_split_to_array(PG_FUNCTION_ARGS);
*** /tmp/LiDRpL_strings.out	2014-01-28 19:59:37.498271458 +0100
--- src/test/regress/expected/strings.out	2014-01-28 19:44:47.302288383 +0100
***************
*** 505,510 ****
--- 505,517 ----
  ERROR:  invalid regular expression: parentheses () not balanced
  SELECT regexp_matches('foobarbequebaz', $re$(bar)(beque){2,1}$re$);
  ERROR:  invalid regular expression: invalid repetition count(s)
+ -- return all match positions from regexp
+ SELECT regexp_matches_positions('foobarbequebaz', $re$(bar)(beque)$re$);
+  regexp_matches_positions 
+ --------------------------
+  {4,7}
+ (1 row)
+ 
  -- split string on regexp
  SELECT foo, length(foo) FROM regexp_split_to_table('the quick brown fox jumps over the lazy dog', $re$\s+$re$) AS foo;
    foo  | length 
*** /tmp/h7moDJ_strings.sql	2014-01-28 19:59:37.506271458 +0100
--- src/test/regress/sql/strings.sql	2014-01-28 19:44:47.302288383 +0100
***************
*** 170,175 ****
--- 170,178 ----
  SELECT regexp_matches('foobarbequebaz', $re$(barbeque$re$);
  SELECT regexp_matches('foobarbequebaz', $re$(bar)(beque){2,1}$re$);
  
+ -- return all match positions from regexp
+ SELECT regexp_matches_positions('foobarbequebaz', $re$(bar)(beque)$re$);
+ 
  -- split string on regexp
  SELECT foo, length(foo) FROM regexp_split_to_table('the quick brown fox jumps over the lazy dog', $re$\s+$re$) AS foo;
  SELECT regexp_split_to_array('the quick brown fox jumps over the lazy dog', $re$\s+$re$);

Alvaro Herrera

alvherre@2ndquadrant.com

almost 12 years ago

In reply to: Björn Harrtell (#1)

Re: Patch: regexp_matches variant returning an array of matching positions

Bjï¿½rn Harrtell wrote:

I've written a variant of regexp_matches called regexp_matches_positions
which instead of returning matching substrings will return matching
positions. I found use of this when processing OCR scanned text and wanted
to prioritize matches based on their position.

Interesting. I didn't read the patch but I wonder if it would be of
more general applicability to return more info in a fell swoop a
function returning a set (position, length, text of match), rather than
an array. So instead of first calling one function to get the match and
then their positions, do it all in one pass.

(See pg_event_trigger_dropped_objects for a simple example of a function
that returns in that fashion. There are several others but AFAIR that's
the simplest one.)

--
ï¿½lvaro Herrera http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

David Johnston

polobo@yahoo.com

almost 12 years ago

In reply to: Alvaro Herrera (#2)

Re: Patch: regexp_matches variant returning an array of matching positions

Alvaro Herrera-9 wrote

Björn Harrtell wrote:

I've written a variant of regexp_matches called regexp_matches_positions
which instead of returning matching substrings will return matching
positions. I found use of this when processing OCR scanned text and
wanted
to prioritize matches based on their position.

Interesting. I didn't read the patch but I wonder if it would be of
more general applicability to return more info in a fell swoop a
function returning a set (position, length, text of match), rather than
an array. So instead of first calling one function to get the match and
then their positions, do it all in one pass.

(See pg_event_trigger_dropped_objects for a simple example of a function
that returns in that fashion. There are several others but AFAIR that's
the simplest one.)

Confused as to your thinking. Like regexp_matches this returns "SETOF
type[]". In this case integer but text for the matches. I could see adding
a generic function that returns a SETOF named composite (match varchar[],
position int[], length int[]) and the corresponding type. I'm not imagining
a situation where you'd want the position but not the text and so having to
evaluate the regexp twice seems wasteful. The length is probably a waste
though since it can readily be gotten from the text and is less often
needed. But if it's pre-calculated anyway...

My question is what position is returned in a multiple-match situation? The
supplied test only covers the simple, non-global, situation. It needs to
exercise empty sub-matches and global searches. One theory is that the
first array slot should cover the global position of match zero (i.e., the
entire pattern) within the larger document while sub-matches would be
relative offsets within that single match. This conflicts, though, with the
fact that _matches only returns array elements for () items and never for
the full match - the goal in this function being parallel un-nesting. But as
nesting is allowed it is still possible to have occur.

How does this resolve in the patch?

SELECT regexp_matches('abcabc','((a)(b)(c))','g');

David J.

--
View this message in context: http://postgresql.1045698.n5.nabble.com/Patch-regexp-matches-variant-returning-an-array-of-matching-positions-tp5789321p5789414.html
Sent from the PostgreSQL - hackers mailing list archive at Nabble.com.

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Erik Rijkers

er@xs4all.nl

almost 12 years ago

In reply to: David Johnston (#3)

Re: Re: Patch: regexp_matches variant returning an array of matching positions

On Wed, January 29, 2014 05:16, David Johnston wrote:

How does this resolve in the patch?

SELECT regexp_matches('abcabc','((a)(b)(c))','g');

With the patch:

testdb=# SELECT regexp_matches('abcabc','((a)(b)(c))','g'), regexp_matches_positions('abcabc','((a)(b)(c))');
regexp_matches | regexp_matches_positions
----------------+--------------------------
{abc,a,b,c} | {1,1,2,3}
{abc,a,b,c} | {1,1,2,3}
(2 rows)

testdb=# SELECT regexp_matches('abcabc','((a)(b)(c))','g'), regexp_matches_positions('abcabc','((a)(b)(c))', 'g');
regexp_matches | regexp_matches_positions
----------------+--------------------------
{abc,a,b,c} | {1,1,2,3}
{abc,a,b,c} | {4,4,5,6}
(2 rows)

( in HEAD:

testdb=# SELECT regexp_matches('abcabc','((a)(b)(c))','g');
regexp_matches
----------------
{abc,a,b,c}
{abc,a,b,c}
(2 rows)
)

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

David Johnston

polobo@yahoo.com

almost 12 years ago

In reply to: Erik Rijkers (#4)

Re: Patch: regexp_matches variant returning an array of matching positions

Erik Rijkers wrote

On Wed, January 29, 2014 05:16, David Johnston wrote:

How does this resolve in the patch?

SELECT regexp_matches('abcabc','((a)(b)(c))','g');

With the patch:

testdb=# SELECT regexp_matches('abcabc','((a)(b)(c))','g'),
regexp_matches_positions('abcabc','((a)(b)(c))');
regexp_matches | regexp_matches_positions
----------------+--------------------------
{abc,a,b,c} | {1,1,2,3}
{abc,a,b,c} | {1,1,2,3}
(2 rows)

The {1,1,2,3} in the second row is an artifact/copy from
set-value-function-in-select-list repetition and has nothing to do with the
second match.

testdb=# SELECT regexp_matches('abcabc','((a)(b)(c))','g'),
regexp_matches_positions('abcabc','((a)(b)(c))', 'g');
regexp_matches | regexp_matches_positions
----------------+--------------------------
{abc,a,b,c} | {1,1,2,3}
{abc,a,b,c} | {4,4,5,6}
(2 rows)

As expected.

David J.

--
View this message in context: http://postgresql.1045698.n5.nabble.com/Patch-regexp-matches-variant-returning-an-array-of-matching-positions-tp5789321p5789434.html
Sent from the PostgreSQL - hackers mailing list archive at Nabble.com.

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Björn Harrtell

bjorn.harrtell@gmail.com

almost 12 years ago

In reply to: David Johnston (#3)

Re: Patch: regexp_matches variant returning an array of matching positions

I'll elaborate on the use case. I have OCR scanned text for a large amounts
of images, corresponding to one row per image. I want to match against
words in another table. I need two results sets, one with all matched words
and one with only the first matched word within the first 50 chars of the
OCR scanned text. Having the matched position in the first result set makes
it easy to produce the second.

I cannot find the position using the substring because I use word
boundaries in my regexp.

Returning a SETOF named composite makes sense, so I could try to make such
a function instead if there is interest. Perhaps a good name for such a
function would be simply regexp_match och regexp_search (as in python).

/Björn

2014-01-29 David Johnston <polobo@yahoo.com>

Show quoted text

Alvaro Herrera-9 wrote

Björn Harrtell wrote:

I've written a variant of regexp_matches called regexp_matches_positions
which instead of returning matching substrings will return matching
positions. I found use of this when processing OCR scanned text and
wanted
to prioritize matches based on their position.

Interesting. I didn't read the patch but I wonder if it would be of
more general applicability to return more info in a fell swoop a
function returning a set (position, length, text of match), rather than
an array. So instead of first calling one function to get the match and
then their positions, do it all in one pass.

(See pg_event_trigger_dropped_objects for a simple example of a function
that returns in that fashion. There are several others but AFAIR that's
the simplest one.)

Confused as to your thinking. Like regexp_matches this returns "SETOF
type[]". In this case integer but text for the matches. I could see
adding
a generic function that returns a SETOF named composite (match varchar[],
position int[], length int[]) and the corresponding type. I'm not
imagining
a situation where you'd want the position but not the text and so having to
evaluate the regexp twice seems wasteful. The length is probably a waste
though since it can readily be gotten from the text and is less often
needed. But if it's pre-calculated anyway...

My question is what position is returned in a multiple-match situation? The
supplied test only covers the simple, non-global, situation. It needs to
exercise empty sub-matches and global searches. One theory is that the
first array slot should cover the global position of match zero (i.e., the
entire pattern) within the larger document while sub-matches would be
relative offsets within that single match. This conflicts, though, with
the
fact that _matches only returns array elements for () items and never for
the full match - the goal in this function being parallel un-nesting. But
as
nesting is allowed it is still possible to have occur.

How does this resolve in the patch?

SELECT regexp_matches('abcabc','((a)(b)(c))','g');

David J.

--
View this message in context:
http://postgresql.1045698.n5.nabble.com/Patch-regexp-matches-variant-returning-an-array-of-matching-positions-tp5789321p5789414.html
Sent from the PostgreSQL - hackers mailing list archive at Nabble.com.