diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml index fee0561961..b761a49d4e 100644 --- a/doc/src/sgml/func.sgml +++ b/doc/src/sgml/func.sgml @@ -3140,6 +3140,31 @@ repeat('Pg', 4) PgPgPgPg + + + regexp_positions + + regexp_positions ( string text, pattern text [, flags text ] ) + setof record + ( start_pos integer[], + end_pos integer[] ) + + + Returns start and end positions of captured substring(s) resulting from matching a POSIX regular + expression to the string; see + . + + + regexp_positions('foobarbequebaz', 'ba.', 'g') + + + ({3},{6}) + ({11},{14}) + + + + + regexp_replace diff --git a/src/backend/utils/adt/regexp.c b/src/backend/utils/adt/regexp.c index a32c5c82ab..af3564e3e9 100644 --- a/src/backend/utils/adt/regexp.c +++ b/src/backend/utils/adt/regexp.c @@ -118,6 +118,7 @@ static regexp_matches_ctx *setup_regexp_matches(text *orig_str, text *pattern, bool ignore_degenerate, bool fetching_unmatched); static ArrayType *build_regexp_match_result(regexp_matches_ctx *matchctx); +static Datum build_regexp_positions_result(regexp_matches_ctx *matchctx, TupleDesc tupdesc); static Datum build_regexp_split_result(regexp_matches_ctx *splitctx); @@ -1056,6 +1057,66 @@ regexp_matches(PG_FUNCTION_ARGS) SRF_RETURN_DONE(funcctx); } +/* + * regexp_positions() + * Return a setof record of positions where a pattern matches within a string. + */ +Datum +regexp_positions(PG_FUNCTION_ARGS) +{ + FuncCallContext *funcctx; + regexp_matches_ctx *matchctx; + + if (SRF_IS_FIRSTCALL()) + { + text *pattern = PG_GETARG_TEXT_PP(1); + text *flags = PG_GETARG_TEXT_PP_IF_EXISTS(2); + pg_re_flags re_flags; + MemoryContext oldcontext; + TupleDesc tupdesc; + + funcctx = SRF_FIRSTCALL_INIT(); + oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); + + tupdesc = CreateTemplateTupleDesc(2); + TupleDescInitEntry(tupdesc, (AttrNumber) 1, "start_pos", + INT4ARRAYOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 2, "end_pos", + INT4ARRAYOID, -1, 0); + funcctx->tuple_desc = BlessTupleDesc(tupdesc); + + /* Determine options */ + parse_re_flags(&re_flags, flags); + + /* be sure to copy the input string into the multi-call ctx */ + matchctx = setup_regexp_matches(PG_GETARG_TEXT_P_COPY(0), pattern, + &re_flags, + PG_GET_COLLATION(), + true, false, false); + + /* Pre-create workspace that build_regexp_positions_result needs */ + matchctx->elems = (Datum *) palloc(sizeof(Datum) * matchctx->npatterns * 2); + matchctx->nulls = (bool *) palloc(sizeof(bool) * matchctx->npatterns); + + MemoryContextSwitchTo(oldcontext); + funcctx->user_fctx = (void *) matchctx; + } + + funcctx = SRF_PERCALL_SETUP(); + matchctx = (regexp_matches_ctx *) funcctx->user_fctx; + + if (matchctx->next_match < matchctx->nmatches) + { + Datum tuple; + + tuple = build_regexp_positions_result(matchctx, funcctx->tuple_desc); + matchctx->next_match++; + SRF_RETURN_NEXT(funcctx, tuple); + } + + SRF_RETURN_DONE(funcctx); +} + /* This is separate to keep the opr_sanity regression test from complaining */ Datum regexp_matches_no_flags(PG_FUNCTION_ARGS) @@ -1063,6 +1124,13 @@ regexp_matches_no_flags(PG_FUNCTION_ARGS) return regexp_matches(fcinfo); } +/* This is separate to keep the opr_sanity regression test from complaining */ +Datum +regexp_positions_no_flags(PG_FUNCTION_ARGS) +{ + return regexp_positions(fcinfo); +} + /* * setup_regexp_matches --- do the initial matching for regexp_match * and regexp_split functions @@ -1332,6 +1400,59 @@ build_regexp_match_result(regexp_matches_ctx *matchctx) TEXTOID, -1, false, TYPALIGN_INT); } +/* + * build_regexp_positions_result - build output array for current match + */ +static Datum +build_regexp_positions_result(regexp_matches_ctx *matchctx, TupleDesc tupdesc) +{ + Datum *elems = matchctx->elems; + bool *nulls = matchctx->nulls; + int dims[1]; + int lbs[1]; + int loc; + int i; + ArrayType *so_ary; + ArrayType *eo_ary; + Datum values[2]; + bool tuple_nulls[2] = {false, false}; + HeapTuple tuple; + + /* Extract matching positions from the original string */ + loc = matchctx->next_match * matchctx->npatterns * 2; + for (i = 0; i < matchctx->npatterns; i++) + { + int so = matchctx->match_locs[loc++]; + int eo = matchctx->match_locs[loc++]; + + if (so < 0 || eo < 0) + { + elems[i] = (Datum) 0; + elems[i + matchctx->npatterns] = (Datum) 0; + nulls[i] = true; + } + else + { + elems[i] = Int32GetDatum(so); + elems[i + matchctx->npatterns] = Int32GetDatum(eo); + nulls[i] = false; + } + } + + /* And form two arrays */ + dims[0] = matchctx->npatterns; + lbs[0] = 1; + so_ary = construct_md_array(elems, nulls, 1, dims, lbs, + INT4OID, 4, true, TYPALIGN_INT); + eo_ary = construct_md_array(elems + matchctx->npatterns, nulls, 1, dims, lbs, + INT4OID, 4, true, TYPALIGN_INT); + values[0] = PointerGetDatum(so_ary); + values[1] = PointerGetDatum(eo_ary); + tuple = heap_form_tuple(tupdesc, values, tuple_nulls); + return HeapTupleGetDatum(tuple); + +} + /* * regexp_split_to_table() * Split the string at matches of the pattern, returning the diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index 59d2b71ca9..51ac045b70 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -3560,6 +3560,23 @@ proname => 'regexp_matches', prorows => '10', proretset => 't', prorettype => '_text', proargtypes => 'text text text', prosrc => 'regexp_matches' }, + +{ oid => '8104', descr => 'find matching position(s) for regexp', + proname => 'regexp_positions', prorows => '10', + proretset => 't', prorettype => 'record', + proargtypes => 'text text', proallargtypes => '{text,text,_int4,_int4}', + proargmodes => '{i,i,o,o}', + proargnames => '{string,pattern,start_pos,end_pos}', + prosrc => 'regexp_positions_no_flags' }, + +{ oid => '8105', descr => 'find matching position(s) for regexp', + proname => 'regexp_positions', prorows => '10', + proretset => 't', prorettype => 'record', + proargtypes => 'text text text', proallargtypes => '{text,text,text,_int4,_int4}', + proargmodes => '{i,i,i,o,o}', + proargnames => '{string,pattern,flags,start_pos,end_pos}', + prosrc => 'regexp_positions' }, + { oid => '2088', descr => 'split string by field_sep and return field_num', proname => 'split_part', prorettype => 'text', proargtypes => 'text text int4', prosrc => 'split_part' }, diff --git a/src/test/regress/expected/strings.out b/src/test/regress/expected/strings.out index fb4573d85f..adcc2bea50 100644 --- a/src/test/regress/expected/strings.out +++ b/src/test/regress/expected/strings.out @@ -601,6 +601,12 @@ SELECT regexp_matches('foobarbequebaz', $re$(bar)(beque)$re$); {bar,beque} (1 row) +SELECT regexp_positions('foobarbequebaz', $re$(bar)(beque)$re$); + regexp_positions +-------------------- + ("{3,6}","{6,11}") +(1 row) + -- test case insensitive SELECT regexp_matches('foObARbEqUEbAz', $re$(bar)(beque)$re$, 'i'); regexp_matches @@ -616,6 +622,13 @@ SELECT regexp_matches('foobarbequebazilbarfbonk', $re$(b[^b]+)(b[^b]+)$re$, 'g') {bazil,barf} (2 rows) +SELECT regexp_positions('foobarbequebazilbarfbonk', $re$(b[^b]+)(b[^b]+)$re$, 'g'); + regexp_positions +----------------------- + ("{3,6}","{6,11}") + ("{11,16}","{16,20}") +(2 rows) + -- empty capture group (matched empty string) SELECT regexp_matches('foobarbequebaz', $re$(bar)(.*)(beque)$re$); regexp_matches diff --git a/src/test/regress/sql/strings.sql b/src/test/regress/sql/strings.sql index 57a48c9d0b..aa8b0553f0 100644 --- a/src/test/regress/sql/strings.sql +++ b/src/test/regress/sql/strings.sql @@ -198,12 +198,14 @@ SELECT regexp_replace('AAA aaa', 'A+', 'Z', 'z'); -- return all matches from regexp SELECT regexp_matches('foobarbequebaz', $re$(bar)(beque)$re$); +SELECT regexp_positions('foobarbequebaz', $re$(bar)(beque)$re$); -- test case insensitive SELECT regexp_matches('foObARbEqUEbAz', $re$(bar)(beque)$re$, 'i'); -- global option - more than one match SELECT regexp_matches('foobarbequebazilbarfbonk', $re$(b[^b]+)(b[^b]+)$re$, 'g'); +SELECT regexp_positions('foobarbequebazilbarfbonk', $re$(b[^b]+)(b[^b]+)$re$, 'g'); -- empty capture group (matched empty string) SELECT regexp_matches('foobarbequebaz', $re$(bar)(.*)(beque)$re$);