diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml index 08f08322ca..34a338fb21 100644 --- a/doc/src/sgml/func.sgml +++ b/doc/src/sgml/func.sgml @@ -3140,6 +3140,29 @@ repeat('Pg', 4) PgPgPgPg + + + regexp_positions + + regexp_positions ( string text, pattern text [, flags text ] ) + setof int4range[] + + + Returns the start and end positions of the captured substring(s) resulting from matching a POSIX regular + expression to the string; see + <xref linkend="functions-posix-regexp"/>. + + + regexp_positions('foobarbequebaz', 'ba.', 'g') + + + {"[4,7)"} + {"[12,15)"} + + + + + regexp_replace diff --git a/src/backend/utils/adt/regexp.c b/src/backend/utils/adt/regexp.c index a32c5c82ab..fde3d1b80f 100644 --- a/src/backend/utils/adt/regexp.c +++ b/src/backend/utils/adt/regexp.c @@ -37,6 +37,7 @@ #include "utils/builtins.h" #include "utils/memutils.h" #include "utils/varlena.h" +#include "utils/rangetypes.h" #define PG_GETARG_TEXT_PP_IF_EXISTS(_n) \ (PG_NARGS() > (_n) ? PG_GETARG_TEXT_PP(_n) : NULL) @@ -118,6 +119,7 @@ static regexp_matches_ctx *setup_regexp_matches(text *orig_str, text *pattern, bool ignore_degenerate, bool fetching_unmatched); static ArrayType *build_regexp_match_result(regexp_matches_ctx *matchctx); +static ArrayType *build_regexp_positions_result(regexp_matches_ctx *matchctx); static Datum build_regexp_split_result(regexp_matches_ctx *splitctx); @@ -1056,6 +1058,58 @@ regexp_matches(PG_FUNCTION_ARGS) SRF_RETURN_DONE(funcctx); } +/* + * regexp_positions() + * Return a table of ranges where a pattern matches within a string. 
+ */
+Datum
+regexp_positions(PG_FUNCTION_ARGS)
+{
+	FuncCallContext *funcctx;
+	regexp_matches_ctx *matchctx;
+
+	if (SRF_IS_FIRSTCALL())
+	{
+		text	   *pattern = PG_GETARG_TEXT_PP(1);
+		text	   *flags = PG_GETARG_TEXT_PP_IF_EXISTS(2);
+		pg_re_flags re_flags;
+		MemoryContext oldcontext;
+
+		funcctx = SRF_FIRSTCALL_INIT();
+		oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
+
+		/* Determine options from the optional flags argument (NULL if absent) */
+		parse_re_flags(&re_flags, flags);
+
+		/* be sure to copy the input string into the multi-call ctx */
+		matchctx = setup_regexp_matches(PG_GETARG_TEXT_P_COPY(0), pattern,
+										&re_flags,
+										PG_GET_COLLATION(),
+										true, false, false);	/* final two flags: ignore_degenerate, fetching_unmatched */
+
+		/* Pre-create workspace that build_regexp_positions_result needs */
+		matchctx->elems = (Datum *) palloc(sizeof(Datum) * matchctx->npatterns);
+		matchctx->nulls = (bool *) palloc(sizeof(bool) * matchctx->npatterns);
+
+		MemoryContextSwitchTo(oldcontext);
+		funcctx->user_fctx = (void *) matchctx;
+	}
+
+	funcctx = SRF_PERCALL_SETUP();
+	matchctx = (regexp_matches_ctx *) funcctx->user_fctx;
+
+	if (matchctx->next_match < matchctx->nmatches)	/* matches left to emit */
+	{
+		ArrayType  *result_ary;
+
+		result_ary = build_regexp_positions_result(matchctx);
+		matchctx->next_match++;
+		SRF_RETURN_NEXT(funcctx, PointerGetDatum(result_ary));
+	}
+
+	SRF_RETURN_DONE(funcctx);
+}
+
 /* This is separate to keep the opr_sanity regression test from complaining */
 Datum
 regexp_matches_no_flags(PG_FUNCTION_ARGS)
@@ -1063,6 +1117,13 @@ regexp_matches_no_flags(PG_FUNCTION_ARGS)
 	return regexp_matches(fcinfo);
 }
 
+/* Entry point without flags; separate to keep the opr_sanity regression test from complaining */
+Datum
+regexp_positions_no_flags(PG_FUNCTION_ARGS)
+{
+	return regexp_positions(fcinfo);
+}
+
 /*
  * setup_regexp_matches --- do the initial matching for regexp_match
  *		and regexp_split functions
@@ -1332,6 +1393,64 @@ build_regexp_match_result(regexp_matches_ctx *matchctx)
 						   TEXTOID, -1, false, TYPALIGN_INT);
 }
 
+/*
+ * build_regexp_positions_result - build int4range output array for current match
+ */
+static ArrayType *
+build_regexp_positions_result(regexp_matches_ctx *matchctx)
+{
+	Datum	   *elems = matchctx->elems;
+	bool	   *nulls = matchctx->nulls;
+	int			dims[1];
+	int			lbs[1];
+	int			loc;
+	int			i;
+	RangeType  *range;
+	TypeCacheEntry *typcache;
+	RangeBound	lower;
+	RangeBound	upper;
+
+	typcache = lookup_type_cache(INT4RANGEOID, TYPECACHE_RANGE_INFO);
+
+	/* Turn each capture's 0-based offsets into a 1-based [start,end) range */
+	loc = matchctx->next_match * matchctx->npatterns * 2;
+	for (i = 0; i < matchctx->npatterns; i++)
+	{
+		int			so = matchctx->match_locs[loc++];
+		int			eo = matchctx->match_locs[loc++];
+
+		if (so < 0 || eo < 0)	/* negative offset: emit a NULL element */
+		{
+			elems[i] = (Datum) 0;
+			nulls[i] = true;
+		}
+		else
+		{
+			lower.val = Int32GetDatum(so + 1);
+			lower.infinite = false;
+			lower.inclusive = true;
+			lower.lower = true;
+			/* Exclusive end: an inclusive eo would fall below lower when so == eo */
+			upper.val = Int32GetDatum(eo + 1);
+			upper.infinite = false;
+			upper.inclusive = false;
+			upper.lower = false;
+			/* Equal bounds (zero-length match) serialize as the empty range */
+			range = make_range(typcache, &lower, &upper, false);
+
+			elems[i] = RangeTypePGetDatum(range);
+			nulls[i] = false;
+		}
+	}
+
+	/* And form an array */
+	dims[0] = matchctx->npatterns;
+	lbs[0] = 1;
+	/* XXX: this hardcodes assumptions about the int4range type */
+	return construct_md_array(elems, nulls, 1, dims, lbs,
+							  INT4RANGEOID, -1, false, TYPALIGN_INT);
+}
+
 /*
  * regexp_split_to_table()
  *		Split the string at matches of the pattern, returning the
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index 1487710d59..e2e76935a0 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -3557,6 +3557,14 @@
   proname => 'regexp_matches', prorows => '10', proretset => 't',
   prorettype => '_text', proargtypes => 'text text text',
   prosrc => 'regexp_matches' },
+{ oid => '8104', descr => 'find matching position(s) for regexp',
+  proname => 'regexp_positions', prorows => '10', proretset => 't',
+  prorettype => '_int4range', proargtypes => 'text text text',
+  prosrc => 'regexp_positions' },
+{ oid => '8105', descr => 'find
matching position(s) for regexp', + proname => 'regexp_positions', prorows => '10', proretset => 't', + prorettype => '_int4range', proargtypes => 'text text', + prosrc => 'regexp_positions_no_flags' }, { oid => '2088', descr => 'split string by field_sep and return field_num', proname => 'split_part', prorettype => 'text', proargtypes => 'text text int4', prosrc => 'split_part' }, diff --git a/src/test/regress/expected/strings.out b/src/test/regress/expected/strings.out index fb4573d85f..5071165cd3 100644 --- a/src/test/regress/expected/strings.out +++ b/src/test/regress/expected/strings.out @@ -601,6 +601,12 @@ SELECT regexp_matches('foobarbequebaz', $re$(bar)(beque)$re$); {bar,beque} (1 row) +SELECT regexp_positions('foobarbequebaz', $re$(bar)(beque)$re$); + regexp_positions +-------------------- + {"[4,7)","[7,12)"} +(1 row) + -- test case insensitive SELECT regexp_matches('foObARbEqUEbAz', $re$(bar)(beque)$re$, 'i'); regexp_matches @@ -616,6 +622,13 @@ SELECT regexp_matches('foobarbequebazilbarfbonk', $re$(b[^b]+)(b[^b]+)$re$, 'g') {bazil,barf} (2 rows) +SELECT regexp_positions('foobarbequebazilbarfbonk', $re$(b[^b]+)(b[^b]+)$re$, 'g'); + regexp_positions +----------------------- + {"[4,7)","[7,12)"} + {"[12,17)","[17,21)"} +(2 rows) + -- empty capture group (matched empty string) SELECT regexp_matches('foobarbequebaz', $re$(bar)(.*)(beque)$re$); regexp_matches diff --git a/src/test/regress/sql/strings.sql b/src/test/regress/sql/strings.sql index 57a48c9d0b..aa8b0553f0 100644 --- a/src/test/regress/sql/strings.sql +++ b/src/test/regress/sql/strings.sql @@ -198,12 +198,14 @@ SELECT regexp_replace('AAA aaa', 'A+', 'Z', 'z'); -- return all matches from regexp SELECT regexp_matches('foobarbequebaz', $re$(bar)(beque)$re$); +SELECT regexp_positions('foobarbequebaz', $re$(bar)(beque)$re$); -- test case insensitive SELECT regexp_matches('foObARbEqUEbAz', $re$(bar)(beque)$re$, 'i'); -- global option - more than one match SELECT 
regexp_matches('foobarbequebazilbarfbonk', $re$(b[^b]+)(b[^b]+)$re$, 'g'); +SELECT regexp_positions('foobarbequebazilbarfbonk', $re$(b[^b]+)(b[^b]+)$re$, 'g'); -- empty capture group (matched empty string) SELECT regexp_matches('foobarbequebaz', $re$(bar)(.*)(beque)$re$);