Wildcard search support for pg_trgm
Hackers,
Here is first version of patch, which enable index support of wildcard
search in pg_trgm contrib module. The idea of the patch is to extract from
wildcard trigrams which should occurs in wildcard matching string. For
example, for '%sector%' wildcard such trigrams would be: 'sec', 'ect',
'tor'.
create table words (word text);
copy words from '/usr/share/dict/american-english';
test=# explain analyze select * from words where word ilike '%independ%';
QUERY PLAN
------------------------------------------------------------------------------------------------------
Seq Scan on words (cost=0.00..1703.11 rows=10 width=9) (actual
time=18.818..174.146 rows=7 loops=1)
Filter: (word ~~* '%independ%'::text)
Total runtime: 174.200 ms
(3 rows)
CREATE INDEX trgm_idx ON words USING gist (word gist_trgm_ops);
test=# explain analyze select * from words where word ilike '%independ%';
QUERY PLAN
------------------------------------------------------------------------------------------------------------------
Bitmap Heap Scan on words (cost=4.36..40.11 rows=10 width=9) (actual
time=2.445..2.529 rows=7 loops=1)
Recheck Cond: (word ~~* '%independ%'::text)
-> Bitmap Index Scan on trgm_idx (cost=0.00..4.35 rows=10 width=0)
(actual time=2.406..2.406 rows=7 loops=1)
Index Cond: (word ~~* '%independ%'::text)
Total runtime: 2.612 ms
(5 rows)
CREATE INDEX trgm_idx ON words USING gin (word gin_trgm_ops);
test=# explain analyze select * from words where word ilike '%independ%';
QUERY PLAN
-------------------------------------------------------------------------------------------------------------------
Bitmap Heap Scan on words (cost=76.08..111.83 rows=10 width=9) (actual
time=2.675..2.755 rows=7 loops=1)
Recheck Cond: (word ~~* '%independ%'::text)
-> Bitmap Index Scan on trgm_idx (cost=0.00..76.07 rows=10 width=0)
(actual time=2.642..2.642 rows=7 loops=1)
Index Cond: (word ~~* '%independ%'::text)
Total runtime: 2.839 ms
(5 rows)
I've encountered with following problems:
1) Indexing support for ilike is possible only with case-insensetive
wildcards, e.g. when IGNORECASE macro is enabled. But I can't use this macro
in pg_trgm.sql.in, where list of operators is defined. Probably, is it
enuogh to put comment near IGNORECASE, which tells that if one disable this
macro he should also remove oparators from pg_trgm.sql.in?
2) I found gist index not very useful with default SIGLENINT = 3. I've
changed this value to 15 and I found gist index performs very good on
dictionary. But on longer strings greater values of SIGLENINT may be
required (probably even SIGLENINT > 122 will give benefit in some cases in
spite of TOAST).
----
With best regards,
Alexander Korotkov.
Attachments:
trgm_wildcard-0.1.patchtext/x-patch; charset=US-ASCII; name=trgm_wildcard-0.1.patchDownload
*** a/contrib/pg_trgm/pg_trgm.sql.in
--- b/contrib/pg_trgm/pg_trgm.sql.in
***************
*** 113,118 **** FOR TYPE text USING gist
--- 113,120 ----
AS
OPERATOR 1 % (text, text),
OPERATOR 2 <-> (text, text) FOR ORDER BY pg_catalog.float_ops,
+ OPERATOR 3 ~~ (text, text),
+ OPERATOR 4 ~~* (text, text),
FUNCTION 1 gtrgm_consistent (internal, text, int, oid, internal),
FUNCTION 2 gtrgm_union (bytea, internal),
FUNCTION 3 gtrgm_compress (internal),
***************
*** 144,149 **** CREATE OPERATOR CLASS gin_trgm_ops
--- 146,153 ----
FOR TYPE text USING gin
AS
OPERATOR 1 % (text, text),
+ OPERATOR 3 ~~ (text, text),
+ OPERATOR 4 ~~* (text, text),
FUNCTION 1 btint4cmp (int4, int4),
FUNCTION 2 gin_extract_trgm (text, internal),
FUNCTION 3 gin_extract_trgm (text, internal, int2, internal, internal),
*** a/contrib/pg_trgm/trgm.h
--- b/contrib/pg_trgm/trgm.h
***************
*** 19,24 ****
--- 19,26 ----
/* operator strategy numbers */
#define SimilarityStrategyNumber 1
#define DistanceStrategyNumber 2
+ #define LikeStrategyNumber 3
+ #define ILikeStrategyNumber 4
typedef char trgm[3];
***************
*** 53,59 **** typedef struct
/* gist */
#define BITBYTE 8
! #define SIGLENINT 3 /* >122 => key will toast, so very slow!!! */
#define SIGLEN ( sizeof(int)*SIGLENINT )
#define SIGLENBIT (SIGLEN*BITBYTE - 1) /* see makesign */
--- 55,61 ----
/* gist */
#define BITBYTE 8
! #define SIGLENINT 15 /* >122 => key will toast, so very slow!!! */
#define SIGLEN ( sizeof(int)*SIGLENINT )
#define SIGLENBIT (SIGLEN*BITBYTE - 1) /* see makesign */
***************
*** 89,94 **** typedef char *BITVECP;
--- 91,101 ----
extern float4 trgm_limit;
TRGM *generate_trgm(char *str, int slen);
+ TRGM *generate_wildcard_trgm(char *str, int slen);
float4 cnt_sml(TRGM *trg1, TRGM *trg2);
+ bool trgm_contain(TRGM *trg1, TRGM *trg2);
+
+ #define ISESCAPECHAR(x) (*(x) == '\\')
+ #define ISWILDCARDCHAR(x) (*(x) == '_' || *(x) == '%')
#endif /* __TRGM_H__ */
*** a/contrib/pg_trgm/trgm_gin.c
--- b/contrib/pg_trgm/trgm_gin.c
***************
*** 6,11 ****
--- 6,12 ----
#include "trgm.h"
#include "access/gin.h"
+ #include "access/skey.h"
#include "access/itup.h"
#include "access/tuptoaster.h"
#include "storage/bufpage.h"
***************
*** 24,36 **** gin_extract_trgm(PG_FUNCTION_ARGS)
{
text *val = (text *) PG_GETARG_TEXT_P(0);
int32 *nentries = (int32 *) PG_GETARG_POINTER(1);
Datum *entries = NULL;
TRGM *trg;
int4 trglen;
*nentries = 0;
! trg = generate_trgm(VARDATA(val), VARSIZE(val) - VARHDRSZ);
trglen = ARRNELEM(trg);
if (trglen > 0)
--- 25,56 ----
{
text *val = (text *) PG_GETARG_TEXT_P(0);
int32 *nentries = (int32 *) PG_GETARG_POINTER(1);
+ StrategyNumber strategy;
Datum *entries = NULL;
TRGM *trg;
int4 trglen;
+ if (fcinfo->nargs == 2)
+ strategy = SimilarityStrategyNumber;
+ else
+ strategy = PG_GETARG_UINT16(2);
+
*nentries = 0;
! switch (strategy)
! {
! case SimilarityStrategyNumber:
! trg = generate_trgm(VARDATA(val), VARSIZE(val) - VARHDRSZ);
! break;
! case LikeStrategyNumber:
! case ILikeStrategyNumber:
! trg = generate_wildcard_trgm(VARDATA(val), VARSIZE(val) - VARHDRSZ);
! break;
! default:
! elog(ERROR, "unrecognized strategy number: %d", strategy);
! trg = NULL; /* keep compiler quiet */
! break;
! }
trglen = ARRNELEM(trg);
if (trglen > 0)
***************
*** 71,77 **** gin_trgm_consistent(PG_FUNCTION_ARGS)
{
bool *check = (bool *) PG_GETARG_POINTER(0);
! /* StrategyNumber strategy = PG_GETARG_UINT16(1); */
/* text *query = PG_GETARG_TEXT_P(2); */
/* int32 nkeys = PG_GETARG_INT32(3); */
Pointer *extra_data = (Pointer *) PG_GETARG_POINTER(4);
--- 91,97 ----
{
bool *check = (bool *) PG_GETARG_POINTER(0);
! StrategyNumber strategy = PG_GETARG_UINT16(1);
/* text *query = PG_GETARG_TEXT_P(2); */
/* int32 nkeys = PG_GETARG_INT32(3); */
Pointer *extra_data = (Pointer *) PG_GETARG_POINTER(4);
***************
*** 81,100 **** gin_trgm_consistent(PG_FUNCTION_ARGS)
trglen,
ntrue = 0;
/* All cases served by this function are inexact */
*recheck = true;
trglen = *(int32 *) extra_data;
! for (i = 0; i < trglen; i++)
! if (check[i])
! ntrue++;
#ifdef DIVUNION
! res = (trglen == ntrue) ? true : ((((((float4) ntrue) / ((float4) (trglen - ntrue)))) >= trgm_limit) ? true : false);
#else
! res = (trglen == 0) ? false : ((((((float4) ntrue) / ((float4) trglen))) >= trgm_limit) ? true : false);
#endif
!
PG_RETURN_BOOL(res);
}
--- 101,144 ----
trglen,
ntrue = 0;
+ #ifndef IGNORECASE
+ if (strategy == ILIKE_STRATEGY)
+ {
+ elog(ERROR, "Can't do ILIKE_STRATEGY with case-sensetive trigrams.");
+ }
+ #endif
/* All cases served by this function are inexact */
*recheck = true;
trglen = *(int32 *) extra_data;
! switch (strategy)
! {
! case SimilarityStrategyNumber:
! for (i = 0; i < trglen; i++)
! if (check[i])
! ntrue++;
#ifdef DIVUNION
! res = (trglen == ntrue) ? true : ((((((float4) ntrue) / ((float4) (trglen - ntrue)))) >= trgm_limit) ? true : false);
#else
! res = (trglen == 0) ? false : ((((((float4) ntrue) / ((float4) trglen))) >= trgm_limit) ? true : false);
#endif
! break;
! case LikeStrategyNumber:
! case ILikeStrategyNumber:
! res = true;
! for (i = 0; i < trglen; i++)
! if (!check[i])
! {
! res = false;
! break;
! }
! break;
! default:
! elog(ERROR, "unrecognized strategy number: %d", strategy);
! res = false; /* keep compiler quiet */
! break;
! }
PG_RETURN_BOOL(res);
}
*** a/contrib/pg_trgm/trgm_gist.c
--- b/contrib/pg_trgm/trgm_gist.c
***************
*** 195,225 **** gtrgm_consistent(PG_FUNCTION_ARGS)
TRGM *key = (TRGM *) DatumGetPointer(entry->key);
TRGM *qtrg;
bool res;
! char *cache = (char *) fcinfo->flinfo->fn_extra;
!
! /* All cases served by this function are exact */
! *recheck = false;
! if (cache == NULL || VARSIZE(cache) != VARSIZE(query) || memcmp(cache, query, VARSIZE(query)) != 0)
{
! qtrg = generate_trgm(VARDATA(query), VARSIZE(query) - VARHDRSZ);
if (cache)
pfree(cache);
fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
! MAXALIGN(VARSIZE(query)) + VARSIZE(qtrg));
cache = (char *) fcinfo->flinfo->fn_extra;
! memcpy(cache, query, VARSIZE(query));
! memcpy(cache + MAXALIGN(VARSIZE(query)), qtrg, VARSIZE(qtrg));
}
! qtrg = (TRGM *) (cache + MAXALIGN(VARSIZE(query)));
switch (strategy)
{
case SimilarityStrategyNumber:
if (GIST_LEAF(entry))
{ /* all leafs contains orig trgm */
float4 tmpsml = cnt_sml(key, qtrg);
--- 195,248 ----
TRGM *key = (TRGM *) DatumGetPointer(entry->key);
TRGM *qtrg;
bool res;
! char *cache = (char *) fcinfo->flinfo->fn_extra,
! *cacheContents = cache + MAXALIGN(sizeof(StrategyNumber));
! #ifndef IGNORECASE
! if (strategy == ILIKE_STRATEGY)
! {
! elog(ERROR, "Can't do ILIKE_STRATEGY with case-sensetive trigrams.");
! }
! #endif
! if (cache == NULL || strategy != *((StrategyNumber *)cache) ||
! VARSIZE(cacheContents) != VARSIZE(query) ||
! memcmp(cacheContents, query, VARSIZE(query)) != 0)
{
! switch (strategy)
! {
! case SimilarityStrategyNumber:
! qtrg = generate_trgm(VARDATA(query), VARSIZE(query) - VARHDRSZ);
! break;
! case LikeStrategyNumber:
! case ILikeStrategyNumber:
! qtrg = generate_wildcard_trgm(VARDATA(query), VARSIZE(query) - VARHDRSZ);
! break;
! default:
! elog(ERROR, "unrecognized strategy number: %d", strategy);
! qtrg = NULL; /* keep compiler quiet */
! break;
! }
if (cache)
pfree(cache);
fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
! MAXALIGN(sizeof(StrategyNumber)) + MAXALIGN(VARSIZE(query)) + VARSIZE(qtrg));
cache = (char *) fcinfo->flinfo->fn_extra;
+ cacheContents = cache + MAXALIGN(sizeof(StrategyNumber));
! memcpy(cache, &strategy, sizeof(StrategyNumber));
! memcpy(cacheContents, query, VARSIZE(query));
! memcpy(cacheContents + MAXALIGN(VARSIZE(query)),
! qtrg, VARSIZE(qtrg));
}
! qtrg = (TRGM *) (cacheContents + MAXALIGN(VARSIZE(query)));
switch (strategy)
{
case SimilarityStrategyNumber:
+ *recheck = false;
if (GIST_LEAF(entry))
{ /* all leafs contains orig trgm */
float4 tmpsml = cnt_sml(key, qtrg);
***************
*** 242,247 **** gtrgm_consistent(PG_FUNCTION_ARGS)
--- 265,298 ----
res = (((((float8) count) / ((float8) len))) >= trgm_limit) ? true : false;
}
break;
+ case LikeStrategyNumber:
+ case ILikeStrategyNumber:
+ *recheck = true;
+ if (GIST_LEAF(entry))
+ { /* all leafs contains orig trgm */
+ res = trgm_contain(qtrg, key);
+ }
+ else if (ISALLTRUE(key))
+ { /* non-leaf contains signature */
+ res = true;
+ }
+ else
+ { /* non-leaf contains signature */
+ int4 k, tmp = 0, len = ARRNELEM(qtrg);
+ trgm *ptr = GETARR(qtrg);
+ BITVECP sign = GETSIGN(key);
+ res = true;
+ for (k = 0; k < len; k++)
+ {
+ CPTRGM(((char *) &tmp), ptr + k);
+ if (!GETBIT(sign, HASHVAL(tmp)))
+ {
+ res = false;
+ break;
+ }
+ }
+ }
+ break;
default:
elog(ERROR, "unrecognized strategy number: %d", strategy);
res = false; /* keep compiler quiet */
*** a/contrib/pg_trgm/trgm_op.c
--- b/contrib/pg_trgm/trgm_op.c
***************
*** 236,241 **** generate_trgm(char *str, int slen)
--- 236,404 ----
return trg;
}
+ static char *
+ get_wildcard_part(char *str, int lenstr, char *buf, int *bytelen, int *charlen)
+ {
+ char *beginword = str, *endword, *s = buf;
+ bool in_wildcard = false, in_escape = false;
+ int clen;
+
+ while (beginword - str < lenstr)
+ {
+ if (in_escape)
+ {
+ in_escape = false;
+ in_wildcard = false;
+ if (iswordchr(beginword)) break;
+ }
+ else
+ {
+ if (ISESCAPECHAR(beginword))
+ in_escape = true;
+ else if (ISWILDCARDCHAR(beginword))
+ in_wildcard = true;
+ else if (iswordchr(beginword))
+ break;
+ else
+ in_wildcard = false;
+ }
+ beginword += pg_mblen(beginword);
+ }
+ *charlen = 0;
+ if (!in_wildcard)
+ {
+ if (LPADDING > 0)
+ {
+ *s++ = ' ';
+ (*charlen)++;
+ if (LPADDING > 1)
+ {
+ *s++ = ' ';
+ (*charlen)++;
+ }
+ }
+ }
+ if (beginword - str >= lenstr)
+ return NULL;
+
+ endword = beginword;
+ in_wildcard = false;
+ in_escape = false;
+ while (endword - str < lenstr)
+ {
+ clen = pg_mblen(endword);
+ if (in_escape)
+ {
+ in_escape = false;
+ in_wildcard = false;
+ if (iswordchr(endword))
+ {
+ (*charlen)++;
+ memcpy(s, endword, clen);
+ s += clen;
+ }
+ else
+ break;
+ }
+ else
+ {
+ if (ISESCAPECHAR(endword))
+ in_escape = true;
+ else if (ISWILDCARDCHAR(endword))
+ {
+ in_wildcard = true;
+ break;
+ }
+ else if (iswordchr(endword))
+ {
+ (*charlen)++;
+ memcpy(s, endword, clen);
+ s += clen;
+ }
+ else
+ {
+ in_wildcard = false;
+ break;
+ }
+ }
+ endword += clen;
+ }
+ if (!in_wildcard)
+ {
+ if (RPADDING > 0)
+ {
+ *s++ = ' ';
+ (*charlen)++;
+ if (RPADDING > 1)
+ {
+ *s++ = ' ';
+ (*charlen)++;
+ }
+ }
+ }
+ *bytelen = s - buf;
+ return endword;
+ }
+
+ TRGM *
+ generate_wildcard_trgm(char *str, int slen)
+ {
+ TRGM *trg;
+ char *buf,
+ *buf2;
+ trgm *tptr;
+ int len,
+ charlen,
+ bytelen;
+ char *eword;
+
+ trg = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) *3);
+ trg->flag = ARRKEY;
+ SET_VARSIZE(trg, TRGMHDRSIZE);
+
+ if (slen + LPADDING + RPADDING < 3 || slen == 0)
+ return trg;
+
+ tptr = GETARR(trg);
+
+ buf = palloc(sizeof(char) * (slen + 4));
+
+ eword = str;
+ while ((eword = get_wildcard_part(eword, slen - (eword - str),
+ buf, &bytelen, &charlen)) != NULL)
+ {
+ #ifdef IGNORECASE
+ buf2 = lowerstr_with_len(buf, bytelen);
+ bytelen = strlen(buf2);
+ #else
+ buf2 = buf;
+ #endif
+
+ /*
+ * count trigrams
+ */
+ tptr = make_trigrams(tptr, buf, bytelen, charlen);
+ #ifdef IGNORECASE
+ pfree(buf2);
+ #endif
+ }
+
+ pfree(buf);
+
+ if ((len = tptr - GETARR(trg)) == 0)
+ return trg;
+
+ if (len > 0)
+ {
+ qsort((void *) GETARR(trg), len, sizeof(trgm), comp_trgm);
+ len = unique_array(GETARR(trg), len);
+ }
+
+ SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));
+
+ return trg;
+ }
+
uint32
trgm2int(trgm *ptr)
{
***************
*** 340,345 **** cnt_sml(TRGM *trg1, TRGM *trg2)
--- 503,544 ----
}
+ bool
+ trgm_contain(TRGM *trg1, TRGM *trg2)
+ {
+ trgm *ptr1,
+ *ptr2;
+ int count = 0;
+ int len1,
+ len2;
+
+ ptr1 = GETARR(trg1);
+ ptr2 = GETARR(trg2);
+
+ len1 = ARRNELEM(trg1);
+ len2 = ARRNELEM(trg2);
+
+ while (ptr1 - GETARR(trg1) < len1 && ptr2 - GETARR(trg2) < len2)
+ {
+ int res = CMPTRGM(ptr1, ptr2);
+
+ if (res < 0)
+ return false;
+ else if (res > 0)
+ ptr2++;
+ else
+ {
+ ptr1++;
+ ptr2++;
+ count++;
+ }
+ }
+ if (ptr1 - GETARR(trg1) < len1)
+ return false;
+ else
+ return true;
+ }
+
PG_FUNCTION_INFO_V1(similarity);
Datum similarity(PG_FUNCTION_ARGS);
Datum
I found another problem. GIN index suffers from "GIN indexes do not support
whole-index scans" when no trigram can be extracted from pattern.
----
With best regards,
Alexander Korotkov.
Alexander Korotkov <aekorotkov@gmail.com> writes:
Here is first version of patch, which enable index support of wildcard
search in pg_trgm contrib module.
How different (and better) is it from wildspeed?
http://www.sai.msu.su/~megera/wiki/wildspeed
Regards,
--
Dimitri Fontaine
http://2ndQuadrant.fr PostgreSQL : Expertise, Formation et Support
On Mon, Dec 13, 2010 at 12:14 AM, Dimitri Fontaine
<dimitri@2ndquadrant.fr>wrote:
How different (and better) is it from wildspeed?
The general advantage is possibility of usage wildcard search and trigram
similarity search using the same index.
I expect that GIN trigram index is slightly less space demanding, but
slightly slower on search than wildspeed. Also, I expect GiST trigram index
to be slower on search, but faster on updates. While I didn't check these
assumptions in details.
I've lack of test datasets for sufficient testing, and I would like to ask
community to help me with it. Because testing on dictionaries is good,
but obviously
not enough.
----
With best regards,
Alexander Korotkov.
I updated my patch to make it use full index scan in GIN index which is
possible thanks to recent Tom Lane patch. Now wildcard, where no trigram can
be extracted from, invokes full index scan, which is slow but correct.
test=# explain (analyze, buffers) select * from words where word ilike
'%in%';
QUERY PLAN
------------------------------------------------------------------------------------------------------------
Seq Scan on words (cost=0.00..1703.11 rows=15930 width=9) (actual
time=0.333..225.817 rows=16558 loops=1)
Filter: (word ~~* '%in%'::text)
Buffers: shared read=471
Total runtime: 248.207 ms
(4 rows)
test=# set enable_seqscan = off;
SET
test=# explain (analyze, buffers) select * from words where word ilike
'%in%';
QUERY PLAN
--------------------------------------------------------------------------------------------------------------------------------
Bitmap Heap Scan on words (cost=2287.46..2957.59 rows=15930 width=9)
(actual time=122.239..331.993
rows=16558 loops=1)
Recheck Cond: (word ~~* '%in%'::text)
Buffers: shared hit=472 read=1185
-> Bitmap Index Scan on trgm_idx (cost=0.00..2283.48 rows=15930
width=0) (actual time=122.022..122.022 rows=98569 loops=1)
Index Cond: (word ~~* '%in%'::text)
Buffers: shared hit=1 read=1185
Total runtime: 354.409 ms
(7 rows)
As an alternative solution I can propose to extract null item from every
string and ivoke scan on that item instead of full index scan. It requires
to store additional item per each string but it makes full scan fast.
Also I found a segfault when execute the query above and
switch enable_seqscan few times on line "*searchMode =
GIN_SEARCH_MODE_ALL;". Is it a bug in GIN or I'm missing something?
Here goes backtrace from gdb:
#0 0xb4ead070 in gin_extract_query_trgm (fcinfo=0xbfcd8da8) at
trgm_gin.c:112
#1 0x08323a84 in OidFunctionCall5 (functionId=32802, arg1=161269768,
arg2=3217920208, arg3=4,
arg4=3217920204, arg5=3217920200) at fmgr.c:1687
#2 0x082c5654 in gincostestimate (fcinfo=0xbfcd9148) at selfuncs.c:6466
#3 0x083235d8 in OidFunctionCall9 (functionId=2741, arg1=161270176,
arg2=161271296,
arg3=161824624, arg4=0, arg5=0, arg6=3217921064, arg7=3217921056,
arg8=3217921048,
arg9=3217921040) at fmgr.c:1840
#4 0x081f3397 in cost_index (path=0x9a55050, root=0x99cc9a0,
index=0x99cce00,
indexQuals=0x9a53f70, indexOrderBys=0x0, outer_rel=0x0) at
costsize.c:268
#5 0x08216b66 in create_index_path (root=0x99cc9a0, index=0x99cce00,
clause_groups=0x9a53f88,
indexorderbys=0x0, pathkeys=0x0, indexscandir=NoMovementScanDirection,
outer_rel=0x0)
at pathnode.c:511
#6 0x081f7ef5 in find_usable_indexes (root=<value optimized out>,
rel=<value optimized out>,
clauses=<value optimized out>, outer_clauses=0x0, istoplevel=1 '\001',
outer_rel=0x0,
saop_control=SAOP_FORBID, scantype=ST_ANYSCAN) at indxpath.c:422
#7 0x081f8e38 in create_index_paths (root=0x99cc9a0, rel=0x99ccc30) at
indxpath.c:176
#8 0x081eec22 in set_plain_rel_pathlist (root=<value optimized out>,
rel=<value optimized out>,
rti=<value optimized out>, rte=0x99cc650) at allpaths.c:262
#9 set_rel_pathlist (root=<value optimized out>, rel=<value optimized
out>,
rti=<value optimized out>, rte=0x99cc650) at allpaths.c:202
#10 0x081efa55 in set_base_rel_pathlists (root=0x99cc9a0,
joinlist=0x99ccde8) at allpaths.c:158
#11 make_one_rel (root=0x99cc9a0, joinlist=0x99ccde8) at allpaths.c:94
#12 0x08203ef7 in query_planner (root=0x99cc9a0, tlist=0x99ccb00,
tuple_fraction=0,
limit_tuples=-1, cheapest_path=0xbfcd98cc, sorted_path=0xbfcd98c8,
num_groups=0xbfcd98c0)
at planmain.c:271
#13 0x08205b86 in grouping_planner (root=0x99cc9a0, tuple_fraction=0) at
planner.c:1182
#14 0x08207609 in subquery_planner (glob=0x99cc910, parse=0x99cc5a0,
parent_root=0x0,
hasRecursion=0 '\000', tuple_fraction=0, subroot=0xbfcd9a7c) at
planner.c:536
#15 0x08207ca6 in standard_planner (parse=0x99cc5a0, cursorOptions=0,
boundParams=0x0)
at planner.c:201
#16 0x0825db11 in pg_plan_query (querytree=0x99cc5a0, cursorOptions=0,
boundParams=0x0)
at postgres.c:764
#17 0x0815a824 in ExplainOneQuery (stmt=0x9a258e0,
queryString=0x9a24c60 "explain (analyze, buffers) select * from words
where word ilike '%in%';",---Type <return> to continue, or q <return> to
quit---
params=0x0, dest=0x9a32330) at explain.c:300
#18 ExplainQuery (stmt=0x9a258e0,
queryString=0x9a24c60 "explain (analyze, buffers) select * from words
where word ilike '%in%';", params=0x0, dest=0x9a32330) at explain.c:209
#19 0x08261266 in PortalRunUtility (portal=0x9a4d6a8,
utilityStmt=0x9a258e0,
isTopLevel=<value optimized out>, dest=0x9a32330,
completionTag=0xbfcd9bcc "") at pquery.c:1191
#20 0x082622a4 in FillPortalStore (portal=0x9a4d6a8, isTopLevel=32 ' ') at
pquery.c:1065
#21 0x0826281a in PortalRun (portal=0x9a4d6a8, count=2147483647,
isTopLevel=-56 '\310',
dest=0x9a25ee8, altdest=0x9a25ee8, completionTag=0xbfcd9d9c "") at
pquery.c:791
#22 0x0825ec78 in exec_simple_query (query_string=<value optimized out>) at
postgres.c:1059
#23 0x0825fe01 in PostgresMain (argc=2, argv=0x99aeb68, username=0x99aead8
"smagen")
at postgres.c:3939
#24 0x08229250 in BackendRun () at postmaster.c:3577
#25 BackendStartup () at postmaster.c:3262
#26 ServerLoop () at postmaster.c:1448
#27 0x0822be12 in PostmasterMain (argc=3, argv=0x99acc58) at
postmaster.c:1109
#28 0x081ce3b7 in main (argc=3, argv=0x99acc58) at main.c:199
----
With best regards,
Alexander Korotkov.
Attachments:
trgm_wildcard-0.2.patchtext/x-patch; charset=US-ASCII; name=trgm_wildcard-0.2.patchDownload
*** a/contrib/pg_trgm/pg_trgm.sql.in
--- b/contrib/pg_trgm/pg_trgm.sql.in
***************
*** 113,118 **** FOR TYPE text USING gist
--- 113,120 ----
AS
OPERATOR 1 % (text, text),
OPERATOR 2 <-> (text, text) FOR ORDER BY pg_catalog.float_ops,
+ OPERATOR 3 ~~ (text, text),
+ OPERATOR 4 ~~* (text, text),
FUNCTION 1 gtrgm_consistent (internal, text, int, oid, internal),
FUNCTION 2 gtrgm_union (bytea, internal),
FUNCTION 3 gtrgm_compress (internal),
***************
*** 129,140 **** RETURNS internal
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT;
! CREATE OR REPLACE FUNCTION gin_extract_trgm(text, internal, int2, internal, internal)
RETURNS internal
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT;
! CREATE OR REPLACE FUNCTION gin_trgm_consistent(internal, int2, text, int4, internal, internal)
RETURNS bool
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT;
--- 131,142 ----
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT;
! CREATE OR REPLACE FUNCTION gin_extract_query_trgm(text, internal, int2, internal, internal, internal, internal)
RETURNS internal
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT;
! CREATE OR REPLACE FUNCTION gin_trgm_consistent(internal, int2, text, int4, internal, internal, internal, internal)
RETURNS bool
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT;
***************
*** 144,151 **** CREATE OPERATOR CLASS gin_trgm_ops
FOR TYPE text USING gin
AS
OPERATOR 1 % (text, text),
FUNCTION 1 btint4cmp (int4, int4),
FUNCTION 2 gin_extract_trgm (text, internal),
! FUNCTION 3 gin_extract_trgm (text, internal, int2, internal, internal),
! FUNCTION 4 gin_trgm_consistent (internal, int2, text, int4, internal, internal),
STORAGE int4;
--- 146,155 ----
FOR TYPE text USING gin
AS
OPERATOR 1 % (text, text),
+ OPERATOR 3 ~~ (text, text),
+ OPERATOR 4 ~~* (text, text),
FUNCTION 1 btint4cmp (int4, int4),
FUNCTION 2 gin_extract_trgm (text, internal),
! FUNCTION 3 gin_extract_query_trgm (text, internal, int2, internal, internal, internal, internal),
! FUNCTION 4 gin_trgm_consistent (internal, int2, text, int4, internal, internal, internal, internal),
STORAGE int4;
*** a/contrib/pg_trgm/trgm.h
--- b/contrib/pg_trgm/trgm.h
***************
*** 19,24 ****
--- 19,26 ----
/* operator strategy numbers */
#define SimilarityStrategyNumber 1
#define DistanceStrategyNumber 2
+ #define LikeStrategyNumber 3
+ #define ILikeStrategyNumber 4
typedef char trgm[3];
***************
*** 53,59 **** typedef struct
/* gist */
#define BITBYTE 8
! #define SIGLENINT 3 /* >122 => key will toast, so very slow!!! */
#define SIGLEN ( sizeof(int)*SIGLENINT )
#define SIGLENBIT (SIGLEN*BITBYTE - 1) /* see makesign */
--- 55,61 ----
/* gist */
#define BITBYTE 8
! #define SIGLENINT 15 /* >122 => key will toast, so very slow!!! */
#define SIGLEN ( sizeof(int)*SIGLENINT )
#define SIGLENBIT (SIGLEN*BITBYTE - 1) /* see makesign */
***************
*** 89,94 **** typedef char *BITVECP;
--- 91,101 ----
extern float4 trgm_limit;
TRGM *generate_trgm(char *str, int slen);
+ TRGM *generate_wildcard_trgm(char *str, int slen);
float4 cnt_sml(TRGM *trg1, TRGM *trg2);
+ bool trgm_contain(TRGM *trg1, TRGM *trg2);
+
+ #define ISESCAPECHAR(x) (*(x) == '\\')
+ #define ISWILDCARDCHAR(x) (*(x) == '_' || *(x) == '%')
#endif /* __TRGM_H__ */
*** a/contrib/pg_trgm/trgm_gin.c
--- b/contrib/pg_trgm/trgm_gin.c
***************
*** 6,11 ****
--- 6,12 ----
#include "trgm.h"
#include "access/gin.h"
+ #include "access/skey.h"
#include "access/itup.h"
#include "access/tuptoaster.h"
#include "storage/bufpage.h"
***************
*** 16,21 ****
--- 17,25 ----
PG_FUNCTION_INFO_V1(gin_extract_trgm);
Datum gin_extract_trgm(PG_FUNCTION_ARGS);
+ PG_FUNCTION_INFO_V1(gin_extract_query_trgm);
+ Datum gin_extract_query_trgm(PG_FUNCTION_ARGS);
+
PG_FUNCTION_INFO_V1(gin_trgm_consistent);
Datum gin_trgm_consistent(PG_FUNCTION_ARGS);
***************
*** 50,68 **** gin_extract_trgm(PG_FUNCTION_ARGS)
ptr++;
}
! if (PG_NARGS() > 4)
! {
! /*
! * Function called from query extracting
! */
! Pointer **extra_data = (Pointer **) PG_GETARG_POINTER(4);
! *extra_data = (Pointer *) palloc0(sizeof(Pointer) * (*nentries));
! *(int32 *) (*extra_data) = trglen;
}
}
PG_RETURN_POINTER(entries);
}
--- 54,116 ----
ptr++;
}
! }
!
! PG_RETURN_POINTER(entries);
! }
!
! Datum
! gin_extract_query_trgm(PG_FUNCTION_ARGS)
! {
! text *val = (text *) PG_GETARG_TEXT_P(0);
! int32 *nentries = (int32 *) PG_GETARG_POINTER(1);
! StrategyNumber strategy = PG_GETARG_UINT16(2);
! Datum *entries = NULL;
! TRGM *trg;
! int4 trglen;
! int32 **extra_data = (int32 **) PG_GETARG_POINTER(4);
! int32 *searchMode = (int32 *)PG_GETARG_POINTER(6);
! trgm *ptr;
! int4 i = 0,
! item;
!
! switch (strategy)
! {
! case SimilarityStrategyNumber:
! trg = generate_trgm(VARDATA(val), VARSIZE(val) - VARHDRSZ);
! break;
! case LikeStrategyNumber:
! case ILikeStrategyNumber:
! trg = generate_wildcard_trgm(VARDATA(val), VARSIZE(val) - VARHDRSZ);
! break;
! default:
! elog(ERROR, "unrecognized strategy number: %d", strategy);
! trg = NULL; /* keep compiler quiet */
! break;
! }
! trglen = ARRNELEM(trg);
! *nentries = (int32) trglen;
! if (trglen > 0)
! {
! entries = (Datum *) palloc(sizeof(Datum) * trglen);
! ptr = GETARR(trg);
! while (ptr - GETARR(trg) < ARRNELEM(trg))
! {
! item = trgm2int(ptr);
! entries[i++] = Int32GetDatum(item);
!
! ptr++;
}
}
+ *extra_data = (int32 *) palloc0(sizeof(int32));
+ **extra_data = trglen;
+
+ if (trglen == 0)
+ *searchMode = GIN_SEARCH_MODE_ALL;
+
PG_RETURN_POINTER(entries);
}
***************
*** 71,100 **** gin_trgm_consistent(PG_FUNCTION_ARGS)
{
bool *check = (bool *) PG_GETARG_POINTER(0);
! /* StrategyNumber strategy = PG_GETARG_UINT16(1); */
/* text *query = PG_GETARG_TEXT_P(2); */
/* int32 nkeys = PG_GETARG_INT32(3); */
! Pointer *extra_data = (Pointer *) PG_GETARG_POINTER(4);
bool *recheck = (bool *) PG_GETARG_POINTER(5);
bool res = FALSE;
int4 i,
trglen,
ntrue = 0;
/* All cases served by this function are inexact */
*recheck = true;
! trglen = *(int32 *) extra_data;
! for (i = 0; i < trglen; i++)
! if (check[i])
! ntrue++;
#ifdef DIVUNION
! res = (trglen == ntrue) ? true : ((((((float4) ntrue) / ((float4) (trglen - ntrue)))) >= trgm_limit) ? true : false);
#else
! res = (trglen == 0) ? false : ((((((float4) ntrue) / ((float4) trglen))) >= trgm_limit) ? true : false);
#endif
!
PG_RETURN_BOOL(res);
}
--- 119,172 ----
{
bool *check = (bool *) PG_GETARG_POINTER(0);
! StrategyNumber strategy = PG_GETARG_UINT16(1);
/* text *query = PG_GETARG_TEXT_P(2); */
/* int32 nkeys = PG_GETARG_INT32(3); */
! int32 *extra_data = (int32 *) PG_GETARG_POINTER(4);
bool *recheck = (bool *) PG_GETARG_POINTER(5);
bool res = FALSE;
int4 i,
trglen,
ntrue = 0;
+ #ifndef IGNORECASE
+ if (strategy == ILIKE_STRATEGY)
+ {
+ elog(ERROR, "Can't do ILIKE_STRATEGY with case-sensetive trigrams.");
+ }
+ #endif
/* All cases served by this function are inexact */
*recheck = true;
! trglen = *extra_data;
! switch (strategy)
! {
! case SimilarityStrategyNumber:
! for (i = 0; i < trglen; i++)
! if (check[i])
! ntrue++;
#ifdef DIVUNION
! res = (trglen == ntrue) ? true : ((((((float4) ntrue) / ((float4) (trglen - ntrue)))) >= trgm_limit) ? true : false);
#else
! res = (trglen == 0) ? false : ((((((float4) ntrue) / ((float4) trglen))) >= trgm_limit) ? true : false);
#endif
! break;
! case LikeStrategyNumber:
! case ILikeStrategyNumber:
! res = true;
! for (i = 0; i < trglen; i++)
! if (!check[i])
! {
! res = false;
! break;
! }
! break;
! default:
! elog(ERROR, "unrecognized strategy number: %d", strategy);
! res = false; /* keep compiler quiet */
! break;
! }
PG_RETURN_BOOL(res);
}
*** a/contrib/pg_trgm/trgm_gist.c
--- b/contrib/pg_trgm/trgm_gist.c
***************
*** 195,225 **** gtrgm_consistent(PG_FUNCTION_ARGS)
TRGM *key = (TRGM *) DatumGetPointer(entry->key);
TRGM *qtrg;
bool res;
! char *cache = (char *) fcinfo->flinfo->fn_extra;
!
! /* All cases served by this function are exact */
! *recheck = false;
! if (cache == NULL || VARSIZE(cache) != VARSIZE(query) || memcmp(cache, query, VARSIZE(query)) != 0)
{
! qtrg = generate_trgm(VARDATA(query), VARSIZE(query) - VARHDRSZ);
if (cache)
pfree(cache);
fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
! MAXALIGN(VARSIZE(query)) + VARSIZE(qtrg));
cache = (char *) fcinfo->flinfo->fn_extra;
! memcpy(cache, query, VARSIZE(query));
! memcpy(cache + MAXALIGN(VARSIZE(query)), qtrg, VARSIZE(qtrg));
}
! qtrg = (TRGM *) (cache + MAXALIGN(VARSIZE(query)));
switch (strategy)
{
case SimilarityStrategyNumber:
if (GIST_LEAF(entry))
{ /* all leafs contains orig trgm */
float4 tmpsml = cnt_sml(key, qtrg);
--- 195,248 ----
TRGM *key = (TRGM *) DatumGetPointer(entry->key);
TRGM *qtrg;
bool res;
! char *cache = (char *) fcinfo->flinfo->fn_extra,
! *cacheContents = cache + MAXALIGN(sizeof(StrategyNumber));
! #ifndef IGNORECASE
! if (strategy == ILIKE_STRATEGY)
! {
! elog(ERROR, "Can't do ILIKE_STRATEGY with case-sensetive trigrams.");
! }
! #endif
! if (cache == NULL || strategy != *((StrategyNumber *)cache) ||
! VARSIZE(cacheContents) != VARSIZE(query) ||
! memcmp(cacheContents, query, VARSIZE(query)) != 0)
{
! switch (strategy)
! {
! case SimilarityStrategyNumber:
! qtrg = generate_trgm(VARDATA(query), VARSIZE(query) - VARHDRSZ);
! break;
! case LikeStrategyNumber:
! case ILikeStrategyNumber:
! qtrg = generate_wildcard_trgm(VARDATA(query), VARSIZE(query) - VARHDRSZ);
! break;
! default:
! elog(ERROR, "unrecognized strategy number: %d", strategy);
! qtrg = NULL; /* keep compiler quiet */
! break;
! }
if (cache)
pfree(cache);
fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
! MAXALIGN(sizeof(StrategyNumber)) + MAXALIGN(VARSIZE(query)) + VARSIZE(qtrg));
cache = (char *) fcinfo->flinfo->fn_extra;
+ cacheContents = cache + MAXALIGN(sizeof(StrategyNumber));
! memcpy(cache, &strategy, sizeof(StrategyNumber));
! memcpy(cacheContents, query, VARSIZE(query));
! memcpy(cacheContents + MAXALIGN(VARSIZE(query)),
! qtrg, VARSIZE(qtrg));
}
! qtrg = (TRGM *) (cacheContents + MAXALIGN(VARSIZE(query)));
switch (strategy)
{
case SimilarityStrategyNumber:
+ *recheck = false;
if (GIST_LEAF(entry))
{ /* all leafs contains orig trgm */
float4 tmpsml = cnt_sml(key, qtrg);
***************
*** 242,247 **** gtrgm_consistent(PG_FUNCTION_ARGS)
--- 265,298 ----
res = (((((float8) count) / ((float8) len))) >= trgm_limit) ? true : false;
}
break;
+ case LikeStrategyNumber:
+ case ILikeStrategyNumber:
+ *recheck = true;
+ if (GIST_LEAF(entry))
+ { /* all leafs contains orig trgm */
+ res = trgm_contain(qtrg, key);
+ }
+ else if (ISALLTRUE(key))
+ { /* non-leaf contains signature */
+ res = true;
+ }
+ else
+ { /* non-leaf contains signature */
+ int4 k, tmp = 0, len = ARRNELEM(qtrg);
+ trgm *ptr = GETARR(qtrg);
+ BITVECP sign = GETSIGN(key);
+ res = true;
+ for (k = 0; k < len; k++)
+ {
+ CPTRGM(((char *) &tmp), ptr + k);
+ if (!GETBIT(sign, HASHVAL(tmp)))
+ {
+ res = false;
+ break;
+ }
+ }
+ }
+ break;
default:
elog(ERROR, "unrecognized strategy number: %d", strategy);
res = false; /* keep compiler quiet */
*** a/contrib/pg_trgm/trgm_op.c
--- b/contrib/pg_trgm/trgm_op.c
***************
*** 236,241 **** generate_trgm(char *str, int slen)
--- 236,404 ----
return trg;
}
+ static char *
+ get_wildcard_part(char *str, int lenstr, char *buf, int *bytelen, int *charlen)
+ {
+ char *beginword = str, *endword, *s = buf;
+ bool in_wildcard = false, in_escape = false;
+ int clen;
+
+ while (beginword - str < lenstr)
+ {
+ if (in_escape)
+ {
+ in_escape = false;
+ in_wildcard = false;
+ if (iswordchr(beginword)) break;
+ }
+ else
+ {
+ if (ISESCAPECHAR(beginword))
+ in_escape = true;
+ else if (ISWILDCARDCHAR(beginword))
+ in_wildcard = true;
+ else if (iswordchr(beginword))
+ break;
+ else
+ in_wildcard = false;
+ }
+ beginword += pg_mblen(beginword);
+ }
+ *charlen = 0;
+ if (!in_wildcard)
+ {
+ if (LPADDING > 0)
+ {
+ *s++ = ' ';
+ (*charlen)++;
+ if (LPADDING > 1)
+ {
+ *s++ = ' ';
+ (*charlen)++;
+ }
+ }
+ }
+ if (beginword - str >= lenstr)
+ return NULL;
+
+ endword = beginword;
+ in_wildcard = false;
+ in_escape = false;
+ while (endword - str < lenstr)
+ {
+ clen = pg_mblen(endword);
+ if (in_escape)
+ {
+ in_escape = false;
+ in_wildcard = false;
+ if (iswordchr(endword))
+ {
+ (*charlen)++;
+ memcpy(s, endword, clen);
+ s += clen;
+ }
+ else
+ break;
+ }
+ else
+ {
+ if (ISESCAPECHAR(endword))
+ in_escape = true;
+ else if (ISWILDCARDCHAR(endword))
+ {
+ in_wildcard = true;
+ break;
+ }
+ else if (iswordchr(endword))
+ {
+ (*charlen)++;
+ memcpy(s, endword, clen);
+ s += clen;
+ }
+ else
+ {
+ in_wildcard = false;
+ break;
+ }
+ }
+ endword += clen;
+ }
+ if (!in_wildcard)
+ {
+ if (RPADDING > 0)
+ {
+ *s++ = ' ';
+ (*charlen)++;
+ if (RPADDING > 1)
+ {
+ *s++ = ' ';
+ (*charlen)++;
+ }
+ }
+ }
+ *bytelen = s - buf;
+ return endword;
+ }
+
+ TRGM *
+ generate_wildcard_trgm(char *str, int slen)
+ {
+ TRGM *trg;
+ char *buf,
+ *buf2;
+ trgm *tptr;
+ int len,
+ charlen,
+ bytelen;
+ char *eword;
+
+ trg = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) *3);
+ trg->flag = ARRKEY;
+ SET_VARSIZE(trg, TRGMHDRSIZE);
+
+ if (slen + LPADDING + RPADDING < 3 || slen == 0)
+ return trg;
+
+ tptr = GETARR(trg);
+
+ buf = palloc(sizeof(char) * (slen + 4));
+
+ eword = str;
+ while ((eword = get_wildcard_part(eword, slen - (eword - str),
+ buf, &bytelen, &charlen)) != NULL)
+ {
+ #ifdef IGNORECASE
+ buf2 = lowerstr_with_len(buf, bytelen);
+ bytelen = strlen(buf2);
+ #else
+ buf2 = buf;
+ #endif
+
+ /*
+ * count trigrams
+ */
+ tptr = make_trigrams(tptr, buf, bytelen, charlen);
+ #ifdef IGNORECASE
+ pfree(buf2);
+ #endif
+ }
+
+ pfree(buf);
+
+ if ((len = tptr - GETARR(trg)) == 0)
+ return trg;
+
+ if (len > 0)
+ {
+ qsort((void *) GETARR(trg), len, sizeof(trgm), comp_trgm);
+ len = unique_array(GETARR(trg), len);
+ }
+
+ SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));
+
+ return trg;
+ }
+
uint32
trgm2int(trgm *ptr)
{
***************
*** 340,345 **** cnt_sml(TRGM *trg1, TRGM *trg2)
--- 503,544 ----
}
+ bool
+ trgm_contain(TRGM *trg1, TRGM *trg2)
+ {
+ trgm *ptr1,
+ *ptr2;
+ int count = 0;
+ int len1,
+ len2;
+
+ ptr1 = GETARR(trg1);
+ ptr2 = GETARR(trg2);
+
+ len1 = ARRNELEM(trg1);
+ len2 = ARRNELEM(trg2);
+
+ while (ptr1 - GETARR(trg1) < len1 && ptr2 - GETARR(trg2) < len2)
+ {
+ int res = CMPTRGM(ptr1, ptr2);
+
+ if (res < 0)
+ return false;
+ else if (res > 0)
+ ptr2++;
+ else
+ {
+ ptr1++;
+ ptr2++;
+ count++;
+ }
+ }
+ if (ptr1 - GETARR(trg1) < len1)
+ return false;
+ else
+ return true;
+ }
+
PG_FUNCTION_INFO_V1(similarity);
Datum similarity(PG_FUNCTION_ARGS);
Datum
*** a/contrib/pg_trgm/uninstall_pg_trgm.sql
--- b/contrib/pg_trgm/uninstall_pg_trgm.sql
***************
*** 27,35 **** DROP OPERATOR CLASS gin_trgm_ops USING gin;
DROP FUNCTION gin_extract_trgm(text, internal);
! DROP FUNCTION gin_extract_trgm(text, internal, int2, internal, internal);
! DROP FUNCTION gin_trgm_consistent(internal, int2, text, int4, internal, internal);
DROP OPERATOR % (text, text);
--- 27,35 ----
DROP FUNCTION gin_extract_trgm(text, internal);
! DROP FUNCTION gin_extract_query_trgm(text, internal, int2, internal, internal, internal, internal);
! DROP FUNCTION gin_trgm_consistent(internal, int2, text, int4, internal, internal, internal, internal);
DROP OPERATOR % (text, text);
On 08/01/11 23:37, Alexander Korotkov wrote:
I updated my patch to make it use full index scan in GIN index which is
possible thanks to recent Tom Lane patch. Now wildcard, where no trigram can
be extracted from, invokes full index scan, which is slow but correct.
Hi,
unfortunately, this change made the patch not apply:
http://git.postgresql.org/gitweb?p=postgresql.git;a=commit;h=be0c3ea2d30ba225f0249ae88d6b0bdf3b753162
I'm getting rejects in trgm_gin.c. Could you update the patch please?
Cheers,
Jan
Hi,
Here is updated version of this patch.
----
With best regards,
Alexander Korotkov.
Attachments:
trgm_wildcard-0.3.patchtext/x-patch; charset=US-ASCII; name=trgm_wildcard-0.3.patchDownload
*** a/contrib/pg_trgm/pg_trgm.sql.in
--- b/contrib/pg_trgm/pg_trgm.sql.in
***************
*** 113,118 **** FOR TYPE text USING gist
--- 113,120 ----
AS
OPERATOR 1 % (text, text),
OPERATOR 2 <-> (text, text) FOR ORDER BY pg_catalog.float_ops,
+ OPERATOR 3 ~~ (text, text),
+ OPERATOR 4 ~~* (text, text),
FUNCTION 1 gtrgm_consistent (internal, text, int, oid, internal),
FUNCTION 2 gtrgm_union (bytea, internal),
FUNCTION 3 gtrgm_compress (internal),
***************
*** 129,140 **** RETURNS internal
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT;
! CREATE OR REPLACE FUNCTION gin_extract_trgm(text, internal, int2, internal, internal)
RETURNS internal
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT;
! CREATE OR REPLACE FUNCTION gin_trgm_consistent(internal, int2, text, int4, internal, internal)
RETURNS bool
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT;
--- 131,142 ----
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT;
! CREATE OR REPLACE FUNCTION gin_extract_query_trgm(text, internal, int2, internal, internal, internal, internal)
RETURNS internal
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT;
! CREATE OR REPLACE FUNCTION gin_trgm_consistent(internal, int2, text, int4, internal, internal, internal, internal)
RETURNS bool
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT;
***************
*** 144,151 **** CREATE OPERATOR CLASS gin_trgm_ops
FOR TYPE text USING gin
AS
OPERATOR 1 % (text, text),
FUNCTION 1 btint4cmp (int4, int4),
FUNCTION 2 gin_extract_trgm (text, internal),
! FUNCTION 3 gin_extract_trgm (text, internal, int2, internal, internal),
! FUNCTION 4 gin_trgm_consistent (internal, int2, text, int4, internal, internal),
STORAGE int4;
--- 146,155 ----
FOR TYPE text USING gin
AS
OPERATOR 1 % (text, text),
+ OPERATOR 3 ~~ (text, text),
+ OPERATOR 4 ~~* (text, text),
FUNCTION 1 btint4cmp (int4, int4),
FUNCTION 2 gin_extract_trgm (text, internal),
! FUNCTION 3 gin_extract_query_trgm (text, internal, int2, internal, internal, internal, internal),
! FUNCTION 4 gin_trgm_consistent (internal, int2, text, int4, internal, internal, internal, internal),
STORAGE int4;
*** a/contrib/pg_trgm/trgm.h
--- b/contrib/pg_trgm/trgm.h
***************
*** 19,24 ****
--- 19,26 ----
/* operator strategy numbers */
#define SimilarityStrategyNumber 1
#define DistanceStrategyNumber 2
+ #define LikeStrategyNumber 3
+ #define ILikeStrategyNumber 4
typedef char trgm[3];
***************
*** 53,59 **** typedef struct
/* gist */
#define BITBYTE 8
! #define SIGLENINT 3 /* >122 => key will toast, so very slow!!! */
#define SIGLEN ( sizeof(int)*SIGLENINT )
#define SIGLENBIT (SIGLEN*BITBYTE - 1) /* see makesign */
--- 55,61 ----
/* gist */
#define BITBYTE 8
! #define SIGLENINT 15 /* >122 => key will toast, so very slow!!! */
#define SIGLEN ( sizeof(int)*SIGLENINT )
#define SIGLENBIT (SIGLEN*BITBYTE - 1) /* see makesign */
***************
*** 89,94 **** typedef char *BITVECP;
--- 91,101 ----
extern float4 trgm_limit;
TRGM *generate_trgm(char *str, int slen);
+ TRGM *generate_wildcard_trgm(char *str, int slen);
float4 cnt_sml(TRGM *trg1, TRGM *trg2);
+ bool trgm_contain(TRGM *trg1, TRGM *trg2);
+
+ #define ISESCAPECHAR(x) (*(x) == '\\')
+ #define ISWILDCARDCHAR(x) (*(x) == '_' || *(x) == '%')
#endif /* __TRGM_H__ */
*** a/contrib/pg_trgm/trgm_gin.c
--- b/contrib/pg_trgm/trgm_gin.c
***************
*** 6,11 ****
--- 6,12 ----
#include "trgm.h"
#include "access/gin.h"
+ #include "access/skey.h"
#include "access/itup.h"
#include "access/tuptoaster.h"
#include "storage/bufpage.h"
***************
*** 16,21 ****
--- 17,25 ----
PG_FUNCTION_INFO_V1(gin_extract_trgm);
Datum gin_extract_trgm(PG_FUNCTION_ARGS);
+ PG_FUNCTION_INFO_V1(gin_extract_query_trgm);
+ Datum gin_extract_query_trgm(PG_FUNCTION_ARGS);
+
PG_FUNCTION_INFO_V1(gin_trgm_consistent);
Datum gin_trgm_consistent(PG_FUNCTION_ARGS);
***************
*** 58,90 **** gin_extract_trgm(PG_FUNCTION_ARGS)
}
Datum
gin_trgm_consistent(PG_FUNCTION_ARGS)
{
bool *check = (bool *) PG_GETARG_POINTER(0);
! /* StrategyNumber strategy = PG_GETARG_UINT16(1); */
/* text *query = PG_GETARG_TEXT_P(2); */
! int32 nkeys = PG_GETARG_INT32(3);
! /* Pointer *extra_data = (Pointer *) PG_GETARG_POINTER(4); */
bool *recheck = (bool *) PG_GETARG_POINTER(5);
bool res = FALSE;
int32 i,
! ntrue = 0;
/* All cases served by this function are inexact */
*recheck = true;
! /* Count the matches */
! for (i = 0; i < nkeys; i++)
{
! if (check[i])
! ntrue++;
! }
#ifdef DIVUNION
! res = (nkeys == ntrue) ? true : ((((((float4) ntrue) / ((float4) (nkeys - ntrue)))) >= trgm_limit) ? true : false);
#else
! res = (nkeys == 0) ? false : ((((((float4) ntrue) / ((float4) nkeys))) >= trgm_limit) ? true : false);
#endif
!
PG_RETURN_BOOL(res);
}
--- 62,173 ----
}
Datum
+ gin_extract_query_trgm(PG_FUNCTION_ARGS)
+ {
+ text *val = (text *) PG_GETARG_TEXT_P(0);
+ int32 *nentries = (int32 *) PG_GETARG_POINTER(1);
+ StrategyNumber strategy = PG_GETARG_UINT16(2);
+ Datum *entries = NULL;
+ TRGM *trg;
+ int4 trglen;
+ int32 **extra_data = (int32 **) PG_GETARG_POINTER(4);
+ int32 *searchMode = (int32 *)PG_GETARG_POINTER(6);
+ trgm *ptr;
+ int4 i = 0,
+ item;
+
+ switch (strategy)
+ {
+ case SimilarityStrategyNumber:
+ trg = generate_trgm(VARDATA(val), VARSIZE(val) - VARHDRSZ);
+ break;
+ case LikeStrategyNumber:
+ case ILikeStrategyNumber:
+ trg = generate_wildcard_trgm(VARDATA(val), VARSIZE(val) - VARHDRSZ);
+ break;
+ default:
+ elog(ERROR, "unrecognized strategy number: %d", strategy);
+ trg = NULL; /* keep compiler quiet */
+ break;
+ }
+ trglen = ARRNELEM(trg);
+
+ *nentries = (int32) trglen;
+
+ if (trglen > 0)
+ {
+ entries = (Datum *) palloc(sizeof(Datum) * trglen);
+ ptr = GETARR(trg);
+ while (ptr - GETARR(trg) < ARRNELEM(trg))
+ {
+ item = trgm2int(ptr);
+ entries[i++] = Int32GetDatum(item);
+
+ ptr++;
+ }
+ }
+
+ *extra_data = (int32 *) palloc0(sizeof(int32));
+ **extra_data = trglen;
+
+ if (trglen == 0)
+ *searchMode = GIN_SEARCH_MODE_ALL;
+
+ PG_RETURN_POINTER(entries);
+ }
+
+ Datum
gin_trgm_consistent(PG_FUNCTION_ARGS)
{
bool *check = (bool *) PG_GETARG_POINTER(0);
! StrategyNumber strategy = PG_GETARG_UINT16(1);
/* text *query = PG_GETARG_TEXT_P(2); */
! /* int32 nkeys = PG_GETARG_INT32(3); */
! int32 *extra_data = (int32 *) PG_GETARG_POINTER(4);
bool *recheck = (bool *) PG_GETARG_POINTER(5);
bool res = FALSE;
int32 i,
! ntrue = 0,
! trglen;
+ #ifndef IGNORECASE
+ if (strategy == ILIKE_STRATEGY)
+ {
+ elog(ERROR, "Can't do ILIKE_STRATEGY with case-sensetive trigrams.");
+ }
+ #endif
/* All cases served by this function are inexact */
*recheck = true;
! trglen = *extra_data;
!
! switch (strategy)
{
! case SimilarityStrategyNumber:
! for (i = 0; i < trglen; i++)
! if (check[i])
! ntrue++;
#ifdef DIVUNION
! res = (trglen == ntrue) ? true : ((((((float4) ntrue) / ((float4) (trglen - ntrue)))) >= trgm_limit) ? true : false);
#else
! res = (trglen == 0) ? false : ((((((float4) ntrue) / ((float4) trglen))) >= trgm_limit) ? true : false);
#endif
! break;
! case LikeStrategyNumber:
! case ILikeStrategyNumber:
! res = true;
! for (i = 0; i < trglen; i++)
! if (!check[i])
! {
! res = false;
! break;
! }
! break;
! default:
! elog(ERROR, "unrecognized strategy number: %d", strategy);
! res = false; /* keep compiler quiet */
! break;
! }
PG_RETURN_BOOL(res);
}
*** a/contrib/pg_trgm/trgm_gist.c
--- b/contrib/pg_trgm/trgm_gist.c
***************
*** 195,225 **** gtrgm_consistent(PG_FUNCTION_ARGS)
TRGM *key = (TRGM *) DatumGetPointer(entry->key);
TRGM *qtrg;
bool res;
! char *cache = (char *) fcinfo->flinfo->fn_extra;
!
! /* All cases served by this function are exact */
! *recheck = false;
! if (cache == NULL || VARSIZE(cache) != VARSIZE(query) || memcmp(cache, query, VARSIZE(query)) != 0)
{
! qtrg = generate_trgm(VARDATA(query), VARSIZE(query) - VARHDRSZ);
if (cache)
pfree(cache);
fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
! MAXALIGN(VARSIZE(query)) + VARSIZE(qtrg));
cache = (char *) fcinfo->flinfo->fn_extra;
! memcpy(cache, query, VARSIZE(query));
! memcpy(cache + MAXALIGN(VARSIZE(query)), qtrg, VARSIZE(qtrg));
}
! qtrg = (TRGM *) (cache + MAXALIGN(VARSIZE(query)));
switch (strategy)
{
case SimilarityStrategyNumber:
if (GIST_LEAF(entry))
{ /* all leafs contains orig trgm */
float4 tmpsml = cnt_sml(key, qtrg);
--- 195,248 ----
TRGM *key = (TRGM *) DatumGetPointer(entry->key);
TRGM *qtrg;
bool res;
! char *cache = (char *) fcinfo->flinfo->fn_extra,
! *cacheContents = cache + MAXALIGN(sizeof(StrategyNumber));
! #ifndef IGNORECASE
! if (strategy == ILIKE_STRATEGY)
! {
! elog(ERROR, "Can't do ILIKE_STRATEGY with case-sensetive trigrams.");
! }
! #endif
! if (cache == NULL || strategy != *((StrategyNumber *)cache) ||
! VARSIZE(cacheContents) != VARSIZE(query) ||
! memcmp(cacheContents, query, VARSIZE(query)) != 0)
{
! switch (strategy)
! {
! case SimilarityStrategyNumber:
! qtrg = generate_trgm(VARDATA(query), VARSIZE(query) - VARHDRSZ);
! break;
! case LikeStrategyNumber:
! case ILikeStrategyNumber:
! qtrg = generate_wildcard_trgm(VARDATA(query), VARSIZE(query) - VARHDRSZ);
! break;
! default:
! elog(ERROR, "unrecognized strategy number: %d", strategy);
! qtrg = NULL; /* keep compiler quiet */
! break;
! }
if (cache)
pfree(cache);
fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
! MAXALIGN(sizeof(StrategyNumber)) + MAXALIGN(VARSIZE(query)) + VARSIZE(qtrg));
cache = (char *) fcinfo->flinfo->fn_extra;
+ cacheContents = cache + MAXALIGN(sizeof(StrategyNumber));
! memcpy(cache, &strategy, sizeof(StrategyNumber));
! memcpy(cacheContents, query, VARSIZE(query));
! memcpy(cacheContents + MAXALIGN(VARSIZE(query)),
! qtrg, VARSIZE(qtrg));
}
! qtrg = (TRGM *) (cacheContents + MAXALIGN(VARSIZE(query)));
switch (strategy)
{
case SimilarityStrategyNumber:
+ *recheck = false;
if (GIST_LEAF(entry))
{ /* all leafs contains orig trgm */
float4 tmpsml = cnt_sml(key, qtrg);
***************
*** 242,247 **** gtrgm_consistent(PG_FUNCTION_ARGS)
--- 265,298 ----
res = (((((float8) count) / ((float8) len))) >= trgm_limit) ? true : false;
}
break;
+ case LikeStrategyNumber:
+ case ILikeStrategyNumber:
+ *recheck = true;
+ if (GIST_LEAF(entry))
+ { /* all leafs contains orig trgm */
+ res = trgm_contain(qtrg, key);
+ }
+ else if (ISALLTRUE(key))
+ { /* non-leaf contains signature */
+ res = true;
+ }
+ else
+ { /* non-leaf contains signature */
+ int4 k, tmp = 0, len = ARRNELEM(qtrg);
+ trgm *ptr = GETARR(qtrg);
+ BITVECP sign = GETSIGN(key);
+ res = true;
+ for (k = 0; k < len; k++)
+ {
+ CPTRGM(((char *) &tmp), ptr + k);
+ if (!GETBIT(sign, HASHVAL(tmp)))
+ {
+ res = false;
+ break;
+ }
+ }
+ }
+ break;
default:
elog(ERROR, "unrecognized strategy number: %d", strategy);
res = false; /* keep compiler quiet */
*** a/contrib/pg_trgm/trgm_op.c
--- b/contrib/pg_trgm/trgm_op.c
***************
*** 236,241 **** generate_trgm(char *str, int slen)
--- 236,404 ----
return trg;
}
+ static char *
+ get_wildcard_part(char *str, int lenstr, char *buf, int *bytelen, int *charlen)
+ {
+ char *beginword = str, *endword, *s = buf;
+ bool in_wildcard = false, in_escape = false;
+ int clen;
+
+ while (beginword - str < lenstr)
+ {
+ if (in_escape)
+ {
+ in_escape = false;
+ in_wildcard = false;
+ if (iswordchr(beginword)) break;
+ }
+ else
+ {
+ if (ISESCAPECHAR(beginword))
+ in_escape = true;
+ else if (ISWILDCARDCHAR(beginword))
+ in_wildcard = true;
+ else if (iswordchr(beginword))
+ break;
+ else
+ in_wildcard = false;
+ }
+ beginword += pg_mblen(beginword);
+ }
+ *charlen = 0;
+ if (!in_wildcard)
+ {
+ if (LPADDING > 0)
+ {
+ *s++ = ' ';
+ (*charlen)++;
+ if (LPADDING > 1)
+ {
+ *s++ = ' ';
+ (*charlen)++;
+ }
+ }
+ }
+ if (beginword - str >= lenstr)
+ return NULL;
+
+ endword = beginword;
+ in_wildcard = false;
+ in_escape = false;
+ while (endword - str < lenstr)
+ {
+ clen = pg_mblen(endword);
+ if (in_escape)
+ {
+ in_escape = false;
+ in_wildcard = false;
+ if (iswordchr(endword))
+ {
+ (*charlen)++;
+ memcpy(s, endword, clen);
+ s += clen;
+ }
+ else
+ break;
+ }
+ else
+ {
+ if (ISESCAPECHAR(endword))
+ in_escape = true;
+ else if (ISWILDCARDCHAR(endword))
+ {
+ in_wildcard = true;
+ break;
+ }
+ else if (iswordchr(endword))
+ {
+ (*charlen)++;
+ memcpy(s, endword, clen);
+ s += clen;
+ }
+ else
+ {
+ in_wildcard = false;
+ break;
+ }
+ }
+ endword += clen;
+ }
+ if (!in_wildcard)
+ {
+ if (RPADDING > 0)
+ {
+ *s++ = ' ';
+ (*charlen)++;
+ if (RPADDING > 1)
+ {
+ *s++ = ' ';
+ (*charlen)++;
+ }
+ }
+ }
+ *bytelen = s - buf;
+ return endword;
+ }
+
+ TRGM *
+ generate_wildcard_trgm(char *str, int slen)
+ {
+ TRGM *trg;
+ char *buf,
+ *buf2;
+ trgm *tptr;
+ int len,
+ charlen,
+ bytelen;
+ char *eword;
+
+ trg = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) *3);
+ trg->flag = ARRKEY;
+ SET_VARSIZE(trg, TRGMHDRSIZE);
+
+ if (slen + LPADDING + RPADDING < 3 || slen == 0)
+ return trg;
+
+ tptr = GETARR(trg);
+
+ buf = palloc(sizeof(char) * (slen + 4));
+
+ eword = str;
+ while ((eword = get_wildcard_part(eword, slen - (eword - str),
+ buf, &bytelen, &charlen)) != NULL)
+ {
+ #ifdef IGNORECASE
+ buf2 = lowerstr_with_len(buf, bytelen);
+ bytelen = strlen(buf2);
+ #else
+ buf2 = buf;
+ #endif
+
+ /*
+ * count trigrams
+ */
+ tptr = make_trigrams(tptr, buf, bytelen, charlen);
+ #ifdef IGNORECASE
+ pfree(buf2);
+ #endif
+ }
+
+ pfree(buf);
+
+ if ((len = tptr - GETARR(trg)) == 0)
+ return trg;
+
+ if (len > 0)
+ {
+ qsort((void *) GETARR(trg), len, sizeof(trgm), comp_trgm);
+ len = unique_array(GETARR(trg), len);
+ }
+
+ SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));
+
+ return trg;
+ }
+
uint32
trgm2int(trgm *ptr)
{
***************
*** 340,345 **** cnt_sml(TRGM *trg1, TRGM *trg2)
--- 503,544 ----
}
+ bool
+ trgm_contain(TRGM *trg1, TRGM *trg2)
+ {
+ trgm *ptr1,
+ *ptr2;
+ int count = 0;
+ int len1,
+ len2;
+
+ ptr1 = GETARR(trg1);
+ ptr2 = GETARR(trg2);
+
+ len1 = ARRNELEM(trg1);
+ len2 = ARRNELEM(trg2);
+
+ while (ptr1 - GETARR(trg1) < len1 && ptr2 - GETARR(trg2) < len2)
+ {
+ int res = CMPTRGM(ptr1, ptr2);
+
+ if (res < 0)
+ return false;
+ else if (res > 0)
+ ptr2++;
+ else
+ {
+ ptr1++;
+ ptr2++;
+ count++;
+ }
+ }
+ if (ptr1 - GETARR(trg1) < len1)
+ return false;
+ else
+ return true;
+ }
+
PG_FUNCTION_INFO_V1(similarity);
Datum similarity(PG_FUNCTION_ARGS);
Datum
*** a/contrib/pg_trgm/uninstall_pg_trgm.sql
--- b/contrib/pg_trgm/uninstall_pg_trgm.sql
***************
*** 27,35 **** DROP OPERATOR CLASS gin_trgm_ops USING gin;
DROP FUNCTION gin_extract_trgm(text, internal);
! DROP FUNCTION gin_extract_trgm(text, internal, int2, internal, internal);
! DROP FUNCTION gin_trgm_consistent(internal, int2, text, int4, internal, internal);
DROP OPERATOR % (text, text);
--- 27,35 ----
DROP FUNCTION gin_extract_trgm(text, internal);
! DROP FUNCTION gin_extract_query_trgm(text, internal, int2, internal, internal, internal, internal);
! DROP FUNCTION gin_trgm_consistent(internal, int2, text, int4, internal, internal, internal, internal);
DROP OPERATOR % (text, text);