diff -cpr DropMBMatchTextIC/src/backend/utils/adt/like.c UTF8MatchText/src/backend/utils/adt/like.c
*** DropMBMatchTextIC/src/backend/utils/adt/like.c Mon Apr 9 11:22:26 2007
--- UTF8MatchText/src/backend/utils/adt/like.c Mon Apr 9 11:23:37 2007
*************** static text *do_like_escape(text *, text
*** 36,41 ****
--- 36,42 ----
  
  static int MBMatchText(char *t, int tlen, char *p, int plen);
  static text *MB_do_like_escape(text *, text *);
+ static int UTF8MatchText(char *t, int tlen, char *p, int plen);	/* UTF8 variant of MBMatchText */
  
  static int GenericMatchText(char *s, int slen, char* p, int plen);
  static int mbtexticlike(text *str, text *pat);
*************** MatchTextIC(char *t, int tlen, char *p,
*** 189,199 ****
--- 190,299 ----
  
  #include "like_match.c"
  
+ /* Set up for UTF-8 characters: CHAREQ compares via wchareq(), NextChar steps by pg_utf_mblen() bytes */
+ #define CHAREQ(p1, p2) wchareq(p1, p2)
+ #define NextChar(p, plen) \
+ 	do { int __l = pg_utf_mblen(p); (p) +=__l; (plen) -=__l; } while (0)
+ 
+ /*
+  * UTF8MatchText -- specialized version of MBMatchText for UTF8; matches text t against LIKE pattern p
+  */
+ static int			/* LIKE_TRUE, LIKE_FALSE or LIKE_ABORT */
+ UTF8MatchText(char *t, int tlen, char *p, int plen)	/* tlen, plen: remaining lengths of t and p, counted down as input is consumed */
+ {
+ 	/* Fast path for match-everything pattern: a lone '%' */
+ 	if ((plen == 1) && (*p == '%'))
+ 		return LIKE_TRUE;
+ 
+ 	while ((tlen > 0) && (plen > 0))
+ 	{
+ 		if (*p == '\\')
+ 		{
+ 			/* Next pattern char must match literally, whatever it is */
+ 			NextByte(p, plen);	/* skip the backslash (NextByte is defined outside this patch; presumably a one-byte step) */
+ 			if ((plen <= 0) || !CHAREQ(t, p))	/* pattern ended after the backslash, or escaped char differs from text char */
+ 				return LIKE_FALSE;
+ 		}
+ 		else if (*p == '%')
+ 		{
+ 			/* %% is the same as % according to the SQL standard */
+ 			/* Advance past all %'s */
+ 			while ((plen > 0) && (*p == '%'))
+ 				NextByte(p, plen);
+ 			/* Trailing percent matches everything. */
+ 			if (plen <= 0)
+ 				return LIKE_TRUE;
+ 
+ 			/*
+ 			 * Otherwise, scan for a text position at which we can match the
+ 			 * rest of the pattern.
+ 			 */
+ 			while (tlen > 0)
+ 			{
+ 				/*
+ 				 * Optimization to prevent most recursion: don't recurse
+ 				 * unless first pattern char might match this text char.
+ 				 */
+ 				if (CHAREQ(t, p) || (*p == '\\') || (*p == '_'))	/* escapes and '_' always recurse */
+ 				{
+ 					int matched = UTF8MatchText(t, tlen, p, plen);
+ 
+ 					if (matched != LIKE_FALSE)
+ 						return matched; /* TRUE or ABORT */
+ 				}
+ 
+ 				NextChar(t, tlen);	/* retry pg_utf_mblen(t) bytes further into the text */
+ 			}
+ 
+ 			/*
+ 			 * End of text with no match, so no point in trying later places
+ 			 * to start matching this pattern.
+ 			 */
+ 			return LIKE_ABORT;
+ 		}
+ 		else if (*p == '_')
+ 		{
+ 			NextChar(t, tlen);	/* '_' consumes pg_utf_mblen(t) bytes of text */
+ 			NextByte(p, plen);	/* step past the '_' in the pattern */
+ 			continue;
+ 		}
+ 		else if (!BYTEEQ(t, p))	/* NOTE(review): byte-wise compare presumably relies on UTF-8 multibyte sequences never containing ASCII bytes such as '%', '_' or '\\' -- confirm */
+ 		{
+ 			/*
+ 			 * Not the single-character wildcard and no explicit match? Then
+ 			 * time to quit...
+ 			 */
+ 			return LIKE_FALSE;
+ 		}
+ 
+ 		NextByte(t, tlen);	/* reached after a literal match and after an escaped match alike */
+ 		NextByte(p, plen);	/* any further bytes of the character are then checked by BYTEEQ on later iterations */
+ 	}
+ 
+ 	if (tlen > 0)
+ 		return LIKE_FALSE; /* end of pattern, but not of text */
+ 
+ 	/* End of input string. Do we have matching pattern remaining? */
+ 	while ((plen > 0) && (*p == '%'))	/* allow multiple %'s at end of
+ 										 * pattern */
+ 		NextByte(p, plen);
+ 	if (plen <= 0)
+ 		return LIKE_TRUE;
+ 
+ 	/*
+ 	 * End of text with no match, so no point in trying later places to start
+ 	 * matching this pattern.
+ 	 */
+ 	return LIKE_ABORT;
+ }
+ 
  static __inline__ int
  GenericMatchText(char *s, int slen, char* p, int plen)
  {
  	if (pg_database_encoding_max_length() == 1)
  		return MatchText(s, slen, p, plen);
+ 	else if (GetDatabaseEncoding() == PG_UTF8)	/* max length is not 1 and the database encoding is UTF8 */
+ 		return UTF8MatchText(s, slen, p, plen);
  	else
  		return MBMatchText(s, slen, p, plen);
  }
*************** mbtexticlike(text *str, text *pat)
*** 214,220 ****
  	p = VARDATA(pat);
  	plen = (VARSIZE(pat) - VARHDRSZ);
  
! 	return MBMatchText(s, slen, p, plen);
  }
  
  /* And some support for BYTEA */
--- 314,323 ----
  	p = VARDATA(pat);
  	plen = (VARSIZE(pat) - VARHDRSZ);
  
! 	if (GetDatabaseEncoding() == PG_UTF8)	/* same UTF8 dispatch as GenericMatchText, without its single-byte branch */
! 		return UTF8MatchText(s, slen, p, plen);
! 	else
! 		return MBMatchText(s, slen, p, plen);
  }
  
  /* And some support for BYTEA */