*** varlena.c 2008-08-31 20:31:56.000000000 +0100 --- varlena.c 2008-09-07 02:47:52.000000000 +0100 *************** *** 40,45 **** --- 40,47 ---- pg_wchar *wstr2; /* note: these are palloc'd */ int len1; /* string lengths in logical characters */ int len2; + int skiptable[256]; /* for Boyer-Moore-Horspool searching, must be 256 elements. */ + int skiptablemask; /* mask for ANDing with skiptable */ } TextPositionState; #define DatumGetUnknownP(X) ((unknown *) PG_DETOAST_DATUM(X)) *************** *** 795,800 **** --- 797,803 ---- return result; } + /* * text_position_setup, text_position_next, text_position_cleanup - * Component steps of text_position() *************** *** 804,811 **** * called multiple times with increasing values of start_pos, which is * the 1-based character position to start the search from. The "state" * variable is normally just a local variable in the caller. */ - static void text_position_setup(text *t1, text *t2, TextPositionState *state) { --- 807,815 ---- * called multiple times with increasing values of start_pos, which is * the 1-based character position to start the search from. The "state" * variable is normally just a local variable in the caller. + * + * David Rowley: 2008-09-06 -- Updated for Boyer-Moore-Horspool searching */ static void text_position_setup(text *t1, text *t2, TextPositionState *state) { *************** *** 838,902 **** state->len1 = len1; state->len2 = len2; } ! } ! ! static int text_position_next(int start_pos, TextPositionState *state) { ! int pos = 0, ! p, ! px; ! ! Assert(start_pos > 0); /* else caller error */ ! ! if (state->len2 <= 0) ! return start_pos; /* result for empty pattern */ ! ! if (!state->use_wchar) ! { ! /* simple case - single byte encoding */ ! char *p1 = state->str1; ! char *p2 = state->str2; ! ! /* no use in searching str past point where search_str will fit */ ! px = (state->len1 - state->len2); ! ! p1 += start_pos - 1; ! ! for (p = start_pos - 1; p <= px; p++) ! { ! if ((*p1 == *p2) && (strncmp(p1, p2, state->len2) == 0)) ! { ! pos = p + 1; ! break; ! } ! p1++; ! } ! } ! else ! { ! /* not as simple - multibyte encoding */ ! pg_wchar *p1 = state->wstr1; ! pg_wchar *p2 = state->wstr2; ! ! /* no use in searching str past point where search_str will fit */ ! px = (state->len1 - state->len2); - p1 += start_pos - 1; - for (p = start_pos - 1; p <= px; p++) - { - if ((*p1 == *p2) && (pg_wchar_strncmp(p1, p2, state->len2) == 0)) - { - pos = p + 1; - break; - } - p1++; - } - } - return pos; - } static void text_position_cleanup(TextPositionState *state) --- 842,1005 ---- state->len1 = len1; state->len2 = len2; } ! /* ! * If the needle is bigger than the haystack then there ! * is no point in wasting cycles initalizing anything else. ! */ ! if (state->len2 < state->len1) ! { ! int ai; ! int tablemask; ! int lneedle; ! ! /* Prepare the skip table for Boyer-Moore-Horspool searching. First we ! * must determine how much of the skip table to use. The declaration of ! * TextPositionState allows up to 256 elements, but with haystack lengths ! * of only a few characters we don't really want to have to initialize so ! * many elements --- it would take too long in comparison to the actual ! * search time. So we choose a useful skip table size based on the ! * haystack length minus the needle length. The closer the needle length ! * is to the haystack length the less useful skipping becomes. ! * ! * Note: tablemask must be N^2-1 and no more than 255. ! */ ! if ((state->len1 - state->len2) < 16) ! tablemask = 3; ! else if ((state->len1 - state->len2) < 64) ! tablemask = 7; ! else if ((state->len1 - state->len2) < 128) ! tablemask = 15; ! else if ((state->len1 - state->len2) < 512) ! tablemask = 31; ! else if ((state->len1 - state->len2) < 2048) ! tablemask = 63; ! else if ((state->len1 - state->len2) < 4096) ! tablemask = 127; ! else ! tablemask = 255; ! ! lneedle = state->len2; ! state->skiptablemask = tablemask; ! ! /* Initalize the skip table. We set all elements to the needle length */ ! for (ai = 0; ai <= tablemask; ai++) ! state->skiptable[ai] = lneedle; ! ! /* We do not process the last character in the needle */ ! lneedle--; ! ! if (!state->use_wchar) ! { ! for (ai = 0; ai < lneedle; ai++) ! state->skiptable[(unsigned char) state->str2[ai] & tablemask] = lneedle - ai; ! } ! else ! { ! for (ai = 0; ai < lneedle; ai++) ! state->skiptable[state->wstr2[ai] & tablemask] = lneedle - ai; ! } ! } ! } ! ! /* text_position_next ! * David Rowley 2008-09-06 ! * Uses Boyer-Moore-Horspool searching ! */ ! int text_position_next(int start_pos, TextPositionState *state) { ! int haystack_len = state->len1; ! int needle_len = state->len2; ! int skipmask = state->skiptablemask; ! ! Assert(start_pos > 0); /* else caller error */ ! ! if (needle_len <= 0) ! return start_pos; /* Empty needle */ ! ! start_pos--; /* Change for zero based arrays */ ! /* ! * Eliminate the impossible first. The needle is ! * too big for the haystack. We've likely not setup ! * the skip table when this happens anyway. ! */ ! if (haystack_len + start_pos < needle_len) ! return 0; ! ! Assert((skipmask & 255) == skipmask && skipmask >= 3); /* Must be N^2-1, min size 3 */ ! ! if (!state->use_wchar) ! { ! char *nptr; ! char *hptr; ! char *p; ! char *needle = state->str2; ! char *haystack = state->str1; ! ! /* Start at startpos plus the length of the needle */ ! hptr = &haystack[needle_len - 1 + start_pos]; ! while (hptr < &haystack[haystack_len]) ! { ! nptr = &needle[needle_len - 1]; /* Point to the end of the needle */ ! p = hptr; ! ! while (*nptr == *p) ! { ! /* Do we have it? Return 1 based array pos */ ! if (nptr-- == needle) ! return p - haystack + 1; ! p--; ! } ! /* Ask the skiptable where to look next. If it ! * finds a match then we align the two ~matching~ ! * characters and start another search there. ! * Else we skip a whole needle length. ! * Of course the ~matching~ characters only have the ! * same hash value, the character value may be different. ! */ ! hptr += state->skiptable[(unsigned char) *hptr & skipmask]; ! } ! return 0; /* Not found */ ! } ! else ! { ! /* The multibyte char version. This works exactly the same way. */ ! pg_wchar *nptr; ! pg_wchar *hptr; ! pg_wchar *p; ! pg_wchar *needle = state->wstr2; ! pg_wchar *haystack = state->wstr1; ! ! /* Start at start_pos plus the length of the needle */ ! hptr = &haystack[needle_len - 1 + start_pos]; ! while (hptr < &haystack[haystack_len]) ! { ! nptr = &needle[needle_len - 1]; /* Point to the end of the needle */ ! p = hptr; ! ! while (*nptr == *p) ! { ! /* Do we have it? Return 1 based array pos */ ! if (nptr-- == needle) ! return p - haystack + 1; ! p--; ! } ! /* Ask the skiptable where to look next. If it ! * finds a match then we align the two ~matching~ ! * characters and start another search there. ! * Else we skip a whole needle length. ! * Of course the ~matching~ characters only have the ! * same hash value, the character value may be different. ! */ ! hptr += state->skiptable[*hptr & skipmask]; ! } ! return 0; /* Not found */ ! } ! } ! static void text_position_cleanup(TextPositionState *state)