diff --git a/contrib/pg_trgm/trgm_regexp.c b/contrib/pg_trgm/trgm_regexp.c new file mode 100644 index c4dfdf2..72949e4 *** a/contrib/pg_trgm/trgm_regexp.c --- b/contrib/pg_trgm/trgm_regexp.c *************** *** 122,130 **** * thousands of trigrams would be slow, and would likely produce so many * false positives that we would have to traverse a large fraction of the * index, the graph is simplified further in a lossy fashion by removing ! * color trigrams until the number of trigrams after expansion is below ! * the MAX_TRGM_COUNT threshold. When a color trigram is removed, the states ! * connected by any arcs labelled with that trigram are merged. * * 4) Pack the graph into a compact representation * ----------------------------------------------- --- 122,142 ---- * thousands of trigrams would be slow, and would likely produce so many * false positives that we would have to traverse a large fraction of the * index, the graph is simplified further in a lossy fashion by removing ! * color trigrams. Also, trigrams itself have not equivalent value: some of ! * them are more frequent and some of them are less frequent. Wishfully, we ! * would like to know the distribution of trigrams, but we don't. But because ! * of padding we for sure know that empty character is more frequent than ! * others. Assuming that we divide trigrams into classes according to presence ! * of whitespace. Their average frequencies relative to average frequency ! * of trigrams without whitespace are calculated by real-life texts, called ! * classes penalties and stored into "penalties" array. Then we introduce ! * penalty of color trigram which is multiplication of number of its trigrams ! * count and its class penalty. While removing color trigrams we're trying to ! * minimize it's summary penalty, but the restriction is total number of ! * trigrams. We would like to achieve WISH_TRGM_PENALTY summary penalty and ! * assume fail if we don't reach MAX_TRGM_COUNT threshold for total number of ! * trigrams. When a color trigram is removed, the states connected by any arcs ! * labelled with that trigram are merged. * * 4) Pack the graph into a compact representation * ----------------------------------------------- *************** *** 199,211 **** --- 211,240 ---- * MAX_EXPANDED_STATES - How many states we allow in expanded graph * MAX_EXPANDED_ARCS - How many arcs we allow in expanded graph * MAX_TRGM_COUNT - How many simple trigrams we allow to be extracted + * WISH_TRGM_PENALTY - Desired sum of color trigrams penalties * COLOR_COUNT_LIMIT - Maximum number of characters per color */ #define MAX_EXPANDED_STATES 128 #define MAX_EXPANDED_ARCS 1024 #define MAX_TRGM_COUNT 256 + #define WISH_TRGM_PENALTY 16 #define COLOR_COUNT_LIMIT 256 + /* + * Penalty of trigrams depending on whitespace presence. Based on analysis of + * real-life texts. + */ + const float4 penalties[8] = { + 1.0, /* "aaa" */ + 3.5, /* "aa " */ + 0.0, /* "a a" (impossible) */ + 0.0, /* "a " (impossible) */ + 4.2, /* " aa" */ + 2.1, /* " a " */ + 25.0, /* " a" */ + 1.0 /* " " (impossible) */ + }; + /* Struct representing a single pg_wchar, converted back to multibyte form */ typedef struct { *************** typedef struct *** 339,344 **** --- 368,374 ---- ColorTrgm ctrgm; int number; int count; + float4 penalty; bool expanded; List *arcs; } ColorTrgmInfo; *************** static TRGM *expandColorTrigrams(TrgmNFA *** 459,465 **** static void fillTrgm(trgm *ptrgm, trgm_mb_char s[3]); static void mergeStates(TrgmState *state1, TrgmState *state2); static int colorTrgmInfoCmp(const void *p1, const void *p2); ! static int colorTrgmInfoCountCmp(const void *p1, const void *p2); static TrgmPackedGraph *packGraph(TrgmNFA *trgmNFA, MemoryContext rcontext); static int packArcInfoCmp(const void *a1, const void *a2); --- 489,495 ---- static void fillTrgm(trgm *ptrgm, trgm_mb_char s[3]); static void mergeStates(TrgmState *state1, TrgmState *state2); static int colorTrgmInfoCmp(const void *p1, const void *p2); ! static int colorTrgmInfoPenaltyCmp(const void *p1, const void *p2); static TrgmPackedGraph *packGraph(TrgmNFA *trgmNFA, MemoryContext rcontext); static int packArcInfoCmp(const void *a1, const void *a2); *************** selectColorTrigrams(TrgmNFA *trgmNFA) *** 1424,1429 **** --- 1454,1460 ---- TrgmState *state; ColorTrgmInfo *colorTrgms; int64 totalTrgmCount; + float4 totalTrgmPenalty; int number; /* Collect color trigrams from all arcs */ *************** selectColorTrigrams(TrgmNFA *trgmNFA) *** 1490,1528 **** * overflow an int, so we use int64 for that within this routine. */ totalTrgmCount = 0; for (i = 0; i < trgmNFA->colorTrgmsCount; i++) { ColorTrgmInfo *trgmInfo = &colorTrgms[i]; int j, ! count = 1; for (j = 0; j < 3; j++) { TrgmColor c = trgmInfo->ctrgm.colors[j]; ! if (c != COLOR_BLANK) count *= trgmNFA->colorInfo[c].wordCharsCount; } trgmInfo->count = count; totalTrgmCount += count; } ! /* Sort color trigrams in descending order of simple trigram counts */ qsort(colorTrgms, trgmNFA->colorTrgmsCount, sizeof(ColorTrgmInfo), ! colorTrgmInfoCountCmp); /* ! * Remove color trigrams from the graph so long as total number of simple ! * trigrams exceeds MAX_TRGM_COUNT. We prefer to remove color trigrams ! * with the most associated simple trigrams, since those are the most ! * promising for reducing the total number of simple trigrams. When ! * removing a color trigram we have to merge states connected by arcs ! * labeled with that trigram. It's necessary to not merge initial and ! * final states, because our graph becomes useless if that happens; so we ! * cannot always remove the trigram we'd prefer to. */ for (i = 0; ! (i < trgmNFA->colorTrgmsCount) && (totalTrgmCount > MAX_TRGM_COUNT); i++) { ColorTrgmInfo *trgmInfo = &colorTrgms[i]; --- 1521,1568 ---- * overflow an int, so we use int64 for that within this routine. */ totalTrgmCount = 0; + totalTrgmPenalty = 0.0f; for (i = 0; i < trgmNFA->colorTrgmsCount; i++) { ColorTrgmInfo *trgmInfo = &colorTrgms[i]; int j, ! count = 1, ! typeIndex = 0; for (j = 0; j < 3; j++) { TrgmColor c = trgmInfo->ctrgm.colors[j]; ! typeIndex *= 2; ! if (c == COLOR_BLANK) ! typeIndex++; ! else count *= trgmNFA->colorInfo[c].wordCharsCount; } trgmInfo->count = count; + trgmInfo->penalty = (float4)count * penalties[typeIndex]; totalTrgmCount += count; + totalTrgmPenalty += trgmInfo->penalty; + } ! /* Sort color trigrams in descending order of their penalties */ qsort(colorTrgms, trgmNFA->colorTrgmsCount, sizeof(ColorTrgmInfo), ! colorTrgmInfoPenaltyCmp); /* ! * Remove color trigrams from the graph so long as total penalty of color ! * trigrams exceeds WISH_TRGM_PENALTY. But if we can't reach ! * WISH_TRGM_PENALTY it's OK to be in MAX_TRGM_COUNT. We prefer to remove ! * color trigrams with higher penalty, since those are the most promising ! * for reducing the total penalty. When removing a color trigram we have ! * to merge states connected by arcs labeled with that trigram. It's ! * necessary to not merge initial and final states, because our graph ! * becomes useless if that happens; so we cannot always remove the trigram ! * we'd prefer to. */ for (i = 0; ! (i < trgmNFA->colorTrgmsCount) && (totalTrgmPenalty > WISH_TRGM_PENALTY); i++) { ColorTrgmInfo *trgmInfo = &colorTrgms[i]; *************** selectColorTrigrams(TrgmNFA *trgmNFA) *** 1572,1585 **** /* Mark trigram unexpanded, and update totalTrgmCount */ trgmInfo->expanded = false; totalTrgmCount -= trgmInfo->count; } ! /* Did we succeed in fitting into MAX_TRGM_COUNT? */ if (totalTrgmCount > MAX_TRGM_COUNT) return false; ! trgmNFA->totalTrgmCount = (int) totalTrgmCount; /* * Sort color trigrams by colors (will be useful for bsearch in packGraph) --- 1612,1626 ---- /* Mark trigram unexpanded, and update totalTrgmCount */ trgmInfo->expanded = false; + totalTrgmPenalty -= trgmInfo->penalty; totalTrgmCount -= trgmInfo->count; } ! /* Did we succeed in fitting into MAX_TRGM_SIZE? */ if (totalTrgmCount > MAX_TRGM_COUNT) return false; ! trgmNFA->totalTrgmCount = totalTrgmCount; /* * Sort color trigrams by colors (will be useful for bsearch in packGraph) *************** colorTrgmInfoCmp(const void *p1, const v *** 1749,1768 **** * their simple trigrams counts. */ static int ! colorTrgmInfoCountCmp(const void *p1, const void *p2) { ! const ColorTrgmInfo *c1 = (const ColorTrgmInfo *) p1; ! const ColorTrgmInfo *c2 = (const ColorTrgmInfo *) p2; ! if (c1->count < c2->count) return 1; ! else if (c1->count == c2->count) return 0; else return -1; } - /*--------------------- * Subroutines for packing the graph into final representation (stage 4). *--------------------- --- 1790,1808 ---- * their simple trigrams counts. */ static int ! colorTrgmInfoPenaltyCmp(const void *p1, const void *p2) { ! float4 penalty1 = ((const ColorTrgmInfo *) p1)->penalty; ! float4 penalty2 = ((const ColorTrgmInfo *) p2)->penalty; ! if (penalty1 < penalty2) return 1; ! else if (penalty1 == penalty2) return 0; else return -1; } /*--------------------- * Subroutines for packing the graph into final representation (stage 4). *---------------------