Remove 1MB size limit in tsvector
Hello, hackers!
Historically tsvector type can't hold more than 1MB data.
I want to propose a patch that removes that limit.
That limit is created by 'pos' field from WordEntry, which have only
20 bits for storage.
In the proposed patch I removed this field and instead of it I keep
offsets only at each Nth item in WordEntry's array. Now I set N as 4,
because it gave best results in my benchmarks. It can be increased in
the future without affecting already saved data in database. Also
removing the field improves compression of tsvectors.
I simplified the code by creating functions that can be used to
build tsvectors. There were duplicated code fragments in places where
tsvector was built.
Also new patch frees some space in WordEntry that can be used to
save some additional information about saved words.
-
---
Ildus Kurbangaliev
Postgres Professional: http://www.postgrespro.com
Russian Postgres Company
Attachments:
tsvector_stretched_v1.patchtext/x-patchDownload
diff --git a/src/backend/tsearch/Makefile b/src/backend/tsearch/Makefile
index 34fe4c5b3c..9585a25003 100644
--- a/src/backend/tsearch/Makefile
+++ b/src/backend/tsearch/Makefile
@@ -26,7 +26,7 @@ DICTFILES_PATH=$(addprefix dicts/,$(DICTFILES))
OBJS = ts_locale.o ts_parse.o wparser.o wparser_def.o dict.o \
dict_simple.o dict_synonym.o dict_thesaurus.o \
dict_ispell.o regis.o spell.o \
- to_tsany.o ts_selfuncs.o ts_typanalyze.o ts_utils.o
+ to_tsany.o ts_selfuncs.o ts_typanalyze.o ts_utils.o ts_compat.o
include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/tsearch/to_tsany.c b/src/backend/tsearch/to_tsany.c
index 35d9ab276c..d66b69baea 100644
--- a/src/backend/tsearch/to_tsany.c
+++ b/src/backend/tsearch/to_tsany.c
@@ -156,13 +156,10 @@ TSVector
make_tsvector(ParsedText *prs)
{
int i,
- j,
lenstr = 0,
- totallen;
+ totallen,
+ stroff = 0;
TSVector in;
- WordEntry *ptr;
- char *str;
- int stroff;
/* Merge duplicate words */
if (prs->curwords > 0)
@@ -171,12 +168,8 @@ make_tsvector(ParsedText *prs)
/* Determine space needed */
for (i = 0; i < prs->curwords; i++)
{
- lenstr += prs->words[i].len;
- if (prs->words[i].alen)
- {
- lenstr = SHORTALIGN(lenstr);
- lenstr += sizeof(uint16) + prs->words[i].pos.apos[0] * sizeof(WordEntryPos);
- }
+ int npos = prs->words[i].alen ? prs->words[i].pos.apos[0] : 0;
+ INCRSIZE(lenstr, i, prs->words[i].len, npos);
}
if (lenstr > MAXSTRPOS)
@@ -187,41 +180,21 @@ make_tsvector(ParsedText *prs)
totallen = CALCDATASIZE(prs->curwords, lenstr);
in = (TSVector) palloc0(totallen);
SET_VARSIZE(in, totallen);
- in->size = prs->curwords;
+ TS_SETCOUNT(in, prs->curwords);
- ptr = ARRPTR(in);
- str = STRPTR(in);
- stroff = 0;
for (i = 0; i < prs->curwords; i++)
{
- ptr->len = prs->words[i].len;
- ptr->pos = stroff;
- memcpy(str + stroff, prs->words[i].word, prs->words[i].len);
- stroff += prs->words[i].len;
- pfree(prs->words[i].word);
+ int npos = 0;
if (prs->words[i].alen)
- {
- int k = prs->words[i].pos.apos[0];
- WordEntryPos *wptr;
+ npos = prs->words[i].pos.apos[0];
- if (k > 0xFFFF)
- elog(ERROR, "positions array too long");
+ tsvector_addlexeme(in, i, &stroff, prs->words[i].word, prs->words[i].len,
+ prs->words[i].pos.apos + 1, npos);
- ptr->haspos = 1;
- stroff = SHORTALIGN(stroff);
- *(uint16 *) (str + stroff) = (uint16) k;
- wptr = POSDATAPTR(in, ptr);
- for (j = 0; j < k; j++)
- {
- WEP_SETWEIGHT(wptr[j], 0);
- WEP_SETPOS(wptr[j], prs->words[i].pos.apos[j + 1]);
- }
- stroff += sizeof(uint16) + k * sizeof(WordEntryPos);
+ pfree(prs->words[i].word);
+ if (prs->words[i].alen)
pfree(prs->words[i].pos.apos);
- }
- else
- ptr->haspos = 0;
- ptr++;
+
}
if (prs->words)
@@ -251,7 +224,6 @@ to_tsvector_byid(PG_FUNCTION_ARGS)
PG_FREE_IF_COPY(in, 1);
out = make_tsvector(&prs);
-
PG_RETURN_TSVECTOR(out);
}
diff --git a/src/backend/tsearch/ts_compat.c b/src/backend/tsearch/ts_compat.c
new file mode 100644
index 0000000000..bb2a62eaf7
--- /dev/null
+++ b/src/backend/tsearch/ts_compat.c
@@ -0,0 +1,83 @@
+#include "postgres.h"
+#include "tsearch/ts_type.h"
+
+/*
+ * Definition of old WordEntry struct in TSVector. Because of limitations
+ * in size (max 1MB for lexemes), the format has changed
+ */
+typedef struct
+{
+ uint32
+ haspos:1,
+ len:11,
+ pos:20;
+} OldWordEntry;
+
+typedef struct
+{
+ uint16 npos;
+ WordEntryPos pos[FLEXIBLE_ARRAY_MEMBER];
+} OldWordEntryPosVector;
+
+#define OLDSTRPTR(x) ( (char *) &(x)->entries[x->size_] )
+#define _OLDPOSVECPTR(x, e) \
+ ((OldWordEntryPosVector *)(STRPTR(x) + SHORTALIGN((e)->pos + (e)->len)))
+#define OLDPOSDATALEN(x,e) ( ( (e)->haspos ) ? (_OLDPOSVECPTR(x,e)->npos) : 0 )
+#define OLDPOSDATAPTR(x,e) (_OLDPOSVECPTR(x,e)->pos)
+
+/*
+ * Converts tsvector with the old structure to current.
+ * @orig - tsvector to convert,
+ * @copy - return copy of tsvector, it has a meaning when tsvector doensn't
+ * need to be converted.
+ */
+TSVector
+tsvector_upgrade(Datum orig, bool copy)
+{
+ int i,
+ dataoff = 0,
+ datalen = 0,
+ totallen;
+ TSVector in,
+ out;
+
+ in = (TSVector) PG_DETOAST_DATUM(orig);
+
+ /* If already in new format, return as is */
+ if (in->size_ & TS_FLAG_STRETCHED)
+ {
+ TSVector out;
+
+ if (!copy)
+ return in;
+
+ out = (TSVector) palloc(VARSIZE(in));
+ memcpy(out, in, VARSIZE(in));
+ return out;
+ }
+
+ /*
+ * Calculate required size.
+ * We don't check any sizes here because old format was limited with 1MB
+ */
+ for (i = 0; i < in->size_; i++)
+ {
+ OldWordEntry *entry = (OldWordEntry *) (in->entries + i);
+ INCRSIZE(datalen, i, entry->len, OLDPOSDATALEN(in, entry));
+ }
+
+ totallen = CALCDATASIZE(in->size_, datalen);
+ out = (TSVector) palloc0(totallen);
+ SET_VARSIZE(out, totallen);
+ TS_SETCOUNT(out, in->size_);
+
+ for (i = 0; i < in->size_; i++)
+ {
+ OldWordEntry *entry = (OldWordEntry *) (in->entries + i);
+ tsvector_addlexeme(out, i, &dataoff,
+ OLDSTRPTR(in) + entry->pos, entry->len,
+ OLDPOSDATAPTR(in, entry), OLDPOSDATALEN(in, entry));
+ }
+
+ return out;
+}
diff --git a/src/backend/tsearch/ts_typanalyze.c b/src/backend/tsearch/ts_typanalyze.c
index 320c7f1a61..9b2fc4be04 100644
--- a/src/backend/tsearch/ts_typanalyze.c
+++ b/src/backend/tsearch/ts_typanalyze.c
@@ -202,7 +202,8 @@ compute_tsvector_stats(VacAttrStats *stats,
TSVector vector;
WordEntry *curentryptr;
char *lexemesptr;
- int j;
+ int j,
+ pos;
vacuum_delay_point();
@@ -236,7 +237,9 @@ compute_tsvector_stats(VacAttrStats *stats,
*/
lexemesptr = STRPTR(vector);
curentryptr = ARRPTR(vector);
- for (j = 0; j < vector->size; j++)
+
+ INITPOS(pos);
+ for (j = 0; j < TS_COUNT(vector); j++)
{
bool found;
@@ -246,8 +249,8 @@ compute_tsvector_stats(VacAttrStats *stats,
* make a copy of it. This way we can free the tsvector value
* once we've processed all its lexemes.
*/
- hash_key.lexeme = lexemesptr + curentryptr->pos;
- hash_key.length = curentryptr->len;
+ hash_key.lexeme = lexemesptr + pos;
+ hash_key.length = ENTRY_LEN(vector, curentryptr);
/* Lookup current lexeme in hashtable, adding it if new */
item = (TrackItem *) hash_search(lexemes_tab,
@@ -280,7 +283,7 @@ compute_tsvector_stats(VacAttrStats *stats,
}
/* Advance to the next WordEntry in the tsvector */
- curentryptr++;
+ INCRPTR(vector, curentryptr, pos);
}
/* If the vector was toasted, free the detoasted copy. */
diff --git a/src/backend/utils/adt/tsginidx.c b/src/backend/utils/adt/tsginidx.c
index 83a939dfd5..fc39dfcb3f 100644
--- a/src/backend/utils/adt/tsginidx.c
+++ b/src/backend/utils/adt/tsginidx.c
@@ -67,23 +67,26 @@ gin_extract_tsvector(PG_FUNCTION_ARGS)
TSVector vector = PG_GETARG_TSVECTOR(0);
int32 *nentries = (int32 *) PG_GETARG_POINTER(1);
Datum *entries = NULL;
+ int tscount = TS_COUNT(vector);
- *nentries = vector->size;
- if (vector->size > 0)
+ *nentries = tscount;
+ if (tscount > 0)
{
int i;
- WordEntry *we = ARRPTR(vector);
+ uint32 pos;
- entries = (Datum *) palloc(sizeof(Datum) * vector->size);
+ WordEntry *we = ARRPTR(vector);
+ entries = (Datum *) palloc(sizeof(Datum) * tscount);
- for (i = 0; i < vector->size; i++)
+ INITPOS(pos);
+ for (i = 0; i < tscount; i++)
{
text *txt;
- txt = cstring_to_text_with_len(STRPTR(vector) + we->pos, we->len);
+ txt = cstring_to_text_with_len(STRPTR(vector) + pos,
+ ENTRY_LEN(vector, we));
entries[i] = PointerGetDatum(txt);
-
- we++;
+ INCRPTR(vector, we, pos);
}
}
diff --git a/src/backend/utils/adt/tsgistidx.c b/src/backend/utils/adt/tsgistidx.c
index 7ce2699b5c..18d3de3725 100644
--- a/src/backend/utils/adt/tsgistidx.c
+++ b/src/backend/utils/adt/tsgistidx.c
@@ -192,28 +192,33 @@ gtsvector_compress(PG_FUNCTION_ARGS)
int32 *arr;
WordEntry *ptr = ARRPTR(val);
char *words = STRPTR(val);
+ const int tscount = TS_COUNT(val);
+ uint32 pos;
- len = CALCGTSIZE(ARRKEY, val->size);
+ len = CALCGTSIZE(ARRKEY, tscount);
res = (SignTSVector *) palloc(len);
SET_VARSIZE(res, len);
res->flag = ARRKEY;
arr = GETARR(res);
- len = val->size;
+ len = tscount;
+
+ INITPOS(pos);
while (len--)
{
pg_crc32 c;
INIT_LEGACY_CRC32(c);
- COMP_LEGACY_CRC32(c, words + ptr->pos, ptr->len);
+ COMP_LEGACY_CRC32(c, words + pos, ENTRY_LEN(val, ptr));
FIN_LEGACY_CRC32(c);
*arr = *(int32 *) &c;
arr++;
- ptr++;
+
+ INCRPTR(val, ptr, pos);
}
- len = uniqueint(GETARR(res), val->size);
- if (len != val->size)
+ len = uniqueint(GETARR(res), tscount);
+ if (len != tscount)
{
/*
* there is a collision of hash-function; len is always less than
diff --git a/src/backend/utils/adt/tsrank.c b/src/backend/utils/adt/tsrank.c
index 4577bcc0b8..26252ca353 100644
--- a/src/backend/utils/adt/tsrank.c
+++ b/src/backend/utils/adt/tsrank.c
@@ -53,43 +53,38 @@ word_distance(int32 w)
static int
cnt_length(TSVector t)
{
- WordEntry *ptr = ARRPTR(t),
- *end = (WordEntry *) STRPTR(t);
- int len = 0;
+ int i,
+ len = 0;
- while (ptr < end)
+ for (i = 0; i < TS_COUNT(t); i++)
{
- int clen = POSDATALEN(t, ptr);
-
- if (clen == 0)
- len += 1;
- else
- len += clen;
-
- ptr++;
+ WordEntry *entry = UNWRAP_ENTRY(t, ARRPTR(t) + i);
+ Assert(!entry->hasoff);
+ len += (entry->npos == 0) ? 1 : entry->npos;
}
return len;
}
-#define WordECompareQueryItem(e,q,p,i,m) \
- tsCompareString((q) + (i)->distance, (i)->length, \
- (e) + (p)->pos, (p)->len, (m))
-
-
/*
* Returns a pointer to a WordEntry's array corresponding to 'item' from
* tsvector 't'. 'q' is the TSQuery containing 'item'.
* Returns NULL if not found.
*/
-static WordEntry *
+static int
find_wordentry(TSVector t, TSQuery q, QueryOperand *item, int32 *nitem)
{
- WordEntry *StopLow = ARRPTR(t);
- WordEntry *StopHigh = (WordEntry *) STRPTR(t);
- WordEntry *StopMiddle = StopHigh;
- int difference;
+#define WordECompareQueryItem(s,l,q,i,m) \
+ tsCompareString((q) + (i)->distance, (i)->length, \
+ s, l, (m))
+
+ int StopLow = 0;
+ int StopHigh = TS_COUNT(t);
+ int StopMiddle = StopHigh;
+ int difference;
+ char *lexeme;
+ WordEntry *we;
*nitem = 0;
@@ -97,7 +92,12 @@ find_wordentry(TSVector t, TSQuery q, QueryOperand *item, int32 *nitem)
while (StopLow < StopHigh)
{
StopMiddle = StopLow + (StopHigh - StopLow) / 2;
- difference = WordECompareQueryItem(STRPTR(t), GETOPERAND(q), StopMiddle, item, false);
+ lexeme = tsvector_getlexeme(t, StopMiddle, &we);
+
+ Assert(!we->hasoff);
+ difference = WordECompareQueryItem(lexeme, we->len,
+ GETOPERAND(q), item, false);
+
if (difference == 0)
{
StopHigh = StopMiddle;
@@ -117,18 +117,22 @@ find_wordentry(TSVector t, TSQuery q, QueryOperand *item, int32 *nitem)
*nitem = 0;
- while (StopMiddle < (WordEntry *) STRPTR(t) &&
- WordECompareQueryItem(STRPTR(t), GETOPERAND(q), StopMiddle, item, true) == 0)
+ while (StopMiddle < TS_COUNT(t))
{
+ lexeme = tsvector_getlexeme(t, StopMiddle, &we);
+
+ Assert(!we->hasoff);
+ if (WordECompareQueryItem(lexeme, we->len, GETOPERAND(q), item, true) != 0)
+ break;
+
(*nitem)++;
StopMiddle++;
}
}
- return (*nitem > 0) ? StopHigh : NULL;
+ return (*nitem > 0) ? StopHigh : -1;
}
-
/*
* sort QueryOperands by (length, word)
*/
@@ -200,15 +204,13 @@ SortAndUniqItems(TSQuery q, int *size)
static float
calc_rank_and(const float *w, TSVector t, TSQuery q)
{
- WordEntryPosVector **pos;
- WordEntryPosVector1 posnull;
- WordEntryPosVector *POSNULL;
+ WordEntryPos **pos;
+ uint16 *npos;
+ WordEntryPos posnull[1] = {0};
int i,
k,
l,
p;
- WordEntry *entry,
- *firstentry;
WordEntryPos *post,
*ct;
int32 dimt,
@@ -225,41 +227,55 @@ calc_rank_and(const float *w, TSVector t, TSQuery q)
pfree(item);
return calc_rank_or(w, t, q);
}
- pos = (WordEntryPosVector **) palloc0(sizeof(WordEntryPosVector *) * q->size);
+ pos = (WordEntryPos **) palloc0(sizeof(WordEntryPos *) * q->size);
+ npos = (uint16 *) palloc0(sizeof(uint16) * q->size);
- /* A dummy WordEntryPos array to use when haspos is false */
- posnull.npos = 1;
- posnull.pos[0] = 0;
- WEP_SETPOS(posnull.pos[0], MAXENTRYPOS - 1);
- POSNULL = (WordEntryPosVector *) &posnull;
+ /* posnull is a dummy WordEntryPos array to use when npos == 0 */
+ WEP_SETPOS(posnull[0], MAXENTRYPOS - 1);
for (i = 0; i < size; i++)
{
- firstentry = entry = find_wordentry(t, q, item[i], &nitem);
- if (!entry)
+ int idx = find_wordentry(t, q, item[i], &nitem),
+ firstidx;
+
+ if (idx == -1)
continue;
- while (entry - firstentry < nitem)
+ firstidx = idx;
+
+ while (idx - firstidx < nitem)
{
- if (entry->haspos)
- pos[i] = _POSVECPTR(t, entry);
+ WordEntry *entry;
+
+ char *lexeme = tsvector_getlexeme(t, idx, &entry);
+
+ Assert(!entry->hasoff);
+ if (entry->npos)
+ {
+ pos[i] = POSDATAPTR(lexeme, entry->len);
+ npos[i] = entry->npos;
+ }
else
- pos[i] = POSNULL;
+ {
+ pos[i] = posnull;
+ npos[i] = 1;
+ }
+
+ post = pos[i];
+ dimt = npos[i];
- dimt = pos[i]->npos;
- post = pos[i]->pos;
for (k = 0; k < i; k++)
{
if (!pos[k])
continue;
- lenct = pos[k]->npos;
- ct = pos[k]->pos;
+ lenct = npos[k];
+ ct = pos[k];
for (l = 0; l < dimt; l++)
{
for (p = 0; p < lenct; p++)
{
dist = Abs((int) WEP_GETPOS(post[l]) - (int) WEP_GETPOS(ct[p]));
- if (dist || (dist == 0 && (pos[i] == POSNULL || pos[k] == POSNULL)))
+ if (dist || (dist == 0 && (pos[i] == posnull || pos[k] == posnull)))
{
float curw;
@@ -272,10 +288,11 @@ calc_rank_and(const float *w, TSVector t, TSQuery q)
}
}
- entry++;
+ idx++;
}
}
pfree(pos);
+ pfree(npos);
pfree(item);
return res;
}
@@ -283,9 +300,8 @@ calc_rank_and(const float *w, TSVector t, TSQuery q)
static float
calc_rank_or(const float *w, TSVector t, TSQuery q)
{
- WordEntry *entry,
- *firstentry;
- WordEntryPosVector1 posnull;
+ /* A dummy WordEntryPos array to use when lexeme hasn't positions */
+ WordEntryPos posnull[1] = {0};
WordEntryPos *post;
int32 dimt,
j,
@@ -295,33 +311,36 @@ calc_rank_or(const float *w, TSVector t, TSQuery q)
QueryOperand **item;
int size = q->size;
- /* A dummy WordEntryPos array to use when haspos is false */
- posnull.npos = 1;
- posnull.pos[0] = 0;
-
item = SortAndUniqItems(q, &size);
for (i = 0; i < size; i++)
{
+ int idx , firstidx;
float resj,
wjm;
- int32 jm;
+ int32 jm;
- firstentry = entry = find_wordentry(t, q, item[i], &nitem);
- if (!entry)
+ idx = find_wordentry(t, q, item[i], &nitem);
+ if (idx == -1)
continue;
- while (entry - firstentry < nitem)
+ firstidx = idx;
+
+ while (idx - firstidx < nitem)
{
- if (entry->haspos)
+ WordEntry *entry;
+ char *lexeme = tsvector_getlexeme(t, idx, &entry);
+
+ Assert(!entry->hasoff);
+ if (entry->npos)
{
- dimt = POSDATALEN(t, entry);
- post = POSDATAPTR(t, entry);
+ dimt = entry->npos;
+ post = POSDATAPTR(lexeme, entry->len);
}
else
{
- dimt = posnull.npos;
- post = posnull.pos;
+ dimt = 1;
+ post = posnull;
}
resj = 0.0;
@@ -345,7 +364,7 @@ calc_rank_or(const float *w, TSVector t, TSQuery q)
*/
res = res + (wjm + resj - wjm / ((jm + 1) * (jm + 1))) / 1.64493406685;
- entry++;
+ idx++;
}
}
if (size > 0)
@@ -361,7 +380,7 @@ calc_rank(const float *w, TSVector t, TSQuery q, int32 method)
float res = 0.0;
int len;
- if (!t->size || !q->size)
+ if (!TS_COUNT(t) || !q->size)
return 0.0;
/* XXX: What about NOT? */
@@ -373,7 +392,7 @@ calc_rank(const float *w, TSVector t, TSQuery q, int32 method)
if (res < 0)
res = 1e-20f;
- if ((method & RANK_NORM_LOGLENGTH) && t->size > 0)
+ if ((method & RANK_NORM_LOGLENGTH) && TS_COUNT(t) > 0)
res /= log((double) (cnt_length(t) + 1)) / log(2.0);
if (method & RANK_NORM_LENGTH)
@@ -385,11 +404,11 @@ calc_rank(const float *w, TSVector t, TSQuery q, int32 method)
/* RANK_NORM_EXTDIST not applicable */
- if ((method & RANK_NORM_UNIQ) && t->size > 0)
- res /= (float) (t->size);
+ if ((method & RANK_NORM_UNIQ) && TS_COUNT(t) > 0)
+ res /= (float) (TS_COUNT(t));
- if ((method & RANK_NORM_LOGUNIQ) && t->size > 0)
- res /= log((double) (t->size + 1)) / log(2.0);
+ if ((method & RANK_NORM_LOGUNIQ) && TS_COUNT(t) > 0)
+ res /= log((double) (TS_COUNT(t) + 1)) / log(2.0);
if (method & RANK_NORM_RDIVRPLUS1)
res /= (res + 1);
@@ -504,13 +523,13 @@ typedef struct
struct
{ /* compiled doc representation */
QueryItem **items;
- int16 nitem;
+ int32 nitem;
} query;
struct
{ /* struct is used for preparing doc
* representation */
QueryItem *item;
- WordEntry *entry;
+ int32 idx;
} map;
} data;
WordEntryPos pos;
@@ -526,10 +545,10 @@ compareDocR(const void *va, const void *vb)
{
if (WEP_GETWEIGHT(a->pos) == WEP_GETWEIGHT(b->pos))
{
- if (a->data.map.entry == b->data.map.entry)
+ if (a->data.map.idx == b->data.map.idx)
return 0;
- return (a->data.map.entry > b->data.map.entry) ? 1 : -1;
+ return (a->data.map.idx > b->data.map.idx) ? 1 : -1;
}
return (WEP_GETWEIGHT(a->pos) > WEP_GETWEIGHT(b->pos)) ? 1 : -1;
@@ -724,9 +743,6 @@ static DocRepresentation *
get_docrep(TSVector txt, QueryRepresentation *qr, int *doclen)
{
QueryItem *item = GETQUERY(qr->query);
- WordEntry *entry,
- *firstentry;
- WordEntryPos *post;
int32 dimt, /* number of 'post' items */
j,
i,
@@ -743,29 +759,38 @@ get_docrep(TSVector txt, QueryRepresentation *qr, int *doclen)
*/
for (i = 0; i < qr->query->size; i++)
{
- QueryOperand *curoperand;
+ int idx,
+ firstidx;
+ QueryOperand *curoperand;
+ WordEntryPos *post;
if (item[i].type != QI_VAL)
continue;
curoperand = &item[i].qoperand;
- firstentry = entry = find_wordentry(txt, qr->query, curoperand, &nitem);
- if (!entry)
+ idx = find_wordentry(txt, qr->query, curoperand, &nitem);
+ if (idx < 0)
continue;
+ firstidx = idx;
+
/* iterations over entries in tsvector */
- while (entry - firstentry < nitem)
+ while (idx - firstidx < nitem)
{
- if (entry->haspos)
+ WordEntry *entry;
+ char *lex = tsvector_getlexeme(txt, idx, &entry);
+
+ Assert(!entry->hasoff);
+ if (entry->npos)
{
- dimt = POSDATALEN(txt, entry);
- post = POSDATAPTR(txt, entry);
+ dimt = entry->npos;
+ post = POSDATAPTR(lex, entry->len);
}
else
{
/* ignore words without positions */
- entry++;
+ idx++;
continue;
}
@@ -782,13 +807,12 @@ get_docrep(TSVector txt, QueryRepresentation *qr, int *doclen)
curoperand->weight & (1 << WEP_GETWEIGHT(post[j])))
{
doc[cur].pos = post[j];
- doc[cur].data.map.entry = entry;
+ doc[cur].data.map.idx = idx;
doc[cur].data.map.item = (QueryItem *) curoperand;
cur++;
}
}
-
- entry++;
+ idx++;
}
}
@@ -814,7 +838,7 @@ get_docrep(TSVector txt, QueryRepresentation *qr, int *doclen)
while (rptr - doc < cur)
{
if (rptr->pos == (rptr - 1)->pos &&
- rptr->data.map.entry == (rptr - 1)->data.map.entry)
+ rptr->data.map.idx == (rptr - 1)->data.map.idx)
{
storage.data.query.items[storage.data.query.nitem] = rptr->data.map.item;
storage.data.query.nitem++;
@@ -917,7 +941,7 @@ calc_rank_cd(const float4 *arrdata, TSVector txt, TSQuery query, int method)
NExtent++;
}
- if ((method & RANK_NORM_LOGLENGTH) && txt->size > 0)
+ if ((method & RANK_NORM_LOGLENGTH) && TS_COUNT(txt) > 0)
Wdoc /= log((double) (cnt_length(txt) + 1));
if (method & RANK_NORM_LENGTH)
@@ -930,11 +954,11 @@ calc_rank_cd(const float4 *arrdata, TSVector txt, TSQuery query, int method)
if ((method & RANK_NORM_EXTDIST) && NExtent > 0 && SumDist > 0)
Wdoc /= ((double) NExtent) / SumDist;
- if ((method & RANK_NORM_UNIQ) && txt->size > 0)
- Wdoc /= (double) (txt->size);
+ if ((method & RANK_NORM_UNIQ) && TS_COUNT(txt) > 0)
+ Wdoc /= (double) (TS_COUNT(txt));
- if ((method & RANK_NORM_LOGUNIQ) && txt->size > 0)
- Wdoc /= log((double) (txt->size + 1)) / log(2.0);
+ if ((method & RANK_NORM_LOGUNIQ) && TS_COUNT(txt) > 0)
+ Wdoc /= log((double) (TS_COUNT(txt) + 1)) / log(2.0);
if (method & RANK_NORM_RDIVRPLUS1)
Wdoc /= (Wdoc + 1);
diff --git a/src/backend/utils/adt/tsvector.c b/src/backend/utils/adt/tsvector.c
index 6f66c1f58c..57f1de8d14 100644
--- a/src/backend/utils/adt/tsvector.c
+++ b/src/backend/utils/adt/tsvector.c
@@ -22,9 +22,9 @@
typedef struct
{
- WordEntry entry; /* must be first! */
- WordEntryPos *pos;
- int poslen; /* number of elements in pos */
+ WordEntry entry; /* must be first! */
+ size_t offset; /* offset of lexeme in some buffer */
+ WordEntryPos *pos;
} WordEntryIN;
@@ -79,14 +79,30 @@ uniquePos(WordEntryPos *a, int l)
/* Compare two WordEntryIN values for qsort */
static int
-compareentry(const void *va, const void *vb, void *arg)
+compareentry_in(const void *va, const void *vb, void *arg)
{
const WordEntryIN *a = (const WordEntryIN *) va;
const WordEntryIN *b = (const WordEntryIN *) vb;
char *BufferStr = (char *) arg;
- return tsCompareString(&BufferStr[a->entry.pos], a->entry.len,
- &BufferStr[b->entry.pos], b->entry.len,
+ return tsCompareString(&BufferStr[a->offset], a->entry.len,
+ &BufferStr[b->offset], b->entry.len,
+ false);
+}
+
+/* Compare two WordEntry values for qsort */
+static int
+compareentry(const void *va, const void *vb, void *arg)
+{
+ const WordEntry *a = (const WordEntry *) va;
+ const WordEntry *b = (const WordEntry *) vb;
+ TSVector tsv = (TSVector) arg;
+
+ uint32 offset1 = tsvector_getoffset(tsv, a - ARRPTR(tsv), NULL),
+ offset2 = tsvector_getoffset(tsv, b - ARRPTR(tsv), NULL);
+
+ return tsCompareString(STRPTR(tsv) + offset1, ENTRY_LEN(tsv, a),
+ STRPTR(tsv) + offset2, ENTRY_LEN(tsv, b),
false);
}
@@ -97,14 +113,15 @@ compareentry(const void *va, const void *vb, void *arg)
static int
uniqueentry(WordEntryIN *a, int l, char *buf, int *outbuflen)
{
- int buflen;
+ int buflen,
+ i = 0;
WordEntryIN *ptr,
*res;
Assert(l >= 1);
if (l > 1)
- qsort_arg((void *) a, l, sizeof(WordEntryIN), compareentry,
+ qsort_arg((void *) a, l, sizeof(WordEntryIN), compareentry_in,
(void *) buf);
buflen = 0;
@@ -112,67 +129,76 @@ uniqueentry(WordEntryIN *a, int l, char *buf, int *outbuflen)
ptr = a + 1;
while (ptr - a < l)
{
+ Assert(!ptr->entry.hasoff);
+
if (!(ptr->entry.len == res->entry.len &&
- strncmp(&buf[ptr->entry.pos], &buf[res->entry.pos],
- res->entry.len) == 0))
+ strncmp(&buf[ptr->offset], &buf[res->offset], res->entry.len) == 0))
{
/* done accumulating data into *res, count space needed */
+ buflen = SHORTALIGN(buflen);
+ if (i++ % TS_OFFSET_STRIDE == 0)
+ {
+ buflen = INTALIGN(buflen);
+ buflen += sizeof(WordEntry);
+ }
+
buflen += res->entry.len;
- if (res->entry.haspos)
+ if (res->entry.npos)
{
- res->poslen = uniquePos(res->pos, res->poslen);
+ res->entry.npos = uniquePos(res->pos, res->entry.npos);
buflen = SHORTALIGN(buflen);
- buflen += res->poslen * sizeof(WordEntryPos) + sizeof(uint16);
+ buflen += res->entry.npos * sizeof(WordEntryPos);
}
res++;
if (res != ptr)
- memcpy(res, ptr, sizeof(WordEntryIN));
+ *res = *ptr;
}
- else if (ptr->entry.haspos)
+ else if (ptr->entry.npos)
{
- if (res->entry.haspos)
+ if (res->entry.npos)
{
/* append ptr's positions to res's positions */
- int newlen = ptr->poslen + res->poslen;
+ int newlen = ptr->entry.npos + res->entry.npos;
res->pos = (WordEntryPos *)
repalloc(res->pos, newlen * sizeof(WordEntryPos));
- memcpy(&res->pos[res->poslen], ptr->pos,
- ptr->poslen * sizeof(WordEntryPos));
- res->poslen = newlen;
+ memcpy(&res->pos[res->entry.npos], ptr->pos,
+ ptr->entry.npos * sizeof(WordEntryPos));
+ res->entry.npos = newlen;
pfree(ptr->pos);
}
else
{
/* just give ptr's positions to pos */
- res->entry.haspos = 1;
+ res->entry.npos = ptr->entry.npos;
res->pos = ptr->pos;
- res->poslen = ptr->poslen;
}
}
ptr++;
}
/* count space needed for last item */
+ if (i % TS_OFFSET_STRIDE == 0)
+ {
+ buflen = INTALIGN(buflen);
+ buflen += sizeof(WordEntry);
+ }
+ else
+ buflen = SHORTALIGN(buflen);
+
buflen += res->entry.len;
- if (res->entry.haspos)
+
+ if (res->entry.npos)
{
- res->poslen = uniquePos(res->pos, res->poslen);
+ res->entry.npos = uniquePos(res->pos, res->entry.npos);
buflen = SHORTALIGN(buflen);
- buflen += res->poslen * sizeof(WordEntryPos) + sizeof(uint16);
+ buflen += res->entry.npos * sizeof(WordEntryPos);
}
*outbuflen = buflen;
- return res + 1 - a;
+ return res + 1 -a;
}
-static int
-WordEntryCMP(WordEntry *a, WordEntry *b, char *buf)
-{
- return compareentry(a, b, buf);
-}
-
-
Datum
tsvectorin(PG_FUNCTION_ARGS)
{
@@ -181,7 +207,6 @@ tsvectorin(PG_FUNCTION_ARGS)
WordEntryIN *arr;
int totallen;
int arrlen; /* allocated size of arr */
- WordEntry *inarr;
int len = 0;
TSVector in;
int i;
@@ -189,7 +214,6 @@ tsvectorin(PG_FUNCTION_ARGS)
int toklen;
WordEntryPos *pos;
int poslen;
- char *strbuf;
int stroff;
/*
@@ -238,23 +262,13 @@ tsvectorin(PG_FUNCTION_ARGS)
tmpbuf = (char *) repalloc((void *) tmpbuf, buflen);
cur = tmpbuf + dist;
}
+ arr[len].entry.hasoff = 0;
arr[len].entry.len = toklen;
- arr[len].entry.pos = cur - tmpbuf;
+ arr[len].offset = cur - tmpbuf;
+ arr[len].entry.npos = poslen;
+ arr[len].pos = (poslen != 0)? pos : NULL;
memcpy((void *) cur, (void *) token, toklen);
cur += toklen;
-
- if (poslen != 0)
- {
- arr[len].entry.haspos = 1;
- arr[len].pos = pos;
- arr[len].poslen = poslen;
- }
- else
- {
- arr[len].entry.haspos = 0;
- arr[len].pos = NULL;
- arr[len].poslen = 0;
- }
len++;
}
@@ -273,36 +287,18 @@ tsvectorin(PG_FUNCTION_ARGS)
totallen = CALCDATASIZE(len, buflen);
in = (TSVector) palloc0(totallen);
SET_VARSIZE(in, totallen);
- in->size = len;
- inarr = ARRPTR(in);
- strbuf = STRPTR(in);
+ TS_SETCOUNT(in, len);
stroff = 0;
for (i = 0; i < len; i++)
{
- memcpy(strbuf + stroff, &tmpbuf[arr[i].entry.pos], arr[i].entry.len);
- arr[i].entry.pos = stroff;
- stroff += arr[i].entry.len;
- if (arr[i].entry.haspos)
- {
- if (arr[i].poslen > 0xFFFF)
- elog(ERROR, "positions array too long");
-
- /* Copy number of positions */
- stroff = SHORTALIGN(stroff);
- *(uint16 *) (strbuf + stroff) = (uint16) arr[i].poslen;
- stroff += sizeof(uint16);
-
- /* Copy positions */
- memcpy(strbuf + stroff, arr[i].pos, arr[i].poslen * sizeof(WordEntryPos));
- stroff += arr[i].poslen * sizeof(WordEntryPos);
+ tsvector_addlexeme(in, i, &stroff, &tmpbuf[arr[i].offset],
+ arr[i].entry.len, arr[i].pos, arr[i].entry.npos);
+ if (arr[i].entry.npos)
pfree(arr[i].pos);
- }
- inarr[i] = arr[i].entry;
}
- Assert((strbuf + stroff - (char *) in) == totallen);
-
+ Assert((STRPTR(in) + stroff - (char *) in) == totallen);
PG_RETURN_TSVECTOR(in);
}
@@ -313,28 +309,36 @@ tsvectorout(PG_FUNCTION_ARGS)
char *outbuf;
int32 i,
lenbuf = 0,
- pp;
+ pp,
+ tscount = TS_COUNT(out);
+ uint32 pos;
WordEntry *ptr = ARRPTR(out);
char *curbegin,
*curin,
*curout;
- lenbuf = out->size * 2 /* '' */ + out->size - 1 /* space */ + 2 /* \0 */ ;
- for (i = 0; i < out->size; i++)
+ lenbuf = tscount * 2 /* '' */ + tscount - 1 /* space */ + 2 /* \0 */ ;
+ for (i = 0; i < tscount; i++)
{
- lenbuf += ptr[i].len * 2 * pg_database_encoding_max_length() /* for escape */ ;
- if (ptr[i].haspos)
- lenbuf += 1 /* : */ + 7 /* int2 + , + weight */ * POSDATALEN(out, &(ptr[i]));
+ int npos = ENTRY_NPOS(out, ptr + i);
+ lenbuf += ENTRY_LEN(out, ptr + i) * 2 * pg_database_encoding_max_length() /* for escape */ ;
+ if (npos)
+ lenbuf += 1 /* : */ + 7 /* int2 + , + weight */ * npos;
}
curout = outbuf = (char *) palloc(lenbuf);
- for (i = 0; i < out->size; i++)
+
+ INITPOS(pos);
+ for (i = 0; i < tscount; i++)
{
- curbegin = curin = STRPTR(out) + ptr->pos;
+ int lex_len = ENTRY_LEN(out, ptr),
+ npos = ENTRY_NPOS(out, ptr);
+
+ curbegin = curin = STRPTR(out) + pos;
if (i != 0)
*curout++ = ' ';
*curout++ = '\'';
- while (curin - curbegin < ptr->len)
+ while (curin - curbegin < lex_len)
{
int len = pg_mblen(curin);
@@ -348,12 +352,12 @@ tsvectorout(PG_FUNCTION_ARGS)
}
*curout++ = '\'';
- if ((pp = POSDATALEN(out, ptr)) != 0)
+ if ((pp = npos) != 0)
{
WordEntryPos *wptr;
*curout++ = ':';
- wptr = POSDATAPTR(out, ptr);
+ wptr = POSDATAPTR(curbegin, lex_len);
while (pp)
{
curout += sprintf(curout, "%d", WEP_GETPOS(*wptr));
@@ -379,7 +383,8 @@ tsvectorout(PG_FUNCTION_ARGS)
wptr++;
}
}
- ptr++;
+
+ INCRPTR(out, ptr, pos);
}
*curout = '\0';
@@ -406,35 +411,38 @@ tsvectorsend(PG_FUNCTION_ARGS)
StringInfoData buf;
int i,
j;
+ uint32 pos;
WordEntry *weptr = ARRPTR(vec);
pq_begintypsend(&buf);
+ pq_sendint(&buf, TS_COUNT(vec), sizeof(int32));
- pq_sendint(&buf, vec->size, sizeof(int32));
- for (i = 0; i < vec->size; i++)
+ INITPOS(pos);
+ for (i = 0; i < TS_COUNT(vec); i++)
{
- uint16 npos;
+ char *lexeme = STRPTR(vec) + pos;
+ int npos = ENTRY_NPOS(vec, weptr),
+ lex_len = ENTRY_LEN(vec, weptr);
/*
* the strings in the TSVector array are not null-terminated, so we
* have to send the null-terminator separately
*/
- pq_sendtext(&buf, STRPTR(vec) + weptr->pos, weptr->len);
+ pq_sendtext(&buf, lexeme, lex_len);
pq_sendbyte(&buf, '\0');
-
- npos = POSDATALEN(vec, weptr);
pq_sendint(&buf, npos, sizeof(uint16));
if (npos > 0)
{
- WordEntryPos *wepptr = POSDATAPTR(vec, weptr);
+ WordEntryPos *wepptr = POSDATAPTR(lexeme, lex_len);
for (j = 0; j < npos; j++)
pq_sendint(&buf, wepptr[j], sizeof(WordEntryPos));
}
- weptr++;
+ INCRPTR(vec, weptr, pos);
}
+ PG_FREE_IF_COPY(vec, 0);
PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
}
@@ -443,14 +451,16 @@ tsvectorrecv(PG_FUNCTION_ARGS)
{
StringInfo buf = (StringInfo) PG_GETARG_POINTER(0);
TSVector vec;
- int i;
- int32 nentries;
- int datalen; /* number of bytes used in the variable size
+ int i,
+ datalen; /* number of bytes used in the variable size
* area after fixed size TSVector header and
* WordEntries */
+ int32 nentries;
Size hdrlen;
Size len; /* allocated size of vec */
bool needSort = false;
+ char *prev_lexeme = NULL;
+ int prev_lex_len;
nentries = pq_getmsgint(buf, sizeof(int32));
if (nentries < 0 || nentries > (MaxAllocSize / sizeof(WordEntry)))
@@ -460,16 +470,17 @@ tsvectorrecv(PG_FUNCTION_ARGS)
len = hdrlen * 2; /* times two to make room for lexemes */
vec = (TSVector) palloc0(len);
- vec->size = nentries;
+ TS_SETCOUNT(vec, nentries);
datalen = 0;
for (i = 0; i < nentries; i++)
{
- const char *lexeme;
+ char *lexeme,
+ *lexeme_out;
uint16 npos;
- size_t lex_len;
+ int lex_len;
- lexeme = pq_getmsgstring(buf);
+ lexeme = (char *) pq_getmsgstring(buf);
npos = (uint16) pq_getmsgint(buf, sizeof(uint16));
/* sanity checks */
@@ -489,62 +500,42 @@ tsvectorrecv(PG_FUNCTION_ARGS)
*
* But make sure the buffer is large enough first.
*/
- while (hdrlen + SHORTALIGN(datalen + lex_len) +
- (npos + 1) * sizeof(WordEntryPos) >= len)
+ while (hdrlen + SHORTALIGN(datalen + lex_len) + sizeof(WordEntry) +
+ npos * sizeof(WordEntryPos) >= len)
{
len *= 2;
vec = (TSVector) repalloc(vec, len);
}
- vec->entries[i].haspos = (npos > 0) ? 1 : 0;
- vec->entries[i].len = lex_len;
- vec->entries[i].pos = datalen;
-
- memcpy(STRPTR(vec) + datalen, lexeme, lex_len);
-
- datalen += lex_len;
-
- if (i > 0 && WordEntryCMP(&vec->entries[i],
- &vec->entries[i - 1],
- STRPTR(vec)) <= 0)
+ if (prev_lexeme && tsCompareString(lexeme, lex_len,
+ prev_lexeme, prev_lex_len, false) <= 0)
needSort = true;
- /* Receive positions */
+ lexeme_out = tsvector_addlexeme(vec, i, &datalen, lexeme,
+ lex_len, NULL, npos);
if (npos > 0)
{
- uint16 j;
- WordEntryPos *wepptr;
-
- /*
- * Pad to 2-byte alignment if necessary. Though we used palloc0
- * for the initial allocation, subsequent repalloc'd memory areas
- * are not initialized to zero.
- */
- if (datalen != SHORTALIGN(datalen))
- {
- *(STRPTR(vec) + datalen) = '\0';
- datalen = SHORTALIGN(datalen);
- }
+ WordEntryPos *wepptr;
+ int j;
- memcpy(STRPTR(vec) + datalen, &npos, sizeof(uint16));
-
- wepptr = POSDATAPTR(vec, &vec->entries[i]);
+ wepptr = POSDATAPTR(lexeme_out, lex_len);
for (j = 0; j < npos; j++)
{
wepptr[j] = (WordEntryPos) pq_getmsgint(buf, sizeof(WordEntryPos));
if (j > 0 && WEP_GETPOS(wepptr[j]) <= WEP_GETPOS(wepptr[j - 1]))
elog(ERROR, "position information is misordered");
}
-
- datalen += (npos + 1) * sizeof(WordEntry);
}
+
+ prev_lexeme = lexeme;
+ prev_lex_len = lex_len;
}
SET_VARSIZE(vec, hdrlen + datalen);
if (needSort)
- qsort_arg((void *) ARRPTR(vec), vec->size, sizeof(WordEntry),
- compareentry, (void *) STRPTR(vec));
+ qsort_arg((void *) ARRPTR(vec), TS_COUNT(vec), sizeof(WordEntry),
+ compareentry, (void *) vec);
PG_RETURN_TSVECTOR(vec);
}
diff --git a/src/backend/utils/adt/tsvector_op.c b/src/backend/utils/adt/tsvector_op.c
index 822520299e..9f53aae357 100644
--- a/src/backend/utils/adt/tsvector_op.c
+++ b/src/backend/utils/adt/tsvector_op.c
@@ -33,10 +33,10 @@
typedef struct
{
- WordEntry *arrb;
- WordEntry *arre;
- char *values;
- char *operand;
+ TSVector vec;
+ int bidx;
+ int eidx;
+ char *operand;
} CHKVAL;
@@ -71,7 +71,7 @@ static Datum tsvector_update_trigger(PG_FUNCTION_ARGS, bool config_column);
static int tsvector_bsearch(const TSVector tsv, char *lexeme, int lexeme_len);
/*
- * Order: haspos, len, word, for all positions (pos, weight)
+ * Order: npos, len, word, for all positions (pos, weight)
*/
static int
silly_cmp_tsvector(const TSVector a, const TSVector b)
@@ -80,9 +80,9 @@ silly_cmp_tsvector(const TSVector a, const TSVector b)
return -1;
else if (VARSIZE(a) > VARSIZE(b))
return 1;
- else if (a->size < b->size)
+ else if (TS_COUNT(a) < TS_COUNT(b))
return -1;
- else if (a->size > b->size)
+ else if (TS_COUNT(a) > TS_COUNT(b))
return 1;
else
{
@@ -90,28 +90,40 @@ silly_cmp_tsvector(const TSVector a, const TSVector b)
WordEntry *bptr = ARRPTR(b);
int i = 0;
int res;
+ uint32 pos1,
+ pos2;
+ INITPOS(pos1);
+ INITPOS(pos2);
- for (i = 0; i < a->size; i++)
+ for (i = 0; i < TS_COUNT(a); i++)
{
- if (aptr->haspos != bptr->haspos)
- {
- return (aptr->haspos > bptr->haspos) ? -1 : 1;
- }
- else if ((res = tsCompareString(STRPTR(a) + aptr->pos, aptr->len, STRPTR(b) + bptr->pos, bptr->len, false)) != 0)
+ char *lex1 = STRPTR(a) + pos1,
+ *lex2 = STRPTR(b) + pos2;
+ int npos1 = ENTRY_NPOS(a, aptr),
+ npos2 = ENTRY_NPOS(b, bptr);
+ int len1 = ENTRY_LEN(a, aptr),
+ len2 = ENTRY_LEN(b, bptr);
+
+ if ((npos1 == 0 || npos2 == 0) && npos1 != npos2)
+ return npos1 > npos2? -1 : 1;
+ else if ((res = tsCompareString(lex1, len1, lex2, len2, false)) != 0)
{
return res;
}
- else if (aptr->haspos)
+ else if (npos1 > 0)
{
- WordEntryPos *ap = POSDATAPTR(a, aptr);
- WordEntryPos *bp = POSDATAPTR(b, bptr);
+ WordEntryPos *ap,
+ *bp;
int j;
- if (POSDATALEN(a, aptr) != POSDATALEN(b, bptr))
- return (POSDATALEN(a, aptr) > POSDATALEN(b, bptr)) ? -1 : 1;
+ ap = POSDATAPTR(lex1, len1);
+ bp = POSDATAPTR(lex2, len2);
+
+ if (npos1 != npos2)
+ return (npos1 > npos2) ? -1 : 1;
- for (j = 0; j < POSDATALEN(a, aptr); j++)
+ for (j = 0; j < npos1; j++)
{
if (WEP_GETPOS(*ap) != WEP_GETPOS(*bp))
{
@@ -125,8 +137,8 @@ silly_cmp_tsvector(const TSVector a, const TSVector b)
}
}
- aptr++;
- bptr++;
+ INCRPTR(a, aptr, pos1);
+ INCRPTR(b, bptr, pos2);
}
}
@@ -161,27 +173,29 @@ tsvector_strip(PG_FUNCTION_ARGS)
TSVector in = PG_GETARG_TSVECTOR(0);
TSVector out;
int i,
+ count,
+ posout = 0,
+ pos,
len = 0;
- WordEntry *arrin = ARRPTR(in),
- *arrout;
- char *cur;
+ WordEntry *entryin = ARRPTR(in);
- for (i = 0; i < in->size; i++)
- len += arrin[i].len;
+ count = TS_COUNT(in);
+ for (i = 0; i < count; i++)
+ INCRSIZE(len, i, ENTRY_LEN(in, ARRPTR(in) + i), 0);
- len = CALCDATASIZE(in->size, len);
+ len = CALCDATASIZE(count, len);
out = (TSVector) palloc0(len);
SET_VARSIZE(out, len);
- out->size = in->size;
- arrout = ARRPTR(out);
- cur = STRPTR(out);
- for (i = 0; i < in->size; i++)
+ TS_SETCOUNT(out, count);
+
+ INITPOS(pos);
+ for (i = 0; i < count; i++)
{
- memcpy(cur, STRPTR(in) + arrin[i].pos, arrin[i].len);
- arrout[i].haspos = 0;
- arrout[i].len = arrin[i].len;
- arrout[i].pos = cur - STRPTR(out);
- cur += arrout[i].len;
+ tsvector_addlexeme(out, i, &posout,
+ STRPTR(in) + pos, ENTRY_LEN(in, entryin),
+ NULL, 0);
+
+ INCRPTR(in, entryin, pos);
}
PG_FREE_IF_COPY(in, 0);
@@ -192,7 +206,7 @@ Datum
tsvector_length(PG_FUNCTION_ARGS)
{
TSVector in = PG_GETARG_TSVECTOR(0);
- int32 ret = in->size;
+ int32 ret = TS_COUNT(in);
PG_FREE_IF_COPY(in, 0);
PG_RETURN_INT32(ret);
@@ -204,11 +218,10 @@ tsvector_setweight(PG_FUNCTION_ARGS)
TSVector in = PG_GETARG_TSVECTOR(0);
char cw = PG_GETARG_CHAR(1);
TSVector out;
- int i,
- j;
- WordEntry *entry;
- WordEntryPos *p;
+ int i;
+ WordEntry *weptr;
int w = 0;
+ uint32 pos;
switch (cw)
{
@@ -235,20 +248,21 @@ tsvector_setweight(PG_FUNCTION_ARGS)
out = (TSVector) palloc(VARSIZE(in));
memcpy(out, in, VARSIZE(in));
- entry = ARRPTR(out);
- i = out->size;
- while (i--)
+ weptr = ARRPTR(out);
+
+ INITPOS(pos);
+ for (i = 0; i < TS_COUNT(out); i++)
{
- if ((j = POSDATALEN(out, entry)) != 0)
+ int j,
+ npos = ENTRY_NPOS(out, weptr);
+
+ if (npos)
{
- p = POSDATAPTR(out, entry);
- while (j--)
- {
- WEP_SETWEIGHT(*p, w);
- p++;
- }
+ WordEntryPos *p = POSDATAPTR(STRPTR(out) + pos, ENTRY_LEN(out, weptr));
+ for (j = 0; j < npos; j++)
+ WEP_SETWEIGHT(p[j], w);
}
- entry++;
+ INCRPTR(out, weptr, pos);
}
PG_FREE_IF_COPY(in, 0);
@@ -269,10 +283,8 @@ tsvector_setweight_by_filter(PG_FUNCTION_ARGS)
TSVector tsout;
int i,
- j,
nlexemes,
weight;
- WordEntry *entry;
Datum *dlexemes;
bool *nulls;
@@ -301,8 +313,6 @@ tsvector_setweight_by_filter(PG_FUNCTION_ARGS)
tsout = (TSVector) palloc(VARSIZE(tsin));
memcpy(tsout, tsin, VARSIZE(tsin));
- entry = ARRPTR(tsout);
-
deconstruct_array(lexemes, TEXTOID, -1, false, 'i',
&dlexemes, &nulls, &nlexemes);
@@ -315,7 +325,8 @@ tsvector_setweight_by_filter(PG_FUNCTION_ARGS)
{
char *lex;
int lex_len,
- lex_pos;
+ lex_idx,
+ npos;
if (nulls[i])
ereport(ERROR,
@@ -324,17 +335,19 @@ tsvector_setweight_by_filter(PG_FUNCTION_ARGS)
lex = VARDATA(dlexemes[i]);
lex_len = VARSIZE(dlexemes[i]) - VARHDRSZ;
- lex_pos = tsvector_bsearch(tsout, lex, lex_len);
+ lex_idx = tsvector_bsearch(tsin, lex, lex_len);
+ npos = ENTRY_NPOS(tsin, ARRPTR(tsout) + lex_idx);
- if (lex_pos >= 0 && (j = POSDATALEN(tsout, entry + lex_pos)) != 0)
+ if (lex_idx >= 0 && npos > 0)
{
- WordEntryPos *p = POSDATAPTR(tsout, entry + lex_pos);
+ int j;
+ WordEntry *we;
+ char *lexeme = tsvector_getlexeme(tsout, lex_idx, &we);
- while (j--)
- {
- WEP_SETWEIGHT(*p, weight);
- p++;
- }
+ WordEntryPos *p = POSDATAPTR(lexeme, we->len);
+
+ for (j = 0; j < npos; j++)
+ WEP_SETWEIGHT(p[j], weight);
}
}
@@ -354,34 +367,27 @@ tsvector_setweight_by_filter(PG_FUNCTION_ARGS)
* Return the number added (might be less than expected due to overflow)
*/
static int32
-add_pos(TSVector src, WordEntry *srcptr,
- TSVector dest, WordEntry *destptr,
+add_pos(char *src, WordEntry *srcptr,
+ WordEntryPos *dest, int from,
int32 maxpos)
{
- uint16 *clen = &_POSVECPTR(dest, destptr)->npos;
+ uint16 clen = from;
int i;
- uint16 slen = POSDATALEN(src, srcptr),
- startlen;
- WordEntryPos *spos = POSDATAPTR(src, srcptr),
- *dpos = POSDATAPTR(dest, destptr);
+ uint16 slen = srcptr->npos;
+ WordEntryPos *spos = POSDATAPTR(src, srcptr->len);
- if (!destptr->haspos)
- *clen = 0;
-
- startlen = *clen;
+ Assert(!srcptr->hasoff);
for (i = 0;
- i < slen && *clen < MAXNUMPOS &&
- (*clen == 0 || WEP_GETPOS(dpos[*clen - 1]) != MAXENTRYPOS - 1);
+ i < slen && clen < MAXNUMPOS &&
+ (clen == 0 || WEP_GETPOS(dest[clen - 1]) != MAXENTRYPOS - 1);
i++)
{
- WEP_SETWEIGHT(dpos[*clen], WEP_GETWEIGHT(spos[i]));
- WEP_SETPOS(dpos[*clen], LIMITPOS(WEP_GETPOS(spos[i]) + maxpos));
- (*clen)++;
+ WEP_SETWEIGHT(dest[clen], WEP_GETWEIGHT(spos[i]));
+ WEP_SETPOS(dest[clen], LIMITPOS(WEP_GETPOS(spos[i]) + maxpos));
+ clen++;
}
- if (*clen != startlen)
- destptr->haspos = 1;
- return *clen - startlen;
+ return clen - from;
}
/*
@@ -392,20 +398,20 @@ add_pos(TSVector src, WordEntry *srcptr,
static int
tsvector_bsearch(const TSVector tsv, char *lexeme, int lexeme_len)
{
- WordEntry *arrin = ARRPTR(tsv);
int StopLow = 0,
- StopHigh = tsv->size,
+ StopHigh = TS_COUNT(tsv),
StopMiddle,
cmp;
while (StopLow < StopHigh)
{
- StopMiddle = (StopLow + StopHigh) / 2;
+ WordEntry *entry = NULL;
+ char *str;
+ StopMiddle = (StopLow + StopHigh) / 2;
+ str = tsvector_getlexeme(tsv, StopMiddle, &entry);
cmp = tsCompareString(lexeme, lexeme_len,
- STRPTR(tsv) + arrin[StopMiddle].pos,
- arrin[StopMiddle].len,
- false);
+ str, entry->len, false);
if (cmp < 0)
StopHigh = StopMiddle;
@@ -460,14 +466,12 @@ tsvector_delete_by_indices(TSVector tsv, int *indices_to_delete,
int indices_count)
{
TSVector tsout;
- WordEntry *arrin = ARRPTR(tsv),
- *arrout;
- char *data = STRPTR(tsv),
- *dataout;
- int i, /* index in arrin */
- j, /* index in arrout */
+ WordEntry *ptr = ARRPTR(tsv);
+ int i, /* index in input tsvector */
+ j, /* index in output tsvector */
k, /* index in indices_to_delete */
- curoff; /* index in dataout area */
+ curoff = 0, /* index in data area of output */
+ pos;
/*
* Sort the filter array to simplify membership checks below. Also, get
@@ -495,16 +499,18 @@ tsvector_delete_by_indices(TSVector tsv, int *indices_to_delete,
tsout = (TSVector) palloc0(VARSIZE(tsv));
/* This count must be correct because STRPTR(tsout) relies on it. */
- tsout->size = tsv->size - indices_count;
+ TS_SETCOUNT(tsout, TS_COUNT(tsv) - indices_count);
/*
* Copy tsv to tsout, skipping lexemes listed in indices_to_delete.
*/
- arrout = ARRPTR(tsout);
- dataout = STRPTR(tsout);
- curoff = 0;
- for (i = j = k = 0; i < tsv->size; i++)
+
+ INITPOS(pos);
+ for (i = j = k = 0; i < TS_COUNT(tsv); i++)
{
+ char *lex = STRPTR(tsv) + pos;
+ int lex_len = ENTRY_LEN(tsv, ptr);
+
/*
* If current i is present in indices_to_delete, skip this lexeme.
* Since indices_to_delete is already sorted, we only need to check
@@ -513,28 +519,14 @@ tsvector_delete_by_indices(TSVector tsv, int *indices_to_delete,
if (k < indices_count && i == indices_to_delete[k])
{
k++;
- continue;
+ goto next;
}
- /* Copy lexeme and its positions and weights */
- memcpy(dataout + curoff, data + arrin[i].pos, arrin[i].len);
- arrout[j].haspos = arrin[i].haspos;
- arrout[j].len = arrin[i].len;
- arrout[j].pos = curoff;
- curoff += arrin[i].len;
- if (arrin[i].haspos)
- {
- int len = POSDATALEN(tsv, arrin + i) * sizeof(WordEntryPos)
- + sizeof(uint16);
-
- curoff = SHORTALIGN(curoff);
- memcpy(dataout + curoff,
- STRPTR(tsv) + SHORTALIGN(arrin[i].pos + arrin[i].len),
- len);
- curoff += len;
- }
+ tsvector_addlexeme(tsout, j++, &curoff, lex, lex_len,
+ POSDATAPTR(lex, lex_len), ENTRY_NPOS(tsv, ptr));
- j++;
+next:
+ INCRPTR(tsv, ptr, pos);
}
/*
@@ -543,8 +535,7 @@ tsvector_delete_by_indices(TSVector tsv, int *indices_to_delete,
* estimation of tsout's size is wrong.
*/
Assert(k == indices_count);
-
- SET_VARSIZE(tsout, CALCDATASIZE(tsout->size, curoff));
+ SET_VARSIZE(tsout, CALCDATASIZE(TS_COUNT(tsout), curoff));
return tsout;
}
@@ -635,8 +626,9 @@ tsvector_delete_arr(PG_FUNCTION_ARGS)
Datum
tsvector_unnest(PG_FUNCTION_ARGS)
{
- FuncCallContext *funcctx;
- TSVector tsin;
+ FuncCallContext *funcctx;
+ TSVector tsin;
+ uint32 pos;
if (SRF_IS_FIRSTCALL())
{
@@ -655,31 +647,33 @@ tsvector_unnest(PG_FUNCTION_ARGS)
TEXTARRAYOID, -1, 0);
funcctx->tuple_desc = BlessTupleDesc(tupdesc);
- funcctx->user_fctx = PG_GETARG_TSVECTOR_COPY(0);
+ INITPOS(pos);
+ funcctx->user_fctx = list_make2(PG_GETARG_TSVECTOR(0), makeInteger(pos));
MemoryContextSwitchTo(oldcontext);
}
funcctx = SRF_PERCALL_SETUP();
- tsin = (TSVector) funcctx->user_fctx;
+ tsin = (TSVector) linitial(funcctx->user_fctx);
+ pos = intVal(lsecond(funcctx->user_fctx));
- if (funcctx->call_cntr < tsin->size)
+ if (funcctx->call_cntr < TS_COUNT(tsin))
{
- WordEntry *arrin = ARRPTR(tsin);
+ WordEntry *entry = ARRPTR(tsin) + funcctx->call_cntr;
char *data = STRPTR(tsin);
HeapTuple tuple;
int j,
- i = funcctx->call_cntr;
+ npos = ENTRY_NPOS(tsin, entry),
+ lex_len = ENTRY_LEN(tsin, entry);
bool nulls[] = {false, false, false};
Datum values[3];
values[0] = PointerGetDatum(
- cstring_to_text_with_len(data + arrin[i].pos, arrin[i].len)
- );
+ cstring_to_text_with_len(data + pos, lex_len));
- if (arrin[i].haspos)
+ if (npos)
{
- WordEntryPosVector *posv;
+ WordEntryPos *apos = POSDATAPTR(data + pos, lex_len);
Datum *positions;
Datum *weights;
char weight;
@@ -689,28 +683,28 @@ tsvector_unnest(PG_FUNCTION_ARGS)
* uint16 (2 bits for weight, 14 for position). Here we extract
* that in two separate arrays.
*/
- posv = _POSVECPTR(tsin, arrin + i);
- positions = palloc(posv->npos * sizeof(Datum));
- weights = palloc(posv->npos * sizeof(Datum));
- for (j = 0; j < posv->npos; j++)
+ positions = palloc(npos * sizeof(Datum));
+ weights = palloc(npos * sizeof(Datum));
+ for (j = 0; j < npos; j++)
{
- positions[j] = Int16GetDatum(WEP_GETPOS(posv->pos[j]));
- weight = 'D' - WEP_GETWEIGHT(posv->pos[j]);
+ positions[j] = Int16GetDatum(WEP_GETPOS(apos[j]));
+ weight = 'D' - WEP_GETWEIGHT(apos[j]);
weights[j] = PointerGetDatum(
cstring_to_text_with_len(&weight, 1)
);
}
values[1] = PointerGetDatum(
- construct_array(positions, posv->npos, INT2OID, 2, true, 's'));
+ construct_array(positions, npos, INT2OID, 2, true, 's'));
values[2] = PointerGetDatum(
- construct_array(weights, posv->npos, TEXTOID, -1, false, 'i'));
+ construct_array(weights, npos, TEXTOID, -1, false, 'i'));
}
else
{
nulls[1] = nulls[2] = true;
}
+ INCRPTR(tsin, entry, intVal(lsecond(funcctx->user_fctx)));
tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls);
SRF_RETURN_NEXT(funcctx, HeapTupleGetDatum(tuple));
}
@@ -728,21 +722,23 @@ Datum
tsvector_to_array(PG_FUNCTION_ARGS)
{
TSVector tsin = PG_GETARG_TSVECTOR(0);
- WordEntry *arrin = ARRPTR(tsin);
+ WordEntry *entry = ARRPTR(tsin);
Datum *elements;
int i;
ArrayType *array;
+ long pos;
- elements = palloc(tsin->size * sizeof(Datum));
+ elements = palloc(TS_COUNT(tsin) * sizeof(Datum));
- for (i = 0; i < tsin->size; i++)
+ INITPOS(pos);
+ for (i = 0; i < TS_COUNT(tsin); i++)
{
elements[i] = PointerGetDatum(
- cstring_to_text_with_len(STRPTR(tsin) + arrin[i].pos, arrin[i].len)
- );
+ cstring_to_text_with_len(STRPTR(tsin) + pos, ENTRY_LEN(tsin, entry)));
+ INCRPTR(tsin, entry, pos);
}
- array = construct_array(elements, tsin->size, TEXTOID, -1, false, 'i');
+ array = construct_array(elements, TS_COUNT(tsin), TEXTOID, -1, false, 'i');
pfree(elements);
PG_FREE_IF_COPY(tsin, 0);
@@ -750,6 +746,124 @@ tsvector_to_array(PG_FUNCTION_ARGS)
}
/*
+ * Returns offset by given index in TSVector,
+ * this function used when we need random access
+ */
+int
+tsvector_getoffset(TSVector vec, int idx, WordEntry **we)
+{
+ int offset = 0;
+ WordEntry *entry;
+
+ entry = ARRPTR(vec) + idx;
+ if (we)
+ *we = entry;
+
+ while (!entry->hasoff)
+ {
+ entry--;
+ if (!entry->hasoff)
+ offset += SHORTALIGN(entry->len) + entry->npos * sizeof(WordEntryPos);
+ }
+
+ Assert(entry >= ARRPTR(vec));
+
+ if (idx % TS_OFFSET_STRIDE)
+ {
+ /* if idx is by offset */
+ WordEntry *offset_entry = (WordEntry *) (STRPTR(vec) + entry->offset);
+
+ offset += entry->offset + sizeof(WordEntry);
+ offset += SHORTALIGN(offset_entry->len) + offset_entry->npos * sizeof(WordEntryPos);
+ }
+ else
+ {
+ Assert(entry == ARRPTR(vec) + idx);
+
+ if (we)
+ *we = (WordEntry *) (STRPTR(vec) + entry->offset);
+ offset = entry->offset + sizeof(WordEntry);
+ }
+
+ return offset;
+}
+
+/*
+ * Add lexeme and its positions to tsvector and move dataoff (offset where
+ * data should be added) to new position.
+ * Returns pointer to lexeme start
+ */
+char *
+tsvector_addlexeme(TSVector tsv, int idx, int *dataoff,
+ char *lexeme, int lexeme_len, WordEntryPos *pos, int npos)
+{
+ int stroff;
+ WordEntry *entry;
+ char *result;
+
+ /* when idx is 0, dataoff should be 0 too, and otherwise */
+ Assert(!((idx == 0) ^ (*dataoff == 0)));
+
+ stroff = *dataoff;
+ entry = ARRPTR(tsv) + idx;
+
+ if (idx % TS_OFFSET_STRIDE == 0)
+ {
+ /* WordEntry with offset */
+ WordEntry offentry;
+
+ stroff = INTALIGN(stroff);
+ entry->hasoff = 1;
+ entry->offset = stroff;
+
+ /* fill WordEntry for offset */
+ offentry.hasoff = 0;
+ offentry.len = lexeme_len;
+ offentry.npos = npos;
+ memcpy(STRPTR(tsv) + stroff, &offentry, sizeof(WordEntry));
+ stroff += sizeof(WordEntry);
+ }
+ else
+ {
+ stroff = SHORTALIGN(stroff); \
+ entry->hasoff = 0;
+ entry->len = lexeme_len;
+ entry->npos = npos;
+ }
+
+ memcpy(STRPTR(tsv) + stroff, lexeme, lexeme_len);
+ result = STRPTR(tsv) + stroff;
+ stroff += lexeme_len;
+
+ if (npos)
+ {
+ if (npos > 0xFFFF)
+ elog(ERROR, "positions array too long");
+
+ /*
+ * Pad to 2-byte alignment if necessary. We don't know how memory was
+ * allocated, so in case of aligning we need to make sure that unused
+ * is zero.
+ */
+ if (stroff != SHORTALIGN(stroff))
+ {
+ *(STRPTR(tsv) + stroff) = '\0';
+ stroff = SHORTALIGN(stroff);
+ }
+
+ /* Copy positions */
+ if (pos)
+ memcpy(STRPTR(tsv) + stroff, pos, npos * sizeof(WordEntryPos));
+
+ stroff += npos * sizeof(WordEntryPos);
+ }
+
+ *dataoff = stroff;
+
+ return result;
+}
+
+/*
* Build tsvector from array of lexemes.
*/
Datum
@@ -758,14 +872,13 @@ array_to_tsvector(PG_FUNCTION_ARGS)
ArrayType *v = PG_GETARG_ARRAYTYPE_P(0);
TSVector tsout;
Datum *dlexemes;
- WordEntry *arrout;
bool *nulls;
int nitems,
i,
j,
tslen,
+ cur = 0,
datalen = 0;
- char *cur;
deconstruct_array(v, TEXTOID, -1, false, 'i', &dlexemes, &nulls, &nitems);
@@ -793,26 +906,23 @@ array_to_tsvector(PG_FUNCTION_ARGS)
/* Calculate space needed for surviving lexemes. */
for (i = 0; i < nitems; i++)
- datalen += VARSIZE(dlexemes[i]) - VARHDRSZ;
+ {
+ int lex_len = VARSIZE(dlexemes[i]) - VARHDRSZ;
+ INCRSIZE(datalen, i, lex_len, 0);
+ }
tslen = CALCDATASIZE(nitems, datalen);
/* Allocate and fill tsvector. */
tsout = (TSVector) palloc0(tslen);
SET_VARSIZE(tsout, tslen);
- tsout->size = nitems;
+ TS_SETCOUNT(tsout, nitems);
- arrout = ARRPTR(tsout);
- cur = STRPTR(tsout);
for (i = 0; i < nitems; i++)
{
char *lex = VARDATA(dlexemes[i]);
int lex_len = VARSIZE(dlexemes[i]) - VARHDRSZ;
- memcpy(cur, lex, lex_len);
- arrout[i].haspos = 0;
- arrout[i].len = lex_len;
- arrout[i].pos = cur - STRPTR(tsout);
- cur += lex_len;
+ tsvector_addlexeme(tsout, i, &cur, lex, lex_len, NULL, 0);
}
PG_FREE_IF_COPY(v, 0);
@@ -828,17 +938,16 @@ tsvector_filter(PG_FUNCTION_ARGS)
TSVector tsin = PG_GETARG_TSVECTOR(0),
tsout;
ArrayType *weights = PG_GETARG_ARRAYTYPE_P(1);
- WordEntry *arrin = ARRPTR(tsin),
- *arrout;
- char *datain = STRPTR(tsin),
- *dataout;
+ char *dataout;
Datum *dweights;
bool *nulls;
int nweights;
int i,
- j;
- int cur_pos = 0;
+ j,
+ dataoff = 0,
+ pos;
char mask = 0;
+ WordEntry *ptr = ARRPTR(tsin);
deconstruct_array(weights, CHAROID, 1, true, 'c',
&dweights, &nulls, &nweights);
@@ -879,109 +988,112 @@ tsvector_filter(PG_FUNCTION_ARGS)
}
tsout = (TSVector) palloc0(VARSIZE(tsin));
- tsout->size = tsin->size;
- arrout = ARRPTR(tsout);
+ TS_SETCOUNT(tsout, TS_COUNT(tsin));
dataout = STRPTR(tsout);
- for (i = j = 0; i < tsin->size; i++)
+ INITPOS(pos);
+ for (i = j = 0; i < TS_COUNT(tsin); i++)
{
- WordEntryPosVector *posvin,
- *posvout;
- int npos = 0;
- int k;
-
- if (!arrin[i].haspos)
- continue;
-
- posvin = _POSVECPTR(tsin, arrin + i);
- posvout = (WordEntryPosVector *)
- (dataout + SHORTALIGN(cur_pos + arrin[i].len));
-
- for (k = 0; k < posvin->npos; k++)
+ WordEntryPos *posin,
+ *posout;
+ int k,
+ npos = 0,
+ lex_len = ENTRY_LEN(tsin, ptr);
+ char *lex = STRPTR(tsin) + pos,
+ *lexout;
+
+ posin = POSDATAPTR(lex, lex_len);
+ for (k = 0; k < ENTRY_NPOS(tsin, ptr); k++)
{
- if (mask & (1 << WEP_GETWEIGHT(posvin->pos[k])))
- posvout->pos[npos++] = posvin->pos[k];
+ if (mask & (1 << WEP_GETWEIGHT(posin[k])))
+ npos++;
}
- /* if no satisfactory positions found, skip lexeme */
if (!npos)
- continue;
+ goto next;
- arrout[j].haspos = true;
- arrout[j].len = arrin[i].len;
- arrout[j].pos = cur_pos;
+ lexout = tsvector_addlexeme(tsout, j++, &dataoff, lex, lex_len,
+ NULL, npos);
+ posout = POSDATAPTR(lexout, lex_len);
+ npos = 0;
+ for (k = 0; k < ENTRY_NPOS(tsin, ptr); k++)
+ {
+ if (mask & (1 << WEP_GETWEIGHT(posin[k])))
+ posout[npos++] = posin[k];
+ }
- memcpy(dataout + cur_pos, datain + arrin[i].pos, arrin[i].len);
- posvout->npos = npos;
- cur_pos += SHORTALIGN(arrin[i].len);
- cur_pos += POSDATALEN(tsout, arrout + j) * sizeof(WordEntryPos) +
- sizeof(uint16);
- j++;
+next:
+ INCRPTR(tsin, ptr, pos);
}
- tsout->size = j;
+ TS_SETCOUNT(tsout, j);
if (dataout != STRPTR(tsout))
- memmove(STRPTR(tsout), dataout, cur_pos);
+ memmove(STRPTR(tsout), dataout, dataoff);
- SET_VARSIZE(tsout, CALCDATASIZE(tsout->size, cur_pos));
+ SET_VARSIZE(tsout, CALCDATASIZE(TS_COUNT(tsout), dataoff));
PG_FREE_IF_COPY(tsin, 0);
PG_RETURN_POINTER(tsout);
}
+/* Get max position in in1; we'll need this to offset in2's positions */
+static int
+get_maxpos(TSVector tsv)
+{
+ int i,
+ j,
+ maxpos = 0;
+ WordEntry *ptr = ARRPTR(tsv);
+ uint32 pos;
+ WordEntryPos *apos;
+
+ INITPOS(pos);
+ for (i = 0; i < TS_COUNT(tsv); i++)
+ {
+ apos = POSDATAPTR(STRPTR(tsv) + pos, ENTRY_LEN(tsv, ptr));
+ for (j = 0; j < ENTRY_NPOS(tsv, ptr); j++)
+ {
+ if (WEP_GETPOS(apos[j]) > maxpos)
+ maxpos = WEP_GETPOS(apos[j]);
+ }
+
+ INCRPTR(tsv, ptr, pos);
+ }
+
+ return maxpos;
+}
+
Datum
tsvector_concat(PG_FUNCTION_ARGS)
{
- TSVector in1 = PG_GETARG_TSVECTOR(0);
- TSVector in2 = PG_GETARG_TSVECTOR(1);
- TSVector out;
- WordEntry *ptr;
- WordEntry *ptr1,
+ TSVector in1 = PG_GETARG_TSVECTOR(0),
+ in2 = PG_GETARG_TSVECTOR(1),
+ out;
+ WordEntry *ptr,
+ *ptr1,
*ptr2;
- WordEntryPos *p;
int maxpos = 0,
i,
- j,
i1,
i2,
- dataoff,
output_bytes,
- output_size;
- char *data,
- *data1,
- *data2;
-
- /* Get max position in in1; we'll need this to offset in2's positions */
- ptr = ARRPTR(in1);
- i = in1->size;
- while (i--)
- {
- if ((j = POSDATALEN(in1, ptr)) != 0)
- {
- p = POSDATAPTR(in1, ptr);
- while (j--)
- {
- if (WEP_GETPOS(*p) > maxpos)
- maxpos = WEP_GETPOS(*p);
- p++;
- }
- }
- ptr++;
- }
+ pos1,
+ pos2,
+ dataoff;
+ char *data;
ptr1 = ARRPTR(in1);
ptr2 = ARRPTR(in2);
- data1 = STRPTR(in1);
- data2 = STRPTR(in2);
- i1 = in1->size;
- i2 = in2->size;
+ i1 = TS_COUNT(in1);
+ i2 = TS_COUNT(in2);
/*
* Conservative estimate of space needed. We might need all the data in
- * both inputs, and conceivably add a pad byte before position data for
- * each item where there was none before.
+ * both inputs, and conceivably add a pad bytes before lexeme and position
+ * data, and pad bytes before WordEntry for offset entry.
*/
- output_bytes = VARSIZE(in1) + VARSIZE(in2) + i1 + i2;
+ output_bytes = VARSIZE(in1) + VARSIZE(in2) + i1 * 2 + i2 * 2;
+ output_bytes += 4 * (i1 + i2) / TS_OFFSET_STRIDE;
out = (TSVector) palloc0(output_bytes);
SET_VARSIZE(out, output_bytes);
@@ -990,91 +1102,110 @@ tsvector_concat(PG_FUNCTION_ARGS)
* We must make out->size valid so that STRPTR(out) is sensible. We'll
* collapse out any unused space at the end.
*/
- out->size = in1->size + in2->size;
+ TS_SETCOUNT(out, i1 + i2);
- ptr = ARRPTR(out);
+ ptr = NULL;
data = STRPTR(out);
+ i = 0;
dataoff = 0;
+
+ INITPOS(pos1);
+ INITPOS(pos2);
+
+ /*
+ * we will need max position from first tsvector to add it positions of
+ * second tsvector
+ */
+ maxpos = get_maxpos(in1);
+
while (i1 && i2)
{
- int cmp = compareEntry(data1, ptr1, data2, ptr2);
+ char *lex = STRPTR(in1) + pos1,
+ *lex2 = STRPTR(in2) + pos2;
+
+ int lex_len = ENTRY_LEN(in1, ptr1),
+ lex2_len = ENTRY_LEN(in2, ptr2);
+
+ int cmp = tsCompareString(lex, lex_len, lex2, lex2_len, false);
if (cmp < 0)
{ /* in1 first */
- ptr->haspos = ptr1->haspos;
- ptr->len = ptr1->len;
- memcpy(data + dataoff, data1 + ptr1->pos, ptr1->len);
- ptr->pos = dataoff;
- dataoff += ptr1->len;
- if (ptr->haspos)
- {
- dataoff = SHORTALIGN(dataoff);
- memcpy(data + dataoff, _POSVECPTR(in1, ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16));
- dataoff += POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16);
- }
+ tsvector_addlexeme(out, i, &dataoff,
+ lex, lex_len,
+ POSDATAPTR(lex, lex_len), ENTRY_NPOS(in1, ptr1));
- ptr++;
- ptr1++;
+ INCRPTR(in1, ptr1, pos1);
i1--;
+ i++;
}
else if (cmp > 0)
{ /* in2 first */
- ptr->haspos = ptr2->haspos;
- ptr->len = ptr2->len;
- memcpy(data + dataoff, data2 + ptr2->pos, ptr2->len);
- ptr->pos = dataoff;
- dataoff += ptr2->len;
- if (ptr->haspos)
+ char *new_lex;
+ WordEntry *we = UNWRAP_ENTRY(in2, ptr2);
+
+ new_lex = tsvector_addlexeme(out, i, &dataoff, lex2, lex2_len, NULL, 0);
+ if (we->npos > 0)
{
- int addlen = add_pos(in2, ptr2, out, ptr, maxpos);
+ int addlen;
+ WordEntryPos *apos = POSDATAPTR(new_lex, lex2_len);
- if (addlen == 0)
- ptr->haspos = 0;
- else
+ addlen = add_pos(lex2, we, apos, 0, maxpos);
+ if (addlen > 0)
{
+ ptr = UNWRAP_ENTRY(out, ARRPTR(out) + i);
+ ptr->npos = addlen;
dataoff = SHORTALIGN(dataoff);
- dataoff += addlen * sizeof(WordEntryPos) + sizeof(uint16);
+ dataoff += ptr->npos * sizeof(WordEntryPos);
}
}
- ptr++;
- ptr2++;
+ INCRPTR(in2, ptr2, pos2);
+ i++;
i2--;
}
else
{
- ptr->haspos = ptr1->haspos | ptr2->haspos;
- ptr->len = ptr1->len;
- memcpy(data + dataoff, data1 + ptr1->pos, ptr1->len);
- ptr->pos = dataoff;
- dataoff += ptr1->len;
- if (ptr->haspos)
+ char *new_lex;
+ int npos1 = ENTRY_NPOS(in1, ptr1),
+ npos2 = ENTRY_NPOS(in2, ptr2);
+ WordEntryPos *apos;
+
+ new_lex = tsvector_addlexeme(out, i, &dataoff, lex, lex_len, NULL, 0);
+ apos = POSDATAPTR(new_lex, lex_len);
+
+ if (npos1 || npos2)
{
- if (ptr1->haspos)
- {
- dataoff = SHORTALIGN(dataoff);
- memcpy(data + dataoff, _POSVECPTR(in1, ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16));
- dataoff += POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16);
- if (ptr2->haspos)
- dataoff += add_pos(in2, ptr2, out, ptr, maxpos) * sizeof(WordEntryPos);
- }
- else /* must have ptr2->haspos */
+ int addlen;
+ char *lex2 = STRPTR(in2) + pos2;
+
+ ptr = UNWRAP_ENTRY(out, ARRPTR(out) + i);
+ if (npos1)
{
- int addlen = add_pos(in2, ptr2, out, ptr, maxpos);
+ /* add positions from left tsvector */
+ addlen = add_pos(lex, UNWRAP_ENTRY(in1, ptr1), apos, 0, 0);
+ ptr->npos = addlen;
- if (addlen == 0)
- ptr->haspos = 0;
- else
+ if (npos2)
{
- dataoff = SHORTALIGN(dataoff);
- dataoff += addlen * sizeof(WordEntryPos) + sizeof(uint16);
+ /* add positions from right right tsvector */
+ addlen = add_pos(lex2, UNWRAP_ENTRY(in2, ptr2), apos, addlen, maxpos);
+ ptr->npos += addlen;
}
}
+ else /* npos in second should be > 0 */
+ {
+ /* add positions from right tsvector */
+ addlen = add_pos(lex2, UNWRAP_ENTRY(in2, ptr2), apos, 0, maxpos);
+ ptr->npos = addlen;
+ }
+
+ dataoff = SHORTALIGN(dataoff);
+ dataoff += ptr->npos * sizeof(WordEntryPos);
}
- ptr++;
- ptr1++;
- ptr2++;
+ INCRPTR(in1, ptr1, pos1);
+ INCRPTR(in2, ptr2, pos2);
+ i++;
i1--;
i2--;
}
@@ -1082,45 +1213,43 @@ tsvector_concat(PG_FUNCTION_ARGS)
while (i1)
{
- ptr->haspos = ptr1->haspos;
- ptr->len = ptr1->len;
- memcpy(data + dataoff, data1 + ptr1->pos, ptr1->len);
- ptr->pos = dataoff;
- dataoff += ptr1->len;
- if (ptr->haspos)
- {
- dataoff = SHORTALIGN(dataoff);
- memcpy(data + dataoff, _POSVECPTR(in1, ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16));
- dataoff += POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16);
- }
+ char *lex = STRPTR(in1) + pos1;
+ int lex_len = ENTRY_LEN(in1, ptr1);
- ptr++;
- ptr1++;
+ tsvector_addlexeme(out, i, &dataoff,
+ lex, lex_len,
+ POSDATAPTR(lex, lex_len), ENTRY_NPOS(in1, ptr1));
+
+ INCRPTR(in1, ptr1, pos1);
+ i++;
i1--;
}
while (i2)
{
- ptr->haspos = ptr2->haspos;
- ptr->len = ptr2->len;
- memcpy(data + dataoff, data2 + ptr2->pos, ptr2->len);
- ptr->pos = dataoff;
- dataoff += ptr2->len;
- if (ptr->haspos)
+ char *lex = STRPTR(in2) + pos2,
+ *new_lex;
+ int lex_len = ENTRY_LEN(in2, ptr2),
+ npos = ENTRY_NPOS(in2, ptr2);
+
+ new_lex = tsvector_addlexeme(out, i, &dataoff, lex, lex_len, NULL, 0);
+ if (npos > 0)
{
- int addlen = add_pos(in2, ptr2, out, ptr, maxpos);
+ int addlen;
+ WordEntryPos *apos = POSDATAPTR(new_lex, lex_len);
- if (addlen == 0)
- ptr->haspos = 0;
- else
+ addlen = add_pos(lex, UNWRAP_ENTRY(in2, ptr2), apos, 0, maxpos);
+ if (addlen > 0)
{
+ WordEntry *ptr = UNWRAP_ENTRY(out, ARRPTR(out) + i);
+ ptr->npos = addlen;
dataoff = SHORTALIGN(dataoff);
- dataoff += addlen * sizeof(WordEntryPos) + sizeof(uint16);
+ dataoff += npos * sizeof(WordEntryPos);
}
}
- ptr++;
- ptr2++;
+ INCRPTR(in2, ptr2, pos2);
+ i++;
i2--;
}
@@ -1137,12 +1266,10 @@ tsvector_concat(PG_FUNCTION_ARGS)
* Adjust sizes (asserting that we didn't overrun the original estimates)
* and collapse out any unused array entries.
*/
- output_size = ptr - ARRPTR(out);
- Assert(output_size <= out->size);
- out->size = output_size;
+ TS_SETCOUNT(out, i);
if (data != STRPTR(out))
memmove(STRPTR(out), data, dataoff);
- output_bytes = CALCDATASIZE(out->size, dataoff);
+ output_bytes = CALCDATASIZE(TS_COUNT(out), dataoff);
Assert(output_bytes <= VARSIZE(out));
SET_VARSIZE(out, output_bytes);
@@ -1194,35 +1321,26 @@ tsCompareString(char *a, int lena, char *b, int lenb, bool prefix)
* Check weight info or/and fill 'data' with the required positions
*/
static bool
-checkclass_str(CHKVAL *chkval, WordEntry *entry, QueryOperand *val,
+checkclass_str(WordEntryPos *pv, int npos, QueryOperand *val,
ExecPhraseData *data)
{
bool result = false;
- if (entry->haspos && (val->weight || data))
+ if (npos && (val->weight || data))
{
- WordEntryPosVector *posvec;
-
- /*
- * We can't use the _POSVECPTR macro here because the pointer to the
- * tsvector's lexeme storage is already contained in chkval->values.
- */
- posvec = (WordEntryPosVector *)
- (chkval->values + SHORTALIGN(entry->pos + entry->len));
-
if (val->weight && data)
{
- WordEntryPos *posvec_iter = posvec->pos;
+ WordEntryPos *posvec_iter = pv;
WordEntryPos *dptr;
/*
* Filter position information by weights
*/
- dptr = data->pos = palloc(sizeof(WordEntryPos) * posvec->npos);
+ dptr = data->pos = palloc(sizeof(WordEntryPos) * npos);
data->allocated = true;
/* Is there a position with a matching weight? */
- while (posvec_iter < posvec->pos + posvec->npos)
+ while (posvec_iter < (pv + npos))
{
/* If true, append this position to the data->pos */
if (val->weight & (1 << WEP_GETWEIGHT(*posvec_iter)))
@@ -1241,10 +1359,10 @@ checkclass_str(CHKVAL *chkval, WordEntry *entry, QueryOperand *val,
}
else if (val->weight)
{
- WordEntryPos *posvec_iter = posvec->pos;
+ WordEntryPos *posvec_iter = pv;
/* Is there a position with a matching weight? */
- while (posvec_iter < posvec->pos + posvec->npos)
+ while (posvec_iter < (pv + npos))
{
if (val->weight & (1 << WEP_GETWEIGHT(*posvec_iter)))
{
@@ -1257,8 +1375,8 @@ checkclass_str(CHKVAL *chkval, WordEntry *entry, QueryOperand *val,
}
else /* data != NULL */
{
- data->npos = posvec->npos;
- data->pos = posvec->pos;
+ data->npos = npos;
+ data->pos = pv;
data->allocated = false;
result = true;
}
@@ -1311,26 +1429,32 @@ static bool
checkcondition_str(void *checkval, QueryOperand *val, ExecPhraseData *data)
{
CHKVAL *chkval = (CHKVAL *) checkval;
- WordEntry *StopLow = chkval->arrb;
- WordEntry *StopHigh = chkval->arre;
- WordEntry *StopMiddle = StopHigh;
+ int StopLow = chkval->bidx;
+ int StopHigh = chkval->eidx;
+ int StopMiddle = StopHigh;
int difference = -1;
bool res = false;
+ char *lexeme;
+ WordEntry *entry;
/* Loop invariant: StopLow <= val < StopHigh */
while (StopLow < StopHigh)
{
StopMiddle = StopLow + (StopHigh - StopLow) / 2;
+ lexeme = tsvector_getlexeme(chkval->vec, StopMiddle, &entry);
+
+ Assert(!entry->hasoff);
difference = tsCompareString(chkval->operand + val->distance,
val->length,
- chkval->values + StopMiddle->pos,
- StopMiddle->len,
+ lexeme,
+ entry->len,
false);
if (difference == 0)
{
/* Check weight info & fill 'data' with positions */
- res = checkclass_str(chkval, StopMiddle, val, data);
+ res = checkclass_str(POSDATAPTR(lexeme, entry->len),
+ entry->npos, val, data);
break;
}
else if (difference > 0)
@@ -1352,19 +1476,31 @@ checkcondition_str(void *checkval, QueryOperand *val, ExecPhraseData *data)
if (StopLow >= StopHigh)
StopMiddle = StopHigh;
- while ((!res || data) && StopMiddle < chkval->arre &&
- tsCompareString(chkval->operand + val->distance,
- val->length,
- chkval->values + StopMiddle->pos,
- StopMiddle->len,
- true) == 0)
+ while ((!res || data) && StopMiddle < chkval->eidx)
{
+ char *lexeme;
+ int cmp;
+ WordEntryPos *pv;
+
+ lexeme = tsvector_getlexeme(chkval->vec, StopMiddle, &entry);
+
+ Assert(!entry->hasoff);
+ pv = POSDATAPTR(lexeme, entry->len);
+ cmp = tsCompareString(chkval->operand + val->distance,
+ val->length,
+ lexeme,
+ entry->len,
+ true);
+
+ if (cmp != 0)
+ break;
+
if (data)
{
/*
* We need to join position information
*/
- res = checkclass_str(chkval, StopMiddle, val, data);
+ res = checkclass_str(pv, entry->npos, val, data);
if (res)
{
@@ -1388,7 +1524,7 @@ checkcondition_str(void *checkval, QueryOperand *val, ExecPhraseData *data)
}
else
{
- res = checkclass_str(chkval, StopMiddle, val, NULL);
+ res = checkclass_str(pv, entry->npos, val, NULL);
}
StopMiddle++;
@@ -1935,9 +2071,9 @@ ts_match_vq(PG_FUNCTION_ARGS)
PG_RETURN_BOOL(false);
}
- chkval.arrb = ARRPTR(val);
- chkval.arre = chkval.arrb + val->size;
- chkval.values = STRPTR(val);
+ chkval.bidx = 0;
+ chkval.eidx = TS_COUNT(val);
+ chkval.vec = val;
chkval.operand = GETOPERAND(query);
result = TS_execute(GETQUERY(query),
&chkval,
@@ -2001,12 +2137,15 @@ ts_match_tq(PG_FUNCTION_ARGS)
* that have a weight equal to one of the weights in 'weight' bitmask.
*/
static int
-check_weight(TSVector txt, WordEntry *wptr, int8 weight)
+check_weight(char *lexeme, WordEntry *wptr, int8 weight)
{
- int len = POSDATALEN(txt, wptr);
- int num = 0;
- WordEntryPos *ptr = POSDATAPTR(txt, wptr);
+ int len;
+ int num = 0;
+ WordEntryPos *ptr;
+ Assert(!wptr->hasoff);
+ len = wptr->len;
+ ptr = POSDATAPTR(lexeme, len);
while (len--)
{
if (weight & (1 << WEP_GETWEIGHT(*ptr)))
@@ -2017,31 +2156,34 @@ check_weight(TSVector txt, WordEntry *wptr, int8 weight)
}
#define compareStatWord(a,e,t) \
- tsCompareString((a)->lexeme, (a)->lenlexeme, \
- STRPTR(t) + (e)->pos, (e)->len, \
- false)
+ (tsCompareString((a)->lexeme, (a)->lenlexeme, \
+ t, (e)->len, false))
static void
insertStatEntry(MemoryContext persistentContext, TSVectorStat *stat, TSVector txt, uint32 off)
{
- WordEntry *we = ARRPTR(txt) + off;
+ WordEntry *we;
StatEntry *node = stat->root,
*pnode = NULL;
int n,
res = 0;
uint32 depth = 1;
+ char *lexeme;
+
+ lexeme = tsvector_getlexeme(txt, off, &we);
+ Assert(!we->hasoff);
if (stat->weight == 0)
- n = (we->haspos) ? POSDATALEN(txt, we) : 1;
+ n = (we->npos) ? we->npos : 1;
else
- n = (we->haspos) ? check_weight(txt, we, stat->weight) : 0;
+ n = (we->npos) ? check_weight(lexeme, we, stat->weight) : 0;
if (n == 0)
return; /* nothing to insert */
while (node)
{
- res = compareStatWord(node, we, txt);
+ res = compareStatWord(node, we, lexeme);
if (res == 0)
{
@@ -2065,7 +2207,7 @@ insertStatEntry(MemoryContext persistentContext, TSVectorStat *stat, TSVector tx
node->ndoc = 1;
node->nentry = n;
node->lenlexeme = we->len;
- memcpy(node->lexeme, STRPTR(txt) + we->pos, node->lenlexeme);
+ memcpy(node->lexeme, lexeme, node->lenlexeme);
if (pnode == NULL)
{
@@ -2092,13 +2234,14 @@ chooseNextStatEntry(MemoryContext persistentContext, TSVectorStat *stat, TSVecto
uint32 low, uint32 high, uint32 offset)
{
uint32 pos;
- uint32 middle = (low + high) >> 1;
+ uint32 middle = (low + high) >> 1,
+ count = TS_COUNT(txt);
pos = (low + middle) >> 1;
- if (low != middle && pos >= offset && pos - offset < txt->size)
+ if (low != middle && pos >= offset && pos - offset < count)
insertStatEntry(persistentContext, stat, txt, pos - offset);
pos = (high + middle + 1) >> 1;
- if (middle + 1 != high && pos >= offset && pos - offset < txt->size)
+ if (middle + 1 != high && pos >= offset && pos - offset < count)
insertStatEntry(persistentContext, stat, txt, pos - offset);
if (low != middle)
@@ -2125,7 +2268,8 @@ ts_accum(MemoryContext persistentContext, TSVectorStat *stat, Datum data)
TSVector txt = DatumGetTSVector(data);
uint32 i,
nbit = 0,
- offset;
+ offset,
+ count = TS_COUNT(txt);
if (stat == NULL)
{ /* Init in first */
@@ -2134,19 +2278,19 @@ ts_accum(MemoryContext persistentContext, TSVectorStat *stat, Datum data)
}
/* simple check of correctness */
- if (txt == NULL || txt->size == 0)
+ if (txt == NULL || count == 0)
{
if (txt && txt != (TSVector) DatumGetPointer(data))
pfree(txt);
return stat;
}
- i = txt->size - 1;
+ i = count - 1;
for (; i > 0; i >>= 1)
nbit++;
nbit = 1 << nbit;
- offset = (nbit - txt->size) / 2;
+ offset = (nbit - count) / 2;
insertStatEntry(persistentContext, stat, txt, (nbit >> 1) - offset);
chooseNextStatEntry(persistentContext, stat, txt, 0, nbit, offset);
@@ -2579,15 +2723,28 @@ tsvector_update_trigger(PG_FUNCTION_ARGS, bool config_column)
}
/* make tsvector value */
- datum = TSVectorGetDatum(make_tsvector(&prs));
- isnull = false;
-
- /* and insert it into tuple */
- rettuple = heap_modify_tuple_by_cols(rettuple, rel->rd_att,
- 1, &tsvector_attr_num,
- &datum, &isnull);
-
- pfree(DatumGetPointer(datum));
+ if (prs.curwords)
+ {
+ datum = PointerGetDatum(make_tsvector(&prs));
+ isnull = false;
+ rettuple = heap_modify_tuple_by_cols(rettuple, rel->rd_att,
+ 1, &tsvector_attr_num,
+ &datum, &isnull);
+ pfree(DatumGetPointer(datum));
+ }
+ else
+ {
+ TSVector out = palloc(CALCDATASIZE(0, 0));
+
+ SET_VARSIZE(out, CALCDATASIZE(0, 0));
+ TS_SETCOUNT(out, 0);
+ datum = PointerGetDatum(out);
+ isnull = false;
+ rettuple = heap_modify_tuple_by_cols(rettuple, rel->rd_att,
+ 1, &tsvector_attr_num,
+ &datum, &isnull);
+ pfree(prs.words);
+ }
return PointerGetDatum(rettuple);
}
diff --git a/src/include/tsearch/ts_type.h b/src/include/tsearch/ts_type.h
index 30d7c4bccd..47aa498432 100644
--- a/src/include/tsearch/ts_type.h
+++ b/src/include/tsearch/ts_type.h
@@ -24,30 +24,38 @@
* 2) int32 size - number of lexemes (WordEntry array entries)
* 3) Array of WordEntry - one per lexeme; must be sorted according to
* tsCompareString() (ie, memcmp of lexeme strings).
- * WordEntry->pos gives the number of bytes from end of WordEntry
- * array to start of lexeme's string, which is of length len.
+ * WordEntry have two types: offset or metadata (length of lexeme and number
+ * of positions). If it has offset then metadata will be by this offset.
* 4) Per-lexeme data storage:
- * lexeme string (not null-terminated)
- * if haspos is true:
+ * [4-byte aligned WordEntry] (if its WordEntry has offset)
+ * 2-byte aligned lexeme string (not null-terminated)
+ * if it has positions:
* padding byte if necessary to make the position data 2-byte aligned
- * uint16 number of positions that follow
* WordEntryPos[] positions
*
* The positions for each lexeme must be sorted.
*
- * Note, tsvectorsend/recv believe that sizeof(WordEntry) == 4
+ * Note, tsvector functions believe that sizeof(WordEntry) == 4
*/
-typedef struct
+#define TS_OFFSET_STRIDE 4
+
+typedef union
{
- uint32
- haspos:1,
- len:11, /* MAX 2Kb */
- pos:20; /* MAX 1Mb */
+ struct {
+ uint32 hasoff: 1,
+ offset: 31;
+ };
+ struct {
+ uint32 hasoff_: 1,
+ len:11,
+ npos: 16,
+ _unused: 4;
+ };
} WordEntry;
#define MAXSTRLEN ( (1<<11) - 1)
-#define MAXSTRPOS ( (1<<20) - 1)
+#define MAXSTRPOS ( (1<<30) - 1)
extern int compareWordEntryPos(const void *a, const void *b);
@@ -62,19 +70,6 @@ extern int compareWordEntryPos(const void *a, const void *b);
typedef uint16 WordEntryPos;
-typedef struct
-{
- uint16 npos;
- WordEntryPos pos[FLEXIBLE_ARRAY_MEMBER];
-} WordEntryPosVector;
-
-/* WordEntryPosVector with exactly 1 entry */
-typedef struct
-{
- uint16 npos;
- WordEntryPos pos[1];
-} WordEntryPosVector1;
-
#define WEP_GETWEIGHT(x) ( (x) >> 14 )
#define WEP_GETPOS(x) ( (x) & 0x3fff )
@@ -90,13 +85,17 @@ typedef struct
typedef struct
{
int32 vl_len_; /* varlena header (do not touch directly!) */
- int32 size;
+ int32 size_; /* flags and lexemes count */
WordEntry entries[FLEXIBLE_ARRAY_MEMBER];
/* lexemes follow the entries[] array */
} TSVectorData;
typedef TSVectorData *TSVector;
+#define TS_FLAG_STRETCHED 0x80000000
+#define TS_COUNT(t) ((t)->size_ & 0x0FFFFFFF)
+#define TS_SETCOUNT(t,c) ((t)->size_ = (c) | TS_FLAG_STRETCHED)
+
#define DATAHDRSIZE (offsetof(TSVectorData, entries))
#define CALCDATASIZE(nentries, lenstr) (DATAHDRSIZE + (nentries) * sizeof(WordEntry) + (lenstr) )
@@ -104,24 +103,65 @@ typedef TSVectorData *TSVector;
#define ARRPTR(x) ( (x)->entries )
/* pointer to start of a tsvector's lexeme storage */
-#define STRPTR(x) ( (char *) &(x)->entries[(x)->size] )
+#define STRPTR(x) ( (char *) &(x)->entries[TS_COUNT(x)] )
-#define _POSVECPTR(x, e) ((WordEntryPosVector *)(STRPTR(x) + SHORTALIGN((e)->pos + (e)->len)))
-#define POSDATALEN(x,e) ( ( (e)->haspos ) ? (_POSVECPTR(x,e)->npos) : 0 )
-#define POSDATAPTR(x,e) (_POSVECPTR(x,e)->pos)
+/* for WordEntry with offset return its WordEntry with other properties */
+#define UNWRAP_ENTRY(x,we) \
+ ((we)->hasoff? (WordEntry *)(STRPTR(x) + (we)->offset): (we))
+
+/*
+ * helpers used when we're not sure that WordEntry
+ * contains ether offset or len
+ */
+#define ENTRY_NPOS(x,we) (UNWRAP_ENTRY(x,we)->npos)
+#define ENTRY_LEN(x,we) (UNWRAP_ENTRY(x,we)->len)
+
+/* pointer to start of positions */
+#define POSDATAPTR(lex, len) ((WordEntryPos *) (lex + SHORTALIGN(len)))
+
+/* set default offset in tsvector data */
+#define INITPOS(p) ((p) = sizeof(WordEntry))
+
+/* increment entry and offset by given WordEntry */
+#define INCRPTR(x,w,p) \
+do { \
+ WordEntry *y = (w); \
+ if ((w)->hasoff) \
+ { \
+ y = (WordEntry *) (STRPTR(x) + (w)->offset); \
+ (p) = (w)->offset + sizeof(WordEntry); \
+ } \
+ (w)++; \
+ Assert(!y->hasoff); \
+ (p) += SHORTALIGN(y->len) + y->npos * sizeof(WordEntryPos); \
+ if ((w) - ARRPTR(x) < TS_COUNT(x) && w->hasoff) \
+ (p) = INTALIGN(p) + sizeof(WordEntry); \
+} while (0);
+
+/* used to calculate tsvector size in in tsvector constructors */
+#define INCRSIZE(s,i,l,n) /* size,index,len,npos */ \
+do { \
+ if ((i) % TS_OFFSET_STRIDE == 0) \
+ (s) = INTALIGN(s) + sizeof(WordEntry); \
+ else \
+ (s) = SHORTALIGN(s); \
+ (s) += (l); \
+ (s) = (n)? SHORTALIGN(s) + (n) * sizeof(WordEntryPos) : (s); \
+} while (0);
/*
* fmgr interface macros
*/
-#define DatumGetTSVector(X) ((TSVector) PG_DETOAST_DATUM(X))
-#define DatumGetTSVectorCopy(X) ((TSVector) PG_DETOAST_DATUM_COPY(X))
+TSVector tsvector_upgrade(Datum orig, bool copy);
+
+#define DatumGetTSVector(X) tsvector_upgrade((X), false)
+#define DatumGetTSVectorCopy(X) tsvector_upgrade((X), true)
#define TSVectorGetDatum(X) PointerGetDatum(X)
#define PG_GETARG_TSVECTOR(n) DatumGetTSVector(PG_GETARG_DATUM(n))
#define PG_GETARG_TSVECTOR_COPY(n) DatumGetTSVectorCopy(PG_GETARG_DATUM(n))
#define PG_RETURN_TSVECTOR(x) return TSVectorGetDatum(x)
-
/*
* TSQuery
*
@@ -239,4 +279,22 @@ typedef TSQueryData *TSQuery;
#define PG_GETARG_TSQUERY_COPY(n) DatumGetTSQueryCopy(PG_GETARG_DATUM(n))
#define PG_RETURN_TSQUERY(x) return TSQueryGetDatum(x)
+int tsvector_getoffset(TSVector vec, int idx, WordEntry **we);
+char *tsvector_addlexeme(TSVector tsv, int idx, int *dataoff,
+ char *lexeme, int lexeme_len, WordEntryPos *pos, int npos);
+
+/* Returns lexeme and its entry by given index from TSVector */
+inline static char *
+tsvector_getlexeme(TSVector vec, int idx, WordEntry **we)
+{
+ Assert(idx >=0 && idx < TS_COUNT(vec));
+
+ /*
+ * we do not allow we == NULL because returned lexeme is not \0 ended,
+ * and always should be used with we->len
+ */
+ Assert(we != NULL);
+ return STRPTR(vec) + tsvector_getoffset(vec, idx, we);
+}
+
#endif /* _PG_TSTYPE_H_ */
On Tue, Aug 1, 2017 at 10:08 AM, Ildus Kurbangaliev
<i.kurbangaliev@postgrespro.ru> wrote:
Historically tsvector type can't hold more than 1MB data.
I want to propose a patch that removes that limit.That limit is created by 'pos' field from WordEntry, which have only
20 bits for storage.In the proposed patch I removed this field and instead of it I keep
offsets only at each Nth item in WordEntry's array.
So this would break pg_upgrade for tsvector columns?
--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
On Tue, 1 Aug 2017 14:56:54 -0400
Robert Haas <robertmhaas@gmail.com> wrote:
On Tue, Aug 1, 2017 at 10:08 AM, Ildus Kurbangaliev
<i.kurbangaliev@postgrespro.ru> wrote:Historically tsvector type can't hold more than 1MB data.
I want to propose a patch that removes that limit.That limit is created by 'pos' field from WordEntry, which have only
20 bits for storage.In the proposed patch I removed this field and instead of it I keep
offsets only at each Nth item in WordEntry's array.So this would break pg_upgrade for tsvector columns?
I added a function that will convert old tsvectors on the fly. It's the
approach used in hstore before.
Regards,
Ildus Kurbangaliev
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
On Tue, Aug 1, 2017 at 3:10 PM, Ildus K <i.kurbangaliev@postgrespro.ru> wrote:
So this would break pg_upgrade for tsvector columns?
I added a function that will convert old tsvectors on the fly. It's the
approach used in hstore before.
Does that mean the answer to the question that I asked is "yes, but I
have a workaround" or does it mean that the answer is "no"?
--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
On Tue, 1 Aug 2017 15:33:08 -0400
Robert Haas <robertmhaas@gmail.com> wrote:
On Tue, Aug 1, 2017 at 3:10 PM, Ildus K
<i.kurbangaliev@postgrespro.ru> wrote:So this would break pg_upgrade for tsvector columns?
I added a function that will convert old tsvectors on the fly. It's
the approach used in hstore before.Does that mean the answer to the question that I asked is "yes, but I
have a workaround" or does it mean that the answer is "no"?
It's a workaround. DatumGetTSVector and
DatumGetTSVectorCopy will upgrade tsvector on the fly if it
has old format.
Regards,
Ildus Kurbangaliev
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
On 01.08.2017 22:00, Ildus K wrote:
On Tue, 1 Aug 2017 15:33:08 -0400
Robert Haas <robertmhaas@gmail.com> wrote:On Tue, Aug 1, 2017 at 3:10 PM, Ildus K
<i.kurbangaliev@postgrespro.ru> wrote:So this would break pg_upgrade for tsvector columns?
I added a function that will convert old tsvectors on the fly. It's
the approach used in hstore before.Does that mean the answer to the question that I asked is "yes, but I
have a workaround" or does it mean that the answer is "no"?It's a workaround. DatumGetTSVector and
DatumGetTSVectorCopy will upgrade tsvector on the fly if it
has old format.
I'm not familiar with pg_upgrade, but want to ask: should this
workaround be part of pg_upgrade?
Greetings,
Torsten
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
On Wed, 9 Aug 2017 09:01:44 +0200
Torsten Zuehlsdorff <mailinglists@toco-domains.de> wrote:
On 01.08.2017 22:00, Ildus K wrote:
On Tue, 1 Aug 2017 15:33:08 -0400
Robert Haas <robertmhaas@gmail.com> wrote:On Tue, Aug 1, 2017 at 3:10 PM, Ildus K
<i.kurbangaliev@postgrespro.ru> wrote:So this would break pg_upgrade for tsvector columns?
I added a function that will convert old tsvectors on the fly.
It's the approach used in hstore before.Does that mean the answer to the question that I asked is "yes,
but I have a workaround" or does it mean that the answer is "no"?It's a workaround. DatumGetTSVector and
DatumGetTSVectorCopy will upgrade tsvector on the fly if it
has old format.I'm not familiar with pg_upgrade, but want to ask: should this
workaround be part of pg_upgrade?Greetings,
Torsten
I chose the way when the data remains the same, until the user decides
to update it. I'm not so familiar with pg_upgrade myself and I don't
see now how the data will be converted with it, but it will anyway
increase downtime which is the shorter the better.
--
---
Ildus Kurbangaliev
Postgres Professional: http://www.postgrespro.com
Russian Postgres Company
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
On Tue, Aug 1, 2017 at 4:00 PM, Ildus K <i.kurbangaliev@postgrespro.ru> wrote:
It's a workaround. DatumGetTSVector and
DatumGetTSVectorCopy will upgrade tsvector on the fly if it
has old format.
Hmm, that seems like a real fix, not just a workaround. If you can
transparently read the old format, there's no problem. Not sure about
performance, though.
The patch doesn't really conform to our coding standards, though, so
you need to clean it up (or, if you're not sure what you need to do,
you need to have someone who knows how PostgreSQL code needs to look
review it for you).
--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
On Wed, Aug 9, 2017 at 6:38 PM, Robert Haas <robertmhaas@gmail.com> wrote:
The patch doesn't really conform to our coding standards, though, so
you need to clean it up (or, if you're not sure what you need to do,
you need to have someone who knows how PostgreSQL code needs to look
review it for you).
The documentation has a couple of rules for coding conventions:
https://www.postgresql.org/docs/9.6/static/source.html
--
Michael
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
On Thu, Aug 10, 2017 at 7:37 AM, Michael Paquier <michael.paquier@gmail.com>
wrote:
On Wed, Aug 9, 2017 at 6:38 PM, Robert Haas <robertmhaas@gmail.com> wrote:
The patch doesn't really conform to our coding standards, though, so
you need to clean it up (or, if you're not sure what you need to do,
you need to have someone who knows how PostgreSQL code needs to look
review it for you).The documentation has a couple of rules for coding conventions:
https://www.postgresql.org/docs/9.6/static/source.html
+1
Ildus, from the first glance I see at least following violations of
PostgreSQL coding standards in your code.
+/*
+ * Converts tsvector with the old structure to current.
+ * @orig - tsvector to convert,
+ * @copy - return copy of tsvector, it has a meaning when tsvector doensn't
+ * need to be converted.
+ */
This comment will be reflowed by pgindent. Also we don't use '@' for
parameters description in comments.
https://www.postgresql.org/docs/9.6/static/source-format.html
+TSVector
+tsvector_upgrade(Datum orig, bool copy)
+{
+ int i,
+ dataoff = 0,
+ datalen = 0,
+ totallen;
+ TSVector in,
+ out;
You have random mix of tabs and spaces here.
+ {
+ stroff = SHORTALIGN(stroff); \
+ entry->hasoff = 0;
+ entry->len = lexeme_len;
+ entry->npos = npos;
+ }
What this backslash is doing here?
There are other similar (and probably different) violations of coding
standard over the code. Ildus, please check you patches carefully before
publishing.
------
Alexander Korotkov
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company
On Wed, Aug 9, 2017 at 7:38 PM, Robert Haas <robertmhaas@gmail.com> wrote:
On Tue, Aug 1, 2017 at 4:00 PM, Ildus K <i.kurbangaliev@postgrespro.ru>
wrote:It's a workaround. DatumGetTSVector and
DatumGetTSVectorCopy will upgrade tsvector on the fly if it
has old format.Hmm, that seems like a real fix, not just a workaround. If you can
transparently read the old format, there's no problem. Not sure about
performance, though.
+1
Ildus, I think we need to benchmark reading of the old format. There would
be tradeoff between performance of old format reading and amount of extra
code needed. Once we will have benchmarks we can consider whether this is
the solution we would like to buy.
------
Alexander Korotkov
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company
Alexander Korotkov <a.korotkov@postgrespro.ru> writes:
...
You have random mix of tabs and spaces here.
It's worth running pgindent over your code before submitting. It should
be pretty easy to set that up nowadays, see src/tools/pgindent/README.
(If you find any portability problems while trying to install pgindent,
please let me know.)
regards, tom lane
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
On Thu, 10 Aug 2017 11:46:55 -0400
Tom Lane <tgl@sss.pgh.pa.us> wrote:
Alexander Korotkov <a.korotkov@postgrespro.ru> writes:
...
You have random mix of tabs and spaces here.It's worth running pgindent over your code before submitting. It
should be pretty easy to set that up nowadays, see
src/tools/pgindent/README. (If you find any portability problems
while trying to install pgindent, please let me know.)
Attached a new version of the patch. It mostly contains cosmetic
changes. I rebased it to current master, ran pgindent and fixed
formatting errors.
--
---
Ildus Kurbangaliev
Postgres Professional: http://www.postgrespro.com
Russian Postgres Company
Attachments:
tsvector_mixed_positions_v2.patchtext/x-patchDownload
diff --git a/src/backend/tsearch/Makefile b/src/backend/tsearch/Makefile
index 34fe4c5b3c..9585a25003 100644
--- a/src/backend/tsearch/Makefile
+++ b/src/backend/tsearch/Makefile
@@ -26,7 +26,7 @@ DICTFILES_PATH=$(addprefix dicts/,$(DICTFILES))
OBJS = ts_locale.o ts_parse.o wparser.o wparser_def.o dict.o \
dict_simple.o dict_synonym.o dict_thesaurus.o \
dict_ispell.o regis.o spell.o \
- to_tsany.o ts_selfuncs.o ts_typanalyze.o ts_utils.o
+ to_tsany.o ts_selfuncs.o ts_typanalyze.o ts_utils.o ts_compat.o
include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/tsearch/to_tsany.c b/src/backend/tsearch/to_tsany.c
index 35d9ab276c..aa87fd8a04 100644
--- a/src/backend/tsearch/to_tsany.c
+++ b/src/backend/tsearch/to_tsany.c
@@ -156,13 +156,10 @@ TSVector
make_tsvector(ParsedText *prs)
{
int i,
- j,
lenstr = 0,
- totallen;
+ totallen,
+ stroff = 0;
TSVector in;
- WordEntry *ptr;
- char *str;
- int stroff;
/* Merge duplicate words */
if (prs->curwords > 0)
@@ -171,12 +168,9 @@ make_tsvector(ParsedText *prs)
/* Determine space needed */
for (i = 0; i < prs->curwords; i++)
{
- lenstr += prs->words[i].len;
- if (prs->words[i].alen)
- {
- lenstr = SHORTALIGN(lenstr);
- lenstr += sizeof(uint16) + prs->words[i].pos.apos[0] * sizeof(WordEntryPos);
- }
+ int npos = prs->words[i].alen ? prs->words[i].pos.apos[0] : 0;
+
+ INCRSIZE(lenstr, i, prs->words[i].len, npos);
}
if (lenstr > MAXSTRPOS)
@@ -187,41 +181,21 @@ make_tsvector(ParsedText *prs)
totallen = CALCDATASIZE(prs->curwords, lenstr);
in = (TSVector) palloc0(totallen);
SET_VARSIZE(in, totallen);
- in->size = prs->curwords;
+ TS_SETCOUNT(in, prs->curwords);
- ptr = ARRPTR(in);
- str = STRPTR(in);
- stroff = 0;
for (i = 0; i < prs->curwords; i++)
{
- ptr->len = prs->words[i].len;
- ptr->pos = stroff;
- memcpy(str + stroff, prs->words[i].word, prs->words[i].len);
- stroff += prs->words[i].len;
- pfree(prs->words[i].word);
+ int npos = 0;
+
if (prs->words[i].alen)
- {
- int k = prs->words[i].pos.apos[0];
- WordEntryPos *wptr;
+ npos = prs->words[i].pos.apos[0];
- if (k > 0xFFFF)
- elog(ERROR, "positions array too long");
+ tsvector_addlexeme(in, i, &stroff, prs->words[i].word, prs->words[i].len,
+ prs->words[i].pos.apos + 1, npos);
- ptr->haspos = 1;
- stroff = SHORTALIGN(stroff);
- *(uint16 *) (str + stroff) = (uint16) k;
- wptr = POSDATAPTR(in, ptr);
- for (j = 0; j < k; j++)
- {
- WEP_SETWEIGHT(wptr[j], 0);
- WEP_SETPOS(wptr[j], prs->words[i].pos.apos[j + 1]);
- }
- stroff += sizeof(uint16) + k * sizeof(WordEntryPos);
+ pfree(prs->words[i].word);
+ if (prs->words[i].alen)
pfree(prs->words[i].pos.apos);
- }
- else
- ptr->haspos = 0;
- ptr++;
}
if (prs->words)
@@ -251,7 +225,6 @@ to_tsvector_byid(PG_FUNCTION_ARGS)
PG_FREE_IF_COPY(in, 1);
out = make_tsvector(&prs);
-
PG_RETURN_TSVECTOR(out);
}
diff --git a/src/backend/tsearch/ts_compat.c b/src/backend/tsearch/ts_compat.c
new file mode 100644
index 0000000000..bc45109241
--- /dev/null
+++ b/src/backend/tsearch/ts_compat.c
@@ -0,0 +1,84 @@
+#include "postgres.h"
+#include "tsearch/ts_type.h"
+
+/*
+ * Definition of old WordEntry struct in TSVector. Because of limitations
+ * in size (max 1MB for lexemes), the format has changed
+ */
+typedef struct
+{
+ uint32
+ haspos:1,
+ len:11,
+ pos:20;
+} OldWordEntry;
+
+typedef struct
+{
+ uint16 npos;
+ WordEntryPos pos[FLEXIBLE_ARRAY_MEMBER];
+} OldWordEntryPosVector;
+
+#define OLDSTRPTR(x) ( (char *) &(x)->entries[x->size_] )
+#define _OLDPOSVECPTR(x, e) \
+ ((OldWordEntryPosVector *)(STRPTR(x) + SHORTALIGN((e)->pos + (e)->len)))
+#define OLDPOSDATALEN(x,e) ( ( (e)->haspos ) ? (_OLDPOSVECPTR(x,e)->npos) : 0 )
+#define OLDPOSDATAPTR(x,e) (_OLDPOSVECPTR(x,e)->pos)
+
+/*
+ * Converts tsvector with the old structure to current.
+ * Can return copy of tsvector, but it has a meaning when tsvector doensn't
+ * need to be converted.
+ */
+TSVector
+tsvector_upgrade(Datum orig, bool copy)
+{
+ int i,
+ dataoff = 0,
+ datalen = 0,
+ totallen;
+ TSVector in,
+ out;
+
+ in = (TSVector) PG_DETOAST_DATUM(orig);
+
+ /* If already in new format, return as is */
+ if (in->size_ & TS_FLAG_STRETCHED)
+ {
+ TSVector out;
+
+ if (!copy)
+ return in;
+
+ out = (TSVector) palloc(VARSIZE(in));
+ memcpy(out, in, VARSIZE(in));
+ return out;
+ }
+
+ /*
+ * Calculate required size. We don't check any sizes here because old
+ * format was limited with 1MB
+ */
+ for (i = 0; i < in->size_; i++)
+ {
+ OldWordEntry *entry = (OldWordEntry *) (in->entries + i);
+
+ INCRSIZE(datalen, i, entry->len, OLDPOSDATALEN(in, entry));
+ }
+
+ totallen = CALCDATASIZE(in->size_, datalen);
+ out = (TSVector) palloc0(totallen);
+ SET_VARSIZE(out, totallen);
+ TS_SETCOUNT(out, in->size_);
+
+ for (i = 0; i < in->size_; i++)
+ {
+ OldWordEntry *entry = (OldWordEntry *) (in->entries + i);
+
+ tsvector_addlexeme(out, i, &dataoff,
+ OLDSTRPTR(in) + entry->pos, entry->len,
+ OLDPOSDATAPTR(in, entry), OLDPOSDATALEN(in, entry));
+ }
+
+ return out;
+}
diff --git a/src/backend/tsearch/ts_typanalyze.c b/src/backend/tsearch/ts_typanalyze.c
index 320c7f1a61..9b2fc4be04 100644
--- a/src/backend/tsearch/ts_typanalyze.c
+++ b/src/backend/tsearch/ts_typanalyze.c
@@ -202,7 +202,8 @@ compute_tsvector_stats(VacAttrStats *stats,
TSVector vector;
WordEntry *curentryptr;
char *lexemesptr;
- int j;
+ int j,
+ pos;
vacuum_delay_point();
@@ -236,7 +237,9 @@ compute_tsvector_stats(VacAttrStats *stats,
*/
lexemesptr = STRPTR(vector);
curentryptr = ARRPTR(vector);
- for (j = 0; j < vector->size; j++)
+
+ INITPOS(pos);
+ for (j = 0; j < TS_COUNT(vector); j++)
{
bool found;
@@ -246,8 +249,8 @@ compute_tsvector_stats(VacAttrStats *stats,
* make a copy of it. This way we can free the tsvector value
* once we've processed all its lexemes.
*/
- hash_key.lexeme = lexemesptr + curentryptr->pos;
- hash_key.length = curentryptr->len;
+ hash_key.lexeme = lexemesptr + pos;
+ hash_key.length = ENTRY_LEN(vector, curentryptr);
/* Lookup current lexeme in hashtable, adding it if new */
item = (TrackItem *) hash_search(lexemes_tab,
@@ -280,7 +283,7 @@ compute_tsvector_stats(VacAttrStats *stats,
}
/* Advance to the next WordEntry in the tsvector */
- curentryptr++;
+ INCRPTR(vector, curentryptr, pos);
}
/* If the vector was toasted, free the detoasted copy. */
diff --git a/src/backend/utils/adt/tsginidx.c b/src/backend/utils/adt/tsginidx.c
index 83a939dfd5..75a4364b94 100644
--- a/src/backend/utils/adt/tsginidx.c
+++ b/src/backend/utils/adt/tsginidx.c
@@ -67,23 +67,27 @@ gin_extract_tsvector(PG_FUNCTION_ARGS)
TSVector vector = PG_GETARG_TSVECTOR(0);
int32 *nentries = (int32 *) PG_GETARG_POINTER(1);
Datum *entries = NULL;
+ int tscount = TS_COUNT(vector);
- *nentries = vector->size;
- if (vector->size > 0)
+ *nentries = tscount;
+ if (tscount > 0)
{
int i;
+ uint32 pos;
+
WordEntry *we = ARRPTR(vector);
- entries = (Datum *) palloc(sizeof(Datum) * vector->size);
+ entries = (Datum *) palloc(sizeof(Datum) * tscount);
- for (i = 0; i < vector->size; i++)
+ INITPOS(pos);
+ for (i = 0; i < tscount; i++)
{
text *txt;
- txt = cstring_to_text_with_len(STRPTR(vector) + we->pos, we->len);
+ txt = cstring_to_text_with_len(STRPTR(vector) + pos,
+ ENTRY_LEN(vector, we));
entries[i] = PointerGetDatum(txt);
-
- we++;
+ INCRPTR(vector, we, pos);
}
}
diff --git a/src/backend/utils/adt/tsgistidx.c b/src/backend/utils/adt/tsgistidx.c
index 7ce2699b5c..18d3de3725 100644
--- a/src/backend/utils/adt/tsgistidx.c
+++ b/src/backend/utils/adt/tsgistidx.c
@@ -192,28 +192,33 @@ gtsvector_compress(PG_FUNCTION_ARGS)
int32 *arr;
WordEntry *ptr = ARRPTR(val);
char *words = STRPTR(val);
+ const int tscount = TS_COUNT(val);
+ uint32 pos;
- len = CALCGTSIZE(ARRKEY, val->size);
+ len = CALCGTSIZE(ARRKEY, tscount);
res = (SignTSVector *) palloc(len);
SET_VARSIZE(res, len);
res->flag = ARRKEY;
arr = GETARR(res);
- len = val->size;
+ len = tscount;
+
+ INITPOS(pos);
while (len--)
{
pg_crc32 c;
INIT_LEGACY_CRC32(c);
- COMP_LEGACY_CRC32(c, words + ptr->pos, ptr->len);
+ COMP_LEGACY_CRC32(c, words + pos, ENTRY_LEN(val, ptr));
FIN_LEGACY_CRC32(c);
*arr = *(int32 *) &c;
arr++;
- ptr++;
+
+ INCRPTR(val, ptr, pos);
}
- len = uniqueint(GETARR(res), val->size);
- if (len != val->size)
+ len = uniqueint(GETARR(res), tscount);
+ if (len != tscount)
{
/*
* there is a collision of hash-function; len is always less than
diff --git a/src/backend/utils/adt/tsrank.c b/src/backend/utils/adt/tsrank.c
index 4577bcc0b8..cb859d9b47 100644
--- a/src/backend/utils/adt/tsrank.c
+++ b/src/backend/utils/adt/tsrank.c
@@ -53,43 +53,39 @@ word_distance(int32 w)
static int
cnt_length(TSVector t)
{
- WordEntry *ptr = ARRPTR(t),
- *end = (WordEntry *) STRPTR(t);
- int len = 0;
+ int i,
+ len = 0;
- while (ptr < end)
+ for (i = 0; i < TS_COUNT(t); i++)
{
- int clen = POSDATALEN(t, ptr);
-
- if (clen == 0)
- len += 1;
- else
- len += clen;
+ WordEntry *entry = UNWRAP_ENTRY(t, ARRPTR(t) + i);
- ptr++;
+ Assert(!entry->hasoff);
+ len += (entry->npos == 0) ? 1 : entry->npos;
}
return len;
}
-#define WordECompareQueryItem(e,q,p,i,m) \
- tsCompareString((q) + (i)->distance, (i)->length, \
- (e) + (p)->pos, (p)->len, (m))
-
-
/*
* Returns a pointer to a WordEntry's array corresponding to 'item' from
* tsvector 't'. 'q' is the TSQuery containing 'item'.
* Returns NULL if not found.
*/
-static WordEntry *
+static int
find_wordentry(TSVector t, TSQuery q, QueryOperand *item, int32 *nitem)
{
- WordEntry *StopLow = ARRPTR(t);
- WordEntry *StopHigh = (WordEntry *) STRPTR(t);
- WordEntry *StopMiddle = StopHigh;
+#define WordECompareQueryItem(s,l,q,i,m) \
+ tsCompareString((q) + (i)->distance, (i)->length, \
+ s, l, (m))
+
+ int StopLow = 0;
+ int StopHigh = TS_COUNT(t);
+ int StopMiddle = StopHigh;
int difference;
+ char *lexeme;
+ WordEntry *we;
*nitem = 0;
@@ -97,7 +93,12 @@ find_wordentry(TSVector t, TSQuery q, QueryOperand *item, int32 *nitem)
while (StopLow < StopHigh)
{
StopMiddle = StopLow + (StopHigh - StopLow) / 2;
- difference = WordECompareQueryItem(STRPTR(t), GETOPERAND(q), StopMiddle, item, false);
+ lexeme = tsvector_getlexeme(t, StopMiddle, &we);
+
+ Assert(!we->hasoff);
+ difference = WordECompareQueryItem(lexeme, we->len,
+ GETOPERAND(q), item, false);
+
if (difference == 0)
{
StopHigh = StopMiddle;
@@ -117,18 +118,22 @@ find_wordentry(TSVector t, TSQuery q, QueryOperand *item, int32 *nitem)
*nitem = 0;
- while (StopMiddle < (WordEntry *) STRPTR(t) &&
- WordECompareQueryItem(STRPTR(t), GETOPERAND(q), StopMiddle, item, true) == 0)
+ while (StopMiddle < TS_COUNT(t))
{
+ lexeme = tsvector_getlexeme(t, StopMiddle, &we);
+
+ Assert(!we->hasoff);
+ if (WordECompareQueryItem(lexeme, we->len, GETOPERAND(q), item, true) != 0)
+ break;
+
(*nitem)++;
StopMiddle++;
}
}
- return (*nitem > 0) ? StopHigh : NULL;
+ return (*nitem > 0) ? StopHigh : -1;
}
-
/*
* sort QueryOperands by (length, word)
*/
@@ -200,15 +205,13 @@ SortAndUniqItems(TSQuery q, int *size)
static float
calc_rank_and(const float *w, TSVector t, TSQuery q)
{
- WordEntryPosVector **pos;
- WordEntryPosVector1 posnull;
- WordEntryPosVector *POSNULL;
+ WordEntryPos **pos;
+ uint16 *npos;
+ WordEntryPos posnull[1] = {0};
int i,
k,
l,
p;
- WordEntry *entry,
- *firstentry;
WordEntryPos *post,
*ct;
int32 dimt,
@@ -225,41 +228,55 @@ calc_rank_and(const float *w, TSVector t, TSQuery q)
pfree(item);
return calc_rank_or(w, t, q);
}
- pos = (WordEntryPosVector **) palloc0(sizeof(WordEntryPosVector *) * q->size);
+ pos = (WordEntryPos **) palloc0(sizeof(WordEntryPos *) * q->size);
+ npos = (uint16 *) palloc0(sizeof(uint16) * q->size);
- /* A dummy WordEntryPos array to use when haspos is false */
- posnull.npos = 1;
- posnull.pos[0] = 0;
- WEP_SETPOS(posnull.pos[0], MAXENTRYPOS - 1);
- POSNULL = (WordEntryPosVector *) &posnull;
+ /* posnull is a dummy WordEntryPos array to use when npos == 0 */
+ WEP_SETPOS(posnull[0], MAXENTRYPOS - 1);
for (i = 0; i < size; i++)
{
- firstentry = entry = find_wordentry(t, q, item[i], &nitem);
- if (!entry)
+ int idx = find_wordentry(t, q, item[i], &nitem),
+ firstidx;
+
+ if (idx == -1)
continue;
- while (entry - firstentry < nitem)
+ firstidx = idx;
+
+ while (idx - firstidx < nitem)
{
- if (entry->haspos)
- pos[i] = _POSVECPTR(t, entry);
+ WordEntry *entry;
+
+ char *lexeme = tsvector_getlexeme(t, idx, &entry);
+
+ Assert(!entry->hasoff);
+ if (entry->npos)
+ {
+ pos[i] = POSDATAPTR(lexeme, entry->len);
+ npos[i] = entry->npos;
+ }
else
- pos[i] = POSNULL;
+ {
+ pos[i] = posnull;
+ npos[i] = 1;
+ }
+
+ post = pos[i];
+ dimt = npos[i];
- dimt = pos[i]->npos;
- post = pos[i]->pos;
for (k = 0; k < i; k++)
{
if (!pos[k])
continue;
- lenct = pos[k]->npos;
- ct = pos[k]->pos;
+ lenct = npos[k];
+ ct = pos[k];
for (l = 0; l < dimt; l++)
{
for (p = 0; p < lenct; p++)
{
dist = Abs((int) WEP_GETPOS(post[l]) - (int) WEP_GETPOS(ct[p]));
- if (dist || (dist == 0 && (pos[i] == POSNULL || pos[k] == POSNULL)))
+ if (dist || (dist == 0 && (pos[i] == posnull || pos[k] == posnull)))
{
float curw;
@@ -272,10 +289,11 @@ calc_rank_and(const float *w, TSVector t, TSQuery q)
}
}
- entry++;
+ idx++;
}
}
pfree(pos);
+ pfree(npos);
pfree(item);
return res;
}
@@ -283,9 +301,8 @@ calc_rank_and(const float *w, TSVector t, TSQuery q)
static float
calc_rank_or(const float *w, TSVector t, TSQuery q)
{
- WordEntry *entry,
- *firstentry;
- WordEntryPosVector1 posnull;
+ /* A dummy WordEntryPos array to use when lexeme hasn't positions */
+ WordEntryPos posnull[1] = {0};
WordEntryPos *post;
int32 dimt,
j,
@@ -295,33 +312,37 @@ calc_rank_or(const float *w, TSVector t, TSQuery q)
QueryOperand **item;
int size = q->size;
- /* A dummy WordEntryPos array to use when haspos is false */
- posnull.npos = 1;
- posnull.pos[0] = 0;
-
item = SortAndUniqItems(q, &size);
for (i = 0; i < size; i++)
{
+ int idx,
+ firstidx;
float resj,
wjm;
int32 jm;
- firstentry = entry = find_wordentry(t, q, item[i], &nitem);
- if (!entry)
+ idx = find_wordentry(t, q, item[i], &nitem);
+ if (idx == -1)
continue;
- while (entry - firstentry < nitem)
+ firstidx = idx;
+
+ while (idx - firstidx < nitem)
{
- if (entry->haspos)
+ WordEntry *entry;
+ char *lexeme = tsvector_getlexeme(t, idx, &entry);
+
+ Assert(!entry->hasoff);
+ if (entry->npos)
{
- dimt = POSDATALEN(t, entry);
- post = POSDATAPTR(t, entry);
+ dimt = entry->npos;
+ post = POSDATAPTR(lexeme, entry->len);
}
else
{
- dimt = posnull.npos;
- post = posnull.pos;
+ dimt = 1;
+ post = posnull;
}
resj = 0.0;
@@ -345,7 +366,7 @@ calc_rank_or(const float *w, TSVector t, TSQuery q)
*/
res = res + (wjm + resj - wjm / ((jm + 1) * (jm + 1))) / 1.64493406685;
- entry++;
+ idx++;
}
}
if (size > 0)
@@ -361,7 +382,7 @@ calc_rank(const float *w, TSVector t, TSQuery q, int32 method)
float res = 0.0;
int len;
- if (!t->size || !q->size)
+ if (!TS_COUNT(t) || !q->size)
return 0.0;
/* XXX: What about NOT? */
@@ -373,7 +394,7 @@ calc_rank(const float *w, TSVector t, TSQuery q, int32 method)
if (res < 0)
res = 1e-20f;
- if ((method & RANK_NORM_LOGLENGTH) && t->size > 0)
+ if ((method & RANK_NORM_LOGLENGTH) && TS_COUNT(t) > 0)
res /= log((double) (cnt_length(t) + 1)) / log(2.0);
if (method & RANK_NORM_LENGTH)
@@ -385,11 +406,11 @@ calc_rank(const float *w, TSVector t, TSQuery q, int32 method)
/* RANK_NORM_EXTDIST not applicable */
- if ((method & RANK_NORM_UNIQ) && t->size > 0)
- res /= (float) (t->size);
+ if ((method & RANK_NORM_UNIQ) && TS_COUNT(t) > 0)
+ res /= (float) (TS_COUNT(t));
- if ((method & RANK_NORM_LOGUNIQ) && t->size > 0)
- res /= log((double) (t->size + 1)) / log(2.0);
+ if ((method & RANK_NORM_LOGUNIQ) && TS_COUNT(t) > 0)
+ res /= log((double) (TS_COUNT(t) + 1)) / log(2.0);
if (method & RANK_NORM_RDIVRPLUS1)
res /= (res + 1);
@@ -504,13 +525,13 @@ typedef struct
struct
{ /* compiled doc representation */
QueryItem **items;
- int16 nitem;
+ int32 nitem;
} query;
struct
{ /* struct is used for preparing doc
* representation */
QueryItem *item;
- WordEntry *entry;
+ int32 idx;
} map;
} data;
WordEntryPos pos;
@@ -526,10 +547,10 @@ compareDocR(const void *va, const void *vb)
{
if (WEP_GETWEIGHT(a->pos) == WEP_GETWEIGHT(b->pos))
{
- if (a->data.map.entry == b->data.map.entry)
+ if (a->data.map.idx == b->data.map.idx)
return 0;
- return (a->data.map.entry > b->data.map.entry) ? 1 : -1;
+ return (a->data.map.idx > b->data.map.idx) ? 1 : -1;
}
return (WEP_GETWEIGHT(a->pos) > WEP_GETWEIGHT(b->pos)) ? 1 : -1;
@@ -724,9 +745,6 @@ static DocRepresentation *
get_docrep(TSVector txt, QueryRepresentation *qr, int *doclen)
{
QueryItem *item = GETQUERY(qr->query);
- WordEntry *entry,
- *firstentry;
- WordEntryPos *post;
int32 dimt, /* number of 'post' items */
j,
i,
@@ -743,29 +761,38 @@ get_docrep(TSVector txt, QueryRepresentation *qr, int *doclen)
*/
for (i = 0; i < qr->query->size; i++)
{
+ int idx,
+ firstidx;
QueryOperand *curoperand;
+ WordEntryPos *post;
if (item[i].type != QI_VAL)
continue;
curoperand = &item[i].qoperand;
- firstentry = entry = find_wordentry(txt, qr->query, curoperand, &nitem);
- if (!entry)
+ idx = find_wordentry(txt, qr->query, curoperand, &nitem);
+ if (idx < 0)
continue;
+ firstidx = idx;
+
/* iterations over entries in tsvector */
- while (entry - firstentry < nitem)
+ while (idx - firstidx < nitem)
{
- if (entry->haspos)
+ WordEntry *entry;
+ char *lex = tsvector_getlexeme(txt, idx, &entry);
+
+ Assert(!entry->hasoff);
+ if (entry->npos)
{
- dimt = POSDATALEN(txt, entry);
- post = POSDATAPTR(txt, entry);
+ dimt = entry->npos;
+ post = POSDATAPTR(lex, entry->len);
}
else
{
/* ignore words without positions */
- entry++;
+ idx++;
continue;
}
@@ -782,13 +809,12 @@ get_docrep(TSVector txt, QueryRepresentation *qr, int *doclen)
curoperand->weight & (1 << WEP_GETWEIGHT(post[j])))
{
doc[cur].pos = post[j];
- doc[cur].data.map.entry = entry;
+ doc[cur].data.map.idx = idx;
doc[cur].data.map.item = (QueryItem *) curoperand;
cur++;
}
}
-
- entry++;
+ idx++;
}
}
@@ -814,7 +840,7 @@ get_docrep(TSVector txt, QueryRepresentation *qr, int *doclen)
while (rptr - doc < cur)
{
if (rptr->pos == (rptr - 1)->pos &&
- rptr->data.map.entry == (rptr - 1)->data.map.entry)
+ rptr->data.map.idx == (rptr - 1)->data.map.idx)
{
storage.data.query.items[storage.data.query.nitem] = rptr->data.map.item;
storage.data.query.nitem++;
@@ -917,7 +943,7 @@ calc_rank_cd(const float4 *arrdata, TSVector txt, TSQuery query, int method)
NExtent++;
}
- if ((method & RANK_NORM_LOGLENGTH) && txt->size > 0)
+ if ((method & RANK_NORM_LOGLENGTH) && TS_COUNT(txt) > 0)
Wdoc /= log((double) (cnt_length(txt) + 1));
if (method & RANK_NORM_LENGTH)
@@ -930,11 +956,11 @@ calc_rank_cd(const float4 *arrdata, TSVector txt, TSQuery query, int method)
if ((method & RANK_NORM_EXTDIST) && NExtent > 0 && SumDist > 0)
Wdoc /= ((double) NExtent) / SumDist;
- if ((method & RANK_NORM_UNIQ) && txt->size > 0)
- Wdoc /= (double) (txt->size);
+ if ((method & RANK_NORM_UNIQ) && TS_COUNT(txt) > 0)
+ Wdoc /= (double) (TS_COUNT(txt));
- if ((method & RANK_NORM_LOGUNIQ) && txt->size > 0)
- Wdoc /= log((double) (txt->size + 1)) / log(2.0);
+ if ((method & RANK_NORM_LOGUNIQ) && TS_COUNT(txt) > 0)
+ Wdoc /= log((double) (TS_COUNT(txt) + 1)) / log(2.0);
if (method & RANK_NORM_RDIVRPLUS1)
Wdoc /= (Wdoc + 1);
diff --git a/src/backend/utils/adt/tsvector.c b/src/backend/utils/adt/tsvector.c
index 6f66c1f58c..de34df0c3d 100644
--- a/src/backend/utils/adt/tsvector.c
+++ b/src/backend/utils/adt/tsvector.c
@@ -23,8 +23,8 @@
typedef struct
{
WordEntry entry; /* must be first! */
+ size_t offset; /* offset of lexeme in some buffer */
WordEntryPos *pos;
- int poslen; /* number of elements in pos */
} WordEntryIN;
@@ -79,14 +79,30 @@ uniquePos(WordEntryPos *a, int l)
/* Compare two WordEntryIN values for qsort */
static int
-compareentry(const void *va, const void *vb, void *arg)
+compareentry_in(const void *va, const void *vb, void *arg)
{
const WordEntryIN *a = (const WordEntryIN *) va;
const WordEntryIN *b = (const WordEntryIN *) vb;
char *BufferStr = (char *) arg;
- return tsCompareString(&BufferStr[a->entry.pos], a->entry.len,
- &BufferStr[b->entry.pos], b->entry.len,
+ return tsCompareString(&BufferStr[a->offset], a->entry.len,
+ &BufferStr[b->offset], b->entry.len,
+ false);
+}
+
+/* Compare two WordEntry values for qsort */
+static int
+compareentry(const void *va, const void *vb, void *arg)
+{
+ const WordEntry *a = (const WordEntry *) va;
+ const WordEntry *b = (const WordEntry *) vb;
+ TSVector tsv = (TSVector) arg;
+
+ uint32 offset1 = tsvector_getoffset(tsv, a - ARRPTR(tsv), NULL),
+ offset2 = tsvector_getoffset(tsv, b - ARRPTR(tsv), NULL);
+
+ return tsCompareString(STRPTR(tsv) + offset1, ENTRY_LEN(tsv, a),
+ STRPTR(tsv) + offset2, ENTRY_LEN(tsv, b),
false);
}
@@ -97,14 +113,15 @@ compareentry(const void *va, const void *vb, void *arg)
static int
uniqueentry(WordEntryIN *a, int l, char *buf, int *outbuflen)
{
- int buflen;
+ int buflen,
+ i = 0;
WordEntryIN *ptr,
*res;
Assert(l >= 1);
if (l > 1)
- qsort_arg((void *) a, l, sizeof(WordEntryIN), compareentry,
+ qsort_arg((void *) a, l, sizeof(WordEntryIN), compareentry_in,
(void *) buf);
buflen = 0;
@@ -112,67 +129,76 @@ uniqueentry(WordEntryIN *a, int l, char *buf, int *outbuflen)
ptr = a + 1;
while (ptr - a < l)
{
+ Assert(!ptr->entry.hasoff);
+
if (!(ptr->entry.len == res->entry.len &&
- strncmp(&buf[ptr->entry.pos], &buf[res->entry.pos],
- res->entry.len) == 0))
+ strncmp(&buf[ptr->offset], &buf[res->offset], res->entry.len) == 0))
{
/* done accumulating data into *res, count space needed */
+ buflen = SHORTALIGN(buflen);
+ if (i++ % TS_OFFSET_STRIDE == 0)
+ {
+ buflen = INTALIGN(buflen);
+ buflen += sizeof(WordEntry);
+ }
+
buflen += res->entry.len;
- if (res->entry.haspos)
+ if (res->entry.npos)
{
- res->poslen = uniquePos(res->pos, res->poslen);
+ res->entry.npos = uniquePos(res->pos, res->entry.npos);
buflen = SHORTALIGN(buflen);
- buflen += res->poslen * sizeof(WordEntryPos) + sizeof(uint16);
+ buflen += res->entry.npos * sizeof(WordEntryPos);
}
res++;
if (res != ptr)
- memcpy(res, ptr, sizeof(WordEntryIN));
+ *res = *ptr;
}
- else if (ptr->entry.haspos)
+ else if (ptr->entry.npos)
{
- if (res->entry.haspos)
+ if (res->entry.npos)
{
/* append ptr's positions to res's positions */
- int newlen = ptr->poslen + res->poslen;
+ int newlen = ptr->entry.npos + res->entry.npos;
res->pos = (WordEntryPos *)
repalloc(res->pos, newlen * sizeof(WordEntryPos));
- memcpy(&res->pos[res->poslen], ptr->pos,
- ptr->poslen * sizeof(WordEntryPos));
- res->poslen = newlen;
+ memcpy(&res->pos[res->entry.npos], ptr->pos,
+ ptr->entry.npos * sizeof(WordEntryPos));
+ res->entry.npos = newlen;
pfree(ptr->pos);
}
else
{
/* just give ptr's positions to pos */
- res->entry.haspos = 1;
+ res->entry.npos = ptr->entry.npos;
res->pos = ptr->pos;
- res->poslen = ptr->poslen;
}
}
ptr++;
}
/* count space needed for last item */
+ if (i % TS_OFFSET_STRIDE == 0)
+ {
+ buflen = INTALIGN(buflen);
+ buflen += sizeof(WordEntry);
+ }
+ else
+ buflen = SHORTALIGN(buflen);
+
buflen += res->entry.len;
- if (res->entry.haspos)
+
+ if (res->entry.npos)
{
- res->poslen = uniquePos(res->pos, res->poslen);
+ res->entry.npos = uniquePos(res->pos, res->entry.npos);
buflen = SHORTALIGN(buflen);
- buflen += res->poslen * sizeof(WordEntryPos) + sizeof(uint16);
+ buflen += res->entry.npos * sizeof(WordEntryPos);
}
*outbuflen = buflen;
return res + 1 - a;
}
-static int
-WordEntryCMP(WordEntry *a, WordEntry *b, char *buf)
-{
- return compareentry(a, b, buf);
-}
-
-
Datum
tsvectorin(PG_FUNCTION_ARGS)
{
@@ -181,7 +207,6 @@ tsvectorin(PG_FUNCTION_ARGS)
WordEntryIN *arr;
int totallen;
int arrlen; /* allocated size of arr */
- WordEntry *inarr;
int len = 0;
TSVector in;
int i;
@@ -189,7 +214,6 @@ tsvectorin(PG_FUNCTION_ARGS)
int toklen;
WordEntryPos *pos;
int poslen;
- char *strbuf;
int stroff;
/*
@@ -238,23 +262,13 @@ tsvectorin(PG_FUNCTION_ARGS)
tmpbuf = (char *) repalloc((void *) tmpbuf, buflen);
cur = tmpbuf + dist;
}
+ arr[len].entry.hasoff = 0;
arr[len].entry.len = toklen;
- arr[len].entry.pos = cur - tmpbuf;
+ arr[len].offset = cur - tmpbuf;
+ arr[len].entry.npos = poslen;
+ arr[len].pos = (poslen != 0) ? pos : NULL;
memcpy((void *) cur, (void *) token, toklen);
cur += toklen;
-
- if (poslen != 0)
- {
- arr[len].entry.haspos = 1;
- arr[len].pos = pos;
- arr[len].poslen = poslen;
- }
- else
- {
- arr[len].entry.haspos = 0;
- arr[len].pos = NULL;
- arr[len].poslen = 0;
- }
len++;
}
@@ -273,36 +287,18 @@ tsvectorin(PG_FUNCTION_ARGS)
totallen = CALCDATASIZE(len, buflen);
in = (TSVector) palloc0(totallen);
SET_VARSIZE(in, totallen);
- in->size = len;
- inarr = ARRPTR(in);
- strbuf = STRPTR(in);
+ TS_SETCOUNT(in, len);
stroff = 0;
for (i = 0; i < len; i++)
{
- memcpy(strbuf + stroff, &tmpbuf[arr[i].entry.pos], arr[i].entry.len);
- arr[i].entry.pos = stroff;
- stroff += arr[i].entry.len;
- if (arr[i].entry.haspos)
- {
- if (arr[i].poslen > 0xFFFF)
- elog(ERROR, "positions array too long");
-
- /* Copy number of positions */
- stroff = SHORTALIGN(stroff);
- *(uint16 *) (strbuf + stroff) = (uint16) arr[i].poslen;
- stroff += sizeof(uint16);
-
- /* Copy positions */
- memcpy(strbuf + stroff, arr[i].pos, arr[i].poslen * sizeof(WordEntryPos));
- stroff += arr[i].poslen * sizeof(WordEntryPos);
+ tsvector_addlexeme(in, i, &stroff, &tmpbuf[arr[i].offset],
+ arr[i].entry.len, arr[i].pos, arr[i].entry.npos);
+ if (arr[i].entry.npos)
pfree(arr[i].pos);
- }
- inarr[i] = arr[i].entry;
}
- Assert((strbuf + stroff - (char *) in) == totallen);
-
+ Assert((STRPTR(in) + stroff - (char *) in) == totallen);
PG_RETURN_TSVECTOR(in);
}
@@ -313,28 +309,37 @@ tsvectorout(PG_FUNCTION_ARGS)
char *outbuf;
int32 i,
lenbuf = 0,
- pp;
+ pp,
+ tscount = TS_COUNT(out);
+ uint32 pos;
WordEntry *ptr = ARRPTR(out);
char *curbegin,
*curin,
*curout;
- lenbuf = out->size * 2 /* '' */ + out->size - 1 /* space */ + 2 /* \0 */ ;
- for (i = 0; i < out->size; i++)
+ lenbuf = tscount * 2 /* '' */ + tscount - 1 /* space */ + 2 /* \0 */ ;
+ for (i = 0; i < tscount; i++)
{
- lenbuf += ptr[i].len * 2 * pg_database_encoding_max_length() /* for escape */ ;
- if (ptr[i].haspos)
- lenbuf += 1 /* : */ + 7 /* int2 + , + weight */ * POSDATALEN(out, &(ptr[i]));
+ int npos = ENTRY_NPOS(out, ptr + i);
+
+ lenbuf += ENTRY_LEN(out, ptr + i) * 2 * pg_database_encoding_max_length() /* for escape */ ;
+ if (npos)
+ lenbuf += 1 /* : */ + 7 /* int2 + , + weight */ * npos;
}
curout = outbuf = (char *) palloc(lenbuf);
- for (i = 0; i < out->size; i++)
+
+ INITPOS(pos);
+ for (i = 0; i < tscount; i++)
{
- curbegin = curin = STRPTR(out) + ptr->pos;
+ int lex_len = ENTRY_LEN(out, ptr),
+ npos = ENTRY_NPOS(out, ptr);
+
+ curbegin = curin = STRPTR(out) + pos;
if (i != 0)
*curout++ = ' ';
*curout++ = '\'';
- while (curin - curbegin < ptr->len)
+ while (curin - curbegin < lex_len)
{
int len = pg_mblen(curin);
@@ -348,12 +353,12 @@ tsvectorout(PG_FUNCTION_ARGS)
}
*curout++ = '\'';
- if ((pp = POSDATALEN(out, ptr)) != 0)
+ if ((pp = npos) != 0)
{
WordEntryPos *wptr;
*curout++ = ':';
- wptr = POSDATAPTR(out, ptr);
+ wptr = POSDATAPTR(curbegin, lex_len);
while (pp)
{
curout += sprintf(curout, "%d", WEP_GETPOS(*wptr));
@@ -379,7 +384,8 @@ tsvectorout(PG_FUNCTION_ARGS)
wptr++;
}
}
- ptr++;
+
+ INCRPTR(out, ptr, pos);
}
*curout = '\0';
@@ -406,35 +412,38 @@ tsvectorsend(PG_FUNCTION_ARGS)
StringInfoData buf;
int i,
j;
+ uint32 pos;
WordEntry *weptr = ARRPTR(vec);
pq_begintypsend(&buf);
+ pq_sendint(&buf, TS_COUNT(vec), sizeof(int32));
- pq_sendint(&buf, vec->size, sizeof(int32));
- for (i = 0; i < vec->size; i++)
+ INITPOS(pos);
+ for (i = 0; i < TS_COUNT(vec); i++)
{
- uint16 npos;
+ char *lexeme = STRPTR(vec) + pos;
+ int npos = ENTRY_NPOS(vec, weptr),
+ lex_len = ENTRY_LEN(vec, weptr);
/*
* the strings in the TSVector array are not null-terminated, so we
* have to send the null-terminator separately
*/
- pq_sendtext(&buf, STRPTR(vec) + weptr->pos, weptr->len);
+ pq_sendtext(&buf, lexeme, lex_len);
pq_sendbyte(&buf, '\0');
-
- npos = POSDATALEN(vec, weptr);
pq_sendint(&buf, npos, sizeof(uint16));
if (npos > 0)
{
- WordEntryPos *wepptr = POSDATAPTR(vec, weptr);
+ WordEntryPos *wepptr = POSDATAPTR(lexeme, lex_len);
for (j = 0; j < npos; j++)
pq_sendint(&buf, wepptr[j], sizeof(WordEntryPos));
}
- weptr++;
+ INCRPTR(vec, weptr, pos);
}
+ PG_FREE_IF_COPY(vec, 0);
PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
}
@@ -443,14 +452,16 @@ tsvectorrecv(PG_FUNCTION_ARGS)
{
StringInfo buf = (StringInfo) PG_GETARG_POINTER(0);
TSVector vec;
- int i;
- int32 nentries;
- int datalen; /* number of bytes used in the variable size
+ int i,
+ datalen; /* number of bytes used in the variable size
* area after fixed size TSVector header and
* WordEntries */
+ int32 nentries;
Size hdrlen;
Size len; /* allocated size of vec */
bool needSort = false;
+ char *prev_lexeme = NULL;
+ int prev_lex_len;
nentries = pq_getmsgint(buf, sizeof(int32));
if (nentries < 0 || nentries > (MaxAllocSize / sizeof(WordEntry)))
@@ -460,16 +471,17 @@ tsvectorrecv(PG_FUNCTION_ARGS)
len = hdrlen * 2; /* times two to make room for lexemes */
vec = (TSVector) palloc0(len);
- vec->size = nentries;
+ TS_SETCOUNT(vec, nentries);
datalen = 0;
for (i = 0; i < nentries; i++)
{
- const char *lexeme;
+ char *lexeme,
+ *lexeme_out;
uint16 npos;
- size_t lex_len;
+ int lex_len;
- lexeme = pq_getmsgstring(buf);
+ lexeme = (char *) pq_getmsgstring(buf);
npos = (uint16) pq_getmsgint(buf, sizeof(uint16));
/* sanity checks */
@@ -489,62 +501,42 @@ tsvectorrecv(PG_FUNCTION_ARGS)
*
* But make sure the buffer is large enough first.
*/
- while (hdrlen + SHORTALIGN(datalen + lex_len) +
- (npos + 1) * sizeof(WordEntryPos) >= len)
+ while (hdrlen + SHORTALIGN(datalen + lex_len) + sizeof(WordEntry) +
+ npos * sizeof(WordEntryPos) >= len)
{
len *= 2;
vec = (TSVector) repalloc(vec, len);
}
- vec->entries[i].haspos = (npos > 0) ? 1 : 0;
- vec->entries[i].len = lex_len;
- vec->entries[i].pos = datalen;
-
- memcpy(STRPTR(vec) + datalen, lexeme, lex_len);
-
- datalen += lex_len;
-
- if (i > 0 && WordEntryCMP(&vec->entries[i],
- &vec->entries[i - 1],
- STRPTR(vec)) <= 0)
+ if (prev_lexeme && tsCompareString(lexeme, lex_len,
+ prev_lexeme, prev_lex_len, false) <= 0)
needSort = true;
- /* Receive positions */
+ lexeme_out = tsvector_addlexeme(vec, i, &datalen, lexeme,
+ lex_len, NULL, npos);
if (npos > 0)
{
- uint16 j;
WordEntryPos *wepptr;
+ int j;
- /*
- * Pad to 2-byte alignment if necessary. Though we used palloc0
- * for the initial allocation, subsequent repalloc'd memory areas
- * are not initialized to zero.
- */
- if (datalen != SHORTALIGN(datalen))
- {
- *(STRPTR(vec) + datalen) = '\0';
- datalen = SHORTALIGN(datalen);
- }
-
- memcpy(STRPTR(vec) + datalen, &npos, sizeof(uint16));
-
- wepptr = POSDATAPTR(vec, &vec->entries[i]);
+ wepptr = POSDATAPTR(lexeme_out, lex_len);
for (j = 0; j < npos; j++)
{
wepptr[j] = (WordEntryPos) pq_getmsgint(buf, sizeof(WordEntryPos));
if (j > 0 && WEP_GETPOS(wepptr[j]) <= WEP_GETPOS(wepptr[j - 1]))
elog(ERROR, "position information is misordered");
}
-
- datalen += (npos + 1) * sizeof(WordEntry);
}
+
+ prev_lexeme = lexeme;
+ prev_lex_len = lex_len;
}
SET_VARSIZE(vec, hdrlen + datalen);
if (needSort)
- qsort_arg((void *) ARRPTR(vec), vec->size, sizeof(WordEntry),
- compareentry, (void *) STRPTR(vec));
+ qsort_arg((void *) ARRPTR(vec), TS_COUNT(vec), sizeof(WordEntry),
+ compareentry, (void *) vec);
PG_RETURN_TSVECTOR(vec);
}
diff --git a/src/backend/utils/adt/tsvector_op.c b/src/backend/utils/adt/tsvector_op.c
index 822520299e..02e80c4a74 100644
--- a/src/backend/utils/adt/tsvector_op.c
+++ b/src/backend/utils/adt/tsvector_op.c
@@ -33,9 +33,9 @@
typedef struct
{
- WordEntry *arrb;
- WordEntry *arre;
- char *values;
+ TSVector vec;
+ int bidx;
+ int eidx;
char *operand;
} CHKVAL;
@@ -71,7 +71,7 @@ static Datum tsvector_update_trigger(PG_FUNCTION_ARGS, bool config_column);
static int tsvector_bsearch(const TSVector tsv, char *lexeme, int lexeme_len);
/*
- * Order: haspos, len, word, for all positions (pos, weight)
+ * Order: npos, len, word, for all positions (pos, weight)
*/
static int
silly_cmp_tsvector(const TSVector a, const TSVector b)
@@ -80,9 +80,9 @@ silly_cmp_tsvector(const TSVector a, const TSVector b)
return -1;
else if (VARSIZE(a) > VARSIZE(b))
return 1;
- else if (a->size < b->size)
+ else if (TS_COUNT(a) < TS_COUNT(b))
return -1;
- else if (a->size > b->size)
+ else if (TS_COUNT(a) > TS_COUNT(b))
return 1;
else
{
@@ -90,28 +90,40 @@ silly_cmp_tsvector(const TSVector a, const TSVector b)
WordEntry *bptr = ARRPTR(b);
int i = 0;
int res;
+ uint32 pos1,
+ pos2;
+ INITPOS(pos1);
+ INITPOS(pos2);
- for (i = 0; i < a->size; i++)
+ for (i = 0; i < TS_COUNT(a); i++)
{
- if (aptr->haspos != bptr->haspos)
- {
- return (aptr->haspos > bptr->haspos) ? -1 : 1;
- }
- else if ((res = tsCompareString(STRPTR(a) + aptr->pos, aptr->len, STRPTR(b) + bptr->pos, bptr->len, false)) != 0)
+ char *lex1 = STRPTR(a) + pos1,
+ *lex2 = STRPTR(b) + pos2;
+ int npos1 = ENTRY_NPOS(a, aptr),
+ npos2 = ENTRY_NPOS(b, bptr);
+ int len1 = ENTRY_LEN(a, aptr),
+ len2 = ENTRY_LEN(b, bptr);
+
+ if ((npos1 == 0 || npos2 == 0) && npos1 != npos2)
+ return npos1 > npos2 ? -1 : 1;
+ else if ((res = tsCompareString(lex1, len1, lex2, len2, false)) != 0)
{
return res;
}
- else if (aptr->haspos)
+ else if (npos1 > 0)
{
- WordEntryPos *ap = POSDATAPTR(a, aptr);
- WordEntryPos *bp = POSDATAPTR(b, bptr);
+ WordEntryPos *ap,
+ *bp;
int j;
- if (POSDATALEN(a, aptr) != POSDATALEN(b, bptr))
- return (POSDATALEN(a, aptr) > POSDATALEN(b, bptr)) ? -1 : 1;
+ ap = POSDATAPTR(lex1, len1);
+ bp = POSDATAPTR(lex2, len2);
+
+ if (npos1 != npos2)
+ return (npos1 > npos2) ? -1 : 1;
- for (j = 0; j < POSDATALEN(a, aptr); j++)
+ for (j = 0; j < npos1; j++)
{
if (WEP_GETPOS(*ap) != WEP_GETPOS(*bp))
{
@@ -125,8 +137,8 @@ silly_cmp_tsvector(const TSVector a, const TSVector b)
}
}
- aptr++;
- bptr++;
+ INCRPTR(a, aptr, pos1);
+ INCRPTR(b, bptr, pos2);
}
}
@@ -161,27 +173,29 @@ tsvector_strip(PG_FUNCTION_ARGS)
TSVector in = PG_GETARG_TSVECTOR(0);
TSVector out;
int i,
+ count,
+ posout = 0,
+ pos,
len = 0;
- WordEntry *arrin = ARRPTR(in),
- *arrout;
- char *cur;
+ WordEntry *entryin = ARRPTR(in);
- for (i = 0; i < in->size; i++)
- len += arrin[i].len;
+ count = TS_COUNT(in);
+ for (i = 0; i < count; i++)
+ INCRSIZE(len, i, ENTRY_LEN(in, ARRPTR(in) + i), 0);
- len = CALCDATASIZE(in->size, len);
+ len = CALCDATASIZE(count, len);
out = (TSVector) palloc0(len);
SET_VARSIZE(out, len);
- out->size = in->size;
- arrout = ARRPTR(out);
- cur = STRPTR(out);
- for (i = 0; i < in->size; i++)
+ TS_SETCOUNT(out, count);
+
+ INITPOS(pos);
+ for (i = 0; i < count; i++)
{
- memcpy(cur, STRPTR(in) + arrin[i].pos, arrin[i].len);
- arrout[i].haspos = 0;
- arrout[i].len = arrin[i].len;
- arrout[i].pos = cur - STRPTR(out);
- cur += arrout[i].len;
+ tsvector_addlexeme(out, i, &posout,
+ STRPTR(in) + pos, ENTRY_LEN(in, entryin),
+ NULL, 0);
+
+ INCRPTR(in, entryin, pos);
}
PG_FREE_IF_COPY(in, 0);
@@ -192,7 +206,7 @@ Datum
tsvector_length(PG_FUNCTION_ARGS)
{
TSVector in = PG_GETARG_TSVECTOR(0);
- int32 ret = in->size;
+ int32 ret = TS_COUNT(in);
PG_FREE_IF_COPY(in, 0);
PG_RETURN_INT32(ret);
@@ -204,11 +218,10 @@ tsvector_setweight(PG_FUNCTION_ARGS)
TSVector in = PG_GETARG_TSVECTOR(0);
char cw = PG_GETARG_CHAR(1);
TSVector out;
- int i,
- j;
- WordEntry *entry;
- WordEntryPos *p;
+ int i;
+ WordEntry *weptr;
int w = 0;
+ uint32 pos;
switch (cw)
{
@@ -235,20 +248,22 @@ tsvector_setweight(PG_FUNCTION_ARGS)
out = (TSVector) palloc(VARSIZE(in));
memcpy(out, in, VARSIZE(in));
- entry = ARRPTR(out);
- i = out->size;
- while (i--)
+ weptr = ARRPTR(out);
+
+ INITPOS(pos);
+ for (i = 0; i < TS_COUNT(out); i++)
{
- if ((j = POSDATALEN(out, entry)) != 0)
+ int j,
+ npos = ENTRY_NPOS(out, weptr);
+
+ if (npos)
{
- p = POSDATAPTR(out, entry);
- while (j--)
- {
- WEP_SETWEIGHT(*p, w);
- p++;
- }
+ WordEntryPos *p = POSDATAPTR(STRPTR(out) + pos, ENTRY_LEN(out, weptr));
+
+ for (j = 0; j < npos; j++)
+ WEP_SETWEIGHT(p[j], w);
}
- entry++;
+ INCRPTR(out, weptr, pos);
}
PG_FREE_IF_COPY(in, 0);
@@ -269,10 +284,8 @@ tsvector_setweight_by_filter(PG_FUNCTION_ARGS)
TSVector tsout;
int i,
- j,
nlexemes,
weight;
- WordEntry *entry;
Datum *dlexemes;
bool *nulls;
@@ -301,8 +314,6 @@ tsvector_setweight_by_filter(PG_FUNCTION_ARGS)
tsout = (TSVector) palloc(VARSIZE(tsin));
memcpy(tsout, tsin, VARSIZE(tsin));
- entry = ARRPTR(tsout);
-
deconstruct_array(lexemes, TEXTOID, -1, false, 'i',
&dlexemes, &nulls, &nlexemes);
@@ -315,7 +326,8 @@ tsvector_setweight_by_filter(PG_FUNCTION_ARGS)
{
char *lex;
int lex_len,
- lex_pos;
+ lex_idx,
+ npos;
if (nulls[i])
ereport(ERROR,
@@ -324,17 +336,19 @@ tsvector_setweight_by_filter(PG_FUNCTION_ARGS)
lex = VARDATA(dlexemes[i]);
lex_len = VARSIZE(dlexemes[i]) - VARHDRSZ;
- lex_pos = tsvector_bsearch(tsout, lex, lex_len);
+ lex_idx = tsvector_bsearch(tsin, lex, lex_len);
+ npos = ENTRY_NPOS(tsin, ARRPTR(tsout) + lex_idx);
- if (lex_pos >= 0 && (j = POSDATALEN(tsout, entry + lex_pos)) != 0)
+ if (lex_idx >= 0 && npos > 0)
{
- WordEntryPos *p = POSDATAPTR(tsout, entry + lex_pos);
+ int j;
+ WordEntry *we;
+ char *lexeme = tsvector_getlexeme(tsout, lex_idx, &we);
- while (j--)
- {
- WEP_SETWEIGHT(*p, weight);
- p++;
- }
+ WordEntryPos *p = POSDATAPTR(lexeme, we->len);
+
+ for (j = 0; j < npos; j++)
+ WEP_SETWEIGHT(p[j], weight);
}
}
@@ -354,34 +368,27 @@ tsvector_setweight_by_filter(PG_FUNCTION_ARGS)
* Return the number added (might be less than expected due to overflow)
*/
static int32
-add_pos(TSVector src, WordEntry *srcptr,
- TSVector dest, WordEntry *destptr,
+add_pos(char *src, WordEntry *srcptr,
+ WordEntryPos *dest, int from,
int32 maxpos)
{
- uint16 *clen = &_POSVECPTR(dest, destptr)->npos;
+ uint16 clen = from;
int i;
- uint16 slen = POSDATALEN(src, srcptr),
- startlen;
- WordEntryPos *spos = POSDATAPTR(src, srcptr),
- *dpos = POSDATAPTR(dest, destptr);
-
- if (!destptr->haspos)
- *clen = 0;
+ uint16 slen = srcptr->npos;
+ WordEntryPos *spos = POSDATAPTR(src, srcptr->len);
- startlen = *clen;
+ Assert(!srcptr->hasoff);
for (i = 0;
- i < slen && *clen < MAXNUMPOS &&
- (*clen == 0 || WEP_GETPOS(dpos[*clen - 1]) != MAXENTRYPOS - 1);
+ i < slen && clen < MAXNUMPOS &&
+ (clen == 0 || WEP_GETPOS(dest[clen - 1]) != MAXENTRYPOS - 1);
i++)
{
- WEP_SETWEIGHT(dpos[*clen], WEP_GETWEIGHT(spos[i]));
- WEP_SETPOS(dpos[*clen], LIMITPOS(WEP_GETPOS(spos[i]) + maxpos));
- (*clen)++;
+ WEP_SETWEIGHT(dest[clen], WEP_GETWEIGHT(spos[i]));
+ WEP_SETPOS(dest[clen], LIMITPOS(WEP_GETPOS(spos[i]) + maxpos));
+ clen++;
}
- if (*clen != startlen)
- destptr->haspos = 1;
- return *clen - startlen;
+ return clen - from;
}
/*
@@ -392,20 +399,20 @@ add_pos(TSVector src, WordEntry *srcptr,
static int
tsvector_bsearch(const TSVector tsv, char *lexeme, int lexeme_len)
{
- WordEntry *arrin = ARRPTR(tsv);
int StopLow = 0,
- StopHigh = tsv->size,
+ StopHigh = TS_COUNT(tsv),
StopMiddle,
cmp;
while (StopLow < StopHigh)
{
- StopMiddle = (StopLow + StopHigh) / 2;
+ WordEntry *entry = NULL;
+ char *str;
+ StopMiddle = (StopLow + StopHigh) / 2;
+ str = tsvector_getlexeme(tsv, StopMiddle, &entry);
cmp = tsCompareString(lexeme, lexeme_len,
- STRPTR(tsv) + arrin[StopMiddle].pos,
- arrin[StopMiddle].len,
- false);
+ str, entry->len, false);
if (cmp < 0)
StopHigh = StopMiddle;
@@ -460,14 +467,12 @@ tsvector_delete_by_indices(TSVector tsv, int *indices_to_delete,
int indices_count)
{
TSVector tsout;
- WordEntry *arrin = ARRPTR(tsv),
- *arrout;
- char *data = STRPTR(tsv),
- *dataout;
- int i, /* index in arrin */
- j, /* index in arrout */
+ WordEntry *ptr = ARRPTR(tsv);
+ int i, /* index in input tsvector */
+ j, /* index in output tsvector */
k, /* index in indices_to_delete */
- curoff; /* index in dataout area */
+ curoff = 0, /* index in data area of output */
+ pos;
/*
* Sort the filter array to simplify membership checks below. Also, get
@@ -495,16 +500,18 @@ tsvector_delete_by_indices(TSVector tsv, int *indices_to_delete,
tsout = (TSVector) palloc0(VARSIZE(tsv));
/* This count must be correct because STRPTR(tsout) relies on it. */
- tsout->size = tsv->size - indices_count;
+ TS_SETCOUNT(tsout, TS_COUNT(tsv) - indices_count);
/*
* Copy tsv to tsout, skipping lexemes listed in indices_to_delete.
*/
- arrout = ARRPTR(tsout);
- dataout = STRPTR(tsout);
- curoff = 0;
- for (i = j = k = 0; i < tsv->size; i++)
+
+ INITPOS(pos);
+ for (i = j = k = 0; i < TS_COUNT(tsv); i++)
{
+ char *lex = STRPTR(tsv) + pos;
+ int lex_len = ENTRY_LEN(tsv, ptr);
+
/*
* If current i is present in indices_to_delete, skip this lexeme.
* Since indices_to_delete is already sorted, we only need to check
@@ -513,28 +520,14 @@ tsvector_delete_by_indices(TSVector tsv, int *indices_to_delete,
if (k < indices_count && i == indices_to_delete[k])
{
k++;
- continue;
+ goto next;
}
- /* Copy lexeme and its positions and weights */
- memcpy(dataout + curoff, data + arrin[i].pos, arrin[i].len);
- arrout[j].haspos = arrin[i].haspos;
- arrout[j].len = arrin[i].len;
- arrout[j].pos = curoff;
- curoff += arrin[i].len;
- if (arrin[i].haspos)
- {
- int len = POSDATALEN(tsv, arrin + i) * sizeof(WordEntryPos)
- + sizeof(uint16);
-
- curoff = SHORTALIGN(curoff);
- memcpy(dataout + curoff,
- STRPTR(tsv) + SHORTALIGN(arrin[i].pos + arrin[i].len),
- len);
- curoff += len;
- }
+ tsvector_addlexeme(tsout, j++, &curoff, lex, lex_len,
+ POSDATAPTR(lex, lex_len), ENTRY_NPOS(tsv, ptr));
- j++;
+next:
+ INCRPTR(tsv, ptr, pos);
}
/*
@@ -543,8 +536,7 @@ tsvector_delete_by_indices(TSVector tsv, int *indices_to_delete,
* estimation of tsout's size is wrong.
*/
Assert(k == indices_count);
-
- SET_VARSIZE(tsout, CALCDATASIZE(tsout->size, curoff));
+ SET_VARSIZE(tsout, CALCDATASIZE(TS_COUNT(tsout), curoff));
return tsout;
}
@@ -637,6 +629,7 @@ tsvector_unnest(PG_FUNCTION_ARGS)
{
FuncCallContext *funcctx;
TSVector tsin;
+ uint32 pos;
if (SRF_IS_FIRSTCALL())
{
@@ -655,31 +648,33 @@ tsvector_unnest(PG_FUNCTION_ARGS)
TEXTARRAYOID, -1, 0);
funcctx->tuple_desc = BlessTupleDesc(tupdesc);
- funcctx->user_fctx = PG_GETARG_TSVECTOR_COPY(0);
+ INITPOS(pos);
+ funcctx->user_fctx = list_make2(PG_GETARG_TSVECTOR(0), makeInteger(pos));
MemoryContextSwitchTo(oldcontext);
}
funcctx = SRF_PERCALL_SETUP();
- tsin = (TSVector) funcctx->user_fctx;
+ tsin = (TSVector) linitial(funcctx->user_fctx);
+ pos = intVal(lsecond(funcctx->user_fctx));
- if (funcctx->call_cntr < tsin->size)
+ if (funcctx->call_cntr < TS_COUNT(tsin))
{
- WordEntry *arrin = ARRPTR(tsin);
+ WordEntry *entry = ARRPTR(tsin) + funcctx->call_cntr;
char *data = STRPTR(tsin);
HeapTuple tuple;
int j,
- i = funcctx->call_cntr;
+ npos = ENTRY_NPOS(tsin, entry),
+ lex_len = ENTRY_LEN(tsin, entry);
bool nulls[] = {false, false, false};
Datum values[3];
values[0] = PointerGetDatum(
- cstring_to_text_with_len(data + arrin[i].pos, arrin[i].len)
- );
+ cstring_to_text_with_len(data + pos, lex_len));
- if (arrin[i].haspos)
+ if (npos)
{
- WordEntryPosVector *posv;
+ WordEntryPos *apos = POSDATAPTR(data + pos, lex_len);
Datum *positions;
Datum *weights;
char weight;
@@ -689,28 +684,28 @@ tsvector_unnest(PG_FUNCTION_ARGS)
* uint16 (2 bits for weight, 14 for position). Here we extract
* that in two separate arrays.
*/
- posv = _POSVECPTR(tsin, arrin + i);
- positions = palloc(posv->npos * sizeof(Datum));
- weights = palloc(posv->npos * sizeof(Datum));
- for (j = 0; j < posv->npos; j++)
+ positions = palloc(npos * sizeof(Datum));
+ weights = palloc(npos * sizeof(Datum));
+ for (j = 0; j < npos; j++)
{
- positions[j] = Int16GetDatum(WEP_GETPOS(posv->pos[j]));
- weight = 'D' - WEP_GETWEIGHT(posv->pos[j]);
+ positions[j] = Int16GetDatum(WEP_GETPOS(apos[j]));
+ weight = 'D' - WEP_GETWEIGHT(apos[j]);
weights[j] = PointerGetDatum(
cstring_to_text_with_len(&weight, 1)
);
}
values[1] = PointerGetDatum(
- construct_array(positions, posv->npos, INT2OID, 2, true, 's'));
+ construct_array(positions, npos, INT2OID, 2, true, 's'));
values[2] = PointerGetDatum(
- construct_array(weights, posv->npos, TEXTOID, -1, false, 'i'));
+ construct_array(weights, npos, TEXTOID, -1, false, 'i'));
}
else
{
nulls[1] = nulls[2] = true;
}
+ INCRPTR(tsin, entry, intVal(lsecond(funcctx->user_fctx)));
tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls);
SRF_RETURN_NEXT(funcctx, HeapTupleGetDatum(tuple));
}
@@ -728,27 +723,147 @@ Datum
tsvector_to_array(PG_FUNCTION_ARGS)
{
TSVector tsin = PG_GETARG_TSVECTOR(0);
- WordEntry *arrin = ARRPTR(tsin);
+ WordEntry *entry = ARRPTR(tsin);
Datum *elements;
int i;
ArrayType *array;
+ long pos;
- elements = palloc(tsin->size * sizeof(Datum));
+ elements = palloc(TS_COUNT(tsin) * sizeof(Datum));
- for (i = 0; i < tsin->size; i++)
+ INITPOS(pos);
+ for (i = 0; i < TS_COUNT(tsin); i++)
{
elements[i] = PointerGetDatum(
- cstring_to_text_with_len(STRPTR(tsin) + arrin[i].pos, arrin[i].len)
- );
+ cstring_to_text_with_len(STRPTR(tsin) + pos, ENTRY_LEN(tsin, entry)));
+ INCRPTR(tsin, entry, pos);
}
- array = construct_array(elements, tsin->size, TEXTOID, -1, false, 'i');
+ array = construct_array(elements, TS_COUNT(tsin), TEXTOID, -1, false, 'i');
pfree(elements);
PG_FREE_IF_COPY(tsin, 0);
PG_RETURN_POINTER(array);
}
+/*
+ * Returns offset by given index in TSVector,
+ * this function used when we need random access
+ */
+int
+tsvector_getoffset(TSVector vec, int idx, WordEntry **we)
+{
+ int offset = 0;
+ WordEntry *entry;
+
+ entry = ARRPTR(vec) + idx;
+ if (we)
+ *we = entry;
+
+ while (!entry->hasoff)
+ {
+ entry--;
+ if (!entry->hasoff)
+ offset += SHORTALIGN(entry->len) + entry->npos * sizeof(WordEntryPos);
+ }
+
+ Assert(entry >= ARRPTR(vec));
+
+ if (idx % TS_OFFSET_STRIDE)
+ {
+ /* if idx is by offset */
+ WordEntry *offset_entry = (WordEntry *) (STRPTR(vec) + entry->offset);
+
+ offset += entry->offset + sizeof(WordEntry);
+ offset += SHORTALIGN(offset_entry->len) + offset_entry->npos * sizeof(WordEntryPos);
+ }
+ else
+ {
+ Assert(entry == ARRPTR(vec) + idx);
+
+ if (we)
+ *we = (WordEntry *) (STRPTR(vec) + entry->offset);
+ offset = entry->offset + sizeof(WordEntry);
+ }
+
+ return offset;
+}
+
+/*
+ * Add lexeme and its positions to tsvector and move dataoff (offset where
+ * data should be added) to new position.
+ * Returns pointer to lexeme start
+ */
+char *
+tsvector_addlexeme(TSVector tsv, int idx, int *dataoff,
+ char *lexeme, int lexeme_len, WordEntryPos *pos, int npos)
+{
+ int stroff;
+ WordEntry *entry;
+ char *result;
+
+ /* when idx is 0, dataoff should be 0 too, and otherwise */
+ Assert(!((idx == 0) ^ (*dataoff == 0)));
+
+ stroff = *dataoff;
+ entry = ARRPTR(tsv) + idx;
+
+ if (idx % TS_OFFSET_STRIDE == 0)
+ {
+ /* WordEntry with offset */
+ WordEntry offentry;
+
+ stroff = INTALIGN(stroff);
+ entry->hasoff = 1;
+ entry->offset = stroff;
+
+ /* fill WordEntry for offset */
+ offentry.hasoff = 0;
+ offentry.len = lexeme_len;
+ offentry.npos = npos;
+ memcpy(STRPTR(tsv) + stroff, &offentry, sizeof(WordEntry));
+ stroff += sizeof(WordEntry);
+ }
+ else
+ {
+ stroff = SHORTALIGN(stroff);
+ entry->hasoff = 0;
+ entry->len = lexeme_len;
+ entry->npos = npos;
+ }
+
+ memcpy(STRPTR(tsv) + stroff, lexeme, lexeme_len);
+ result = STRPTR(tsv) + stroff;
+ stroff += lexeme_len;
+
+ if (npos)
+ {
+ if (npos > 0xFFFF)
+ elog(ERROR, "positions array too long");
+
+ /*
+ * Pad to 2-byte alignment if necessary. We don't know how memory was
+ * allocated, so in case of aligning we need to make sure that unused
+ * is zero.
+ */
+ if (stroff != SHORTALIGN(stroff))
+ {
+ *(STRPTR(tsv) + stroff) = '\0';
+ stroff = SHORTALIGN(stroff);
+ }
+
+ /* Copy positions */
+ if (pos)
+ memcpy(STRPTR(tsv) + stroff, pos, npos * sizeof(WordEntryPos));
+
+ stroff += npos * sizeof(WordEntryPos);
+ }
+
+ *dataoff = stroff;
+
+ return result;
+}
+
/*
* Build tsvector from array of lexemes.
*/
@@ -758,14 +873,13 @@ array_to_tsvector(PG_FUNCTION_ARGS)
ArrayType *v = PG_GETARG_ARRAYTYPE_P(0);
TSVector tsout;
Datum *dlexemes;
- WordEntry *arrout;
bool *nulls;
int nitems,
i,
j,
tslen,
+ cur = 0,
datalen = 0;
- char *cur;
deconstruct_array(v, TEXTOID, -1, false, 'i', &dlexemes, &nulls, &nitems);
@@ -793,26 +907,24 @@ array_to_tsvector(PG_FUNCTION_ARGS)
/* Calculate space needed for surviving lexemes. */
for (i = 0; i < nitems; i++)
- datalen += VARSIZE(dlexemes[i]) - VARHDRSZ;
+ {
+ int lex_len = VARSIZE(dlexemes[i]) - VARHDRSZ;
+
+ INCRSIZE(datalen, i, lex_len, 0);
+ }
tslen = CALCDATASIZE(nitems, datalen);
/* Allocate and fill tsvector. */
tsout = (TSVector) palloc0(tslen);
SET_VARSIZE(tsout, tslen);
- tsout->size = nitems;
+ TS_SETCOUNT(tsout, nitems);
- arrout = ARRPTR(tsout);
- cur = STRPTR(tsout);
for (i = 0; i < nitems; i++)
{
char *lex = VARDATA(dlexemes[i]);
int lex_len = VARSIZE(dlexemes[i]) - VARHDRSZ;
- memcpy(cur, lex, lex_len);
- arrout[i].haspos = 0;
- arrout[i].len = lex_len;
- arrout[i].pos = cur - STRPTR(tsout);
- cur += lex_len;
+ tsvector_addlexeme(tsout, i, &cur, lex, lex_len, NULL, 0);
}
PG_FREE_IF_COPY(v, 0);
@@ -828,17 +940,16 @@ tsvector_filter(PG_FUNCTION_ARGS)
TSVector tsin = PG_GETARG_TSVECTOR(0),
tsout;
ArrayType *weights = PG_GETARG_ARRAYTYPE_P(1);
- WordEntry *arrin = ARRPTR(tsin),
- *arrout;
- char *datain = STRPTR(tsin),
- *dataout;
+ char *dataout;
Datum *dweights;
bool *nulls;
int nweights;
int i,
- j;
- int cur_pos = 0;
+ j,
+ dataoff = 0,
+ pos;
char mask = 0;
+ WordEntry *ptr = ARRPTR(tsin);
deconstruct_array(weights, CHAROID, 1, true, 'c',
&dweights, &nulls, &nweights);
@@ -879,109 +990,112 @@ tsvector_filter(PG_FUNCTION_ARGS)
}
tsout = (TSVector) palloc0(VARSIZE(tsin));
- tsout->size = tsin->size;
- arrout = ARRPTR(tsout);
+ TS_SETCOUNT(tsout, TS_COUNT(tsin));
dataout = STRPTR(tsout);
- for (i = j = 0; i < tsin->size; i++)
+ INITPOS(pos);
+ for (i = j = 0; i < TS_COUNT(tsin); i++)
{
- WordEntryPosVector *posvin,
- *posvout;
- int npos = 0;
- int k;
-
- if (!arrin[i].haspos)
- continue;
-
- posvin = _POSVECPTR(tsin, arrin + i);
- posvout = (WordEntryPosVector *)
- (dataout + SHORTALIGN(cur_pos + arrin[i].len));
-
- for (k = 0; k < posvin->npos; k++)
+ WordEntryPos *posin,
+ *posout;
+ int k,
+ npos = 0,
+ lex_len = ENTRY_LEN(tsin, ptr);
+ char *lex = STRPTR(tsin) + pos,
+ *lexout;
+
+ posin = POSDATAPTR(lex, lex_len);
+ for (k = 0; k < ENTRY_NPOS(tsin, ptr); k++)
{
- if (mask & (1 << WEP_GETWEIGHT(posvin->pos[k])))
- posvout->pos[npos++] = posvin->pos[k];
+ if (mask & (1 << WEP_GETWEIGHT(posin[k])))
+ npos++;
}
- /* if no satisfactory positions found, skip lexeme */
if (!npos)
- continue;
+ goto next;
- arrout[j].haspos = true;
- arrout[j].len = arrin[i].len;
- arrout[j].pos = cur_pos;
+ lexout = tsvector_addlexeme(tsout, j++, &dataoff, lex, lex_len,
+ NULL, npos);
+ posout = POSDATAPTR(lexout, lex_len);
+ npos = 0;
+ for (k = 0; k < ENTRY_NPOS(tsin, ptr); k++)
+ {
+ if (mask & (1 << WEP_GETWEIGHT(posin[k])))
+ posout[npos++] = posin[k];
+ }
- memcpy(dataout + cur_pos, datain + arrin[i].pos, arrin[i].len);
- posvout->npos = npos;
- cur_pos += SHORTALIGN(arrin[i].len);
- cur_pos += POSDATALEN(tsout, arrout + j) * sizeof(WordEntryPos) +
- sizeof(uint16);
- j++;
+next:
+ INCRPTR(tsin, ptr, pos);
}
- tsout->size = j;
+ TS_SETCOUNT(tsout, j);
if (dataout != STRPTR(tsout))
- memmove(STRPTR(tsout), dataout, cur_pos);
+ memmove(STRPTR(tsout), dataout, dataoff);
- SET_VARSIZE(tsout, CALCDATASIZE(tsout->size, cur_pos));
+ SET_VARSIZE(tsout, CALCDATASIZE(TS_COUNT(tsout), dataoff));
PG_FREE_IF_COPY(tsin, 0);
PG_RETURN_POINTER(tsout);
}
+/* Get max position in in1; we'll need this to offset in2's positions */
+static int
+get_maxpos(TSVector tsv)
+{
+ int i,
+ j,
+ maxpos = 0;
+ WordEntry *ptr = ARRPTR(tsv);
+ uint32 pos;
+ WordEntryPos *apos;
+
+ INITPOS(pos);
+ for (i = 0; i < TS_COUNT(tsv); i++)
+ {
+ apos = POSDATAPTR(STRPTR(tsv) + pos, ENTRY_LEN(tsv, ptr));
+ for (j = 0; j < ENTRY_NPOS(tsv, ptr); j++)
+ {
+ if (WEP_GETPOS(apos[j]) > maxpos)
+ maxpos = WEP_GETPOS(apos[j]);
+ }
+
+ INCRPTR(tsv, ptr, pos);
+ }
+
+ return maxpos;
+}
+
Datum
tsvector_concat(PG_FUNCTION_ARGS)
{
- TSVector in1 = PG_GETARG_TSVECTOR(0);
- TSVector in2 = PG_GETARG_TSVECTOR(1);
- TSVector out;
- WordEntry *ptr;
- WordEntry *ptr1,
+ TSVector in1 = PG_GETARG_TSVECTOR(0),
+ in2 = PG_GETARG_TSVECTOR(1),
+ out;
+ WordEntry *ptr,
+ *ptr1,
*ptr2;
- WordEntryPos *p;
int maxpos = 0,
i,
- j,
i1,
i2,
- dataoff,
output_bytes,
- output_size;
- char *data,
- *data1,
- *data2;
-
- /* Get max position in in1; we'll need this to offset in2's positions */
- ptr = ARRPTR(in1);
- i = in1->size;
- while (i--)
- {
- if ((j = POSDATALEN(in1, ptr)) != 0)
- {
- p = POSDATAPTR(in1, ptr);
- while (j--)
- {
- if (WEP_GETPOS(*p) > maxpos)
- maxpos = WEP_GETPOS(*p);
- p++;
- }
- }
- ptr++;
- }
+ pos1,
+ pos2,
+ dataoff;
+ char *data;
ptr1 = ARRPTR(in1);
ptr2 = ARRPTR(in2);
- data1 = STRPTR(in1);
- data2 = STRPTR(in2);
- i1 = in1->size;
- i2 = in2->size;
+ i1 = TS_COUNT(in1);
+ i2 = TS_COUNT(in2);
/*
* Conservative estimate of space needed. We might need all the data in
- * both inputs, and conceivably add a pad byte before position data for
- * each item where there was none before.
+ * both inputs, and conceivably add a pad bytes before lexeme and position
+ * data, and pad bytes before WordEntry for offset entry.
*/
- output_bytes = VARSIZE(in1) + VARSIZE(in2) + i1 + i2;
+ output_bytes = VARSIZE(in1) + VARSIZE(in2) + i1 * 2 + i2 * 2;
+ output_bytes += 4 * (i1 + i2) / TS_OFFSET_STRIDE;
out = (TSVector) palloc0(output_bytes);
SET_VARSIZE(out, output_bytes);
@@ -990,91 +1104,110 @@ tsvector_concat(PG_FUNCTION_ARGS)
* We must make out->size valid so that STRPTR(out) is sensible. We'll
* collapse out any unused space at the end.
*/
- out->size = in1->size + in2->size;
+ TS_SETCOUNT(out, i1 + i2);
- ptr = ARRPTR(out);
+ ptr = NULL;
data = STRPTR(out);
+ i = 0;
dataoff = 0;
+
+ INITPOS(pos1);
+ INITPOS(pos2);
+
+ /*
+ * we will need max position from first tsvector to add it positions of
+ * second tsvector
+ */
+ maxpos = get_maxpos(in1);
+
while (i1 && i2)
{
- int cmp = compareEntry(data1, ptr1, data2, ptr2);
+ char *lex = STRPTR(in1) + pos1,
+ *lex2 = STRPTR(in2) + pos2;
+
+ int lex_len = ENTRY_LEN(in1, ptr1),
+ lex2_len = ENTRY_LEN(in2, ptr2);
+
+ int cmp = tsCompareString(lex, lex_len, lex2, lex2_len, false);
if (cmp < 0)
{ /* in1 first */
- ptr->haspos = ptr1->haspos;
- ptr->len = ptr1->len;
- memcpy(data + dataoff, data1 + ptr1->pos, ptr1->len);
- ptr->pos = dataoff;
- dataoff += ptr1->len;
- if (ptr->haspos)
- {
- dataoff = SHORTALIGN(dataoff);
- memcpy(data + dataoff, _POSVECPTR(in1, ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16));
- dataoff += POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16);
- }
+ tsvector_addlexeme(out, i, &dataoff,
+ lex, lex_len,
+ POSDATAPTR(lex, lex_len), ENTRY_NPOS(in1, ptr1));
- ptr++;
- ptr1++;
+ INCRPTR(in1, ptr1, pos1);
i1--;
+ i++;
}
else if (cmp > 0)
{ /* in2 first */
- ptr->haspos = ptr2->haspos;
- ptr->len = ptr2->len;
- memcpy(data + dataoff, data2 + ptr2->pos, ptr2->len);
- ptr->pos = dataoff;
- dataoff += ptr2->len;
- if (ptr->haspos)
+ char *new_lex;
+ WordEntry *we = UNWRAP_ENTRY(in2, ptr2);
+
+ new_lex = tsvector_addlexeme(out, i, &dataoff, lex2, lex2_len, NULL, 0);
+ if (we->npos > 0)
{
- int addlen = add_pos(in2, ptr2, out, ptr, maxpos);
+ int addlen;
+ WordEntryPos *apos = POSDATAPTR(new_lex, lex2_len);
- if (addlen == 0)
- ptr->haspos = 0;
- else
+ addlen = add_pos(lex2, we, apos, 0, maxpos);
+ if (addlen > 0)
{
+ ptr = UNWRAP_ENTRY(out, ARRPTR(out) + i);
+ ptr->npos = addlen;
dataoff = SHORTALIGN(dataoff);
- dataoff += addlen * sizeof(WordEntryPos) + sizeof(uint16);
+ dataoff += ptr->npos * sizeof(WordEntryPos);
}
}
- ptr++;
- ptr2++;
+ INCRPTR(in2, ptr2, pos2);
+ i++;
i2--;
}
else
{
- ptr->haspos = ptr1->haspos | ptr2->haspos;
- ptr->len = ptr1->len;
- memcpy(data + dataoff, data1 + ptr1->pos, ptr1->len);
- ptr->pos = dataoff;
- dataoff += ptr1->len;
- if (ptr->haspos)
+ char *new_lex;
+ int npos1 = ENTRY_NPOS(in1, ptr1),
+ npos2 = ENTRY_NPOS(in2, ptr2);
+ WordEntryPos *apos;
+
+ new_lex = tsvector_addlexeme(out, i, &dataoff, lex, lex_len, NULL, 0);
+ apos = POSDATAPTR(new_lex, lex_len);
+
+ if (npos1 || npos2)
{
- if (ptr1->haspos)
- {
- dataoff = SHORTALIGN(dataoff);
- memcpy(data + dataoff, _POSVECPTR(in1, ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16));
- dataoff += POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16);
- if (ptr2->haspos)
- dataoff += add_pos(in2, ptr2, out, ptr, maxpos) * sizeof(WordEntryPos);
- }
- else /* must have ptr2->haspos */
+ int addlen;
+ char *lex2 = STRPTR(in2) + pos2;
+
+ ptr = UNWRAP_ENTRY(out, ARRPTR(out) + i);
+ if (npos1)
{
- int addlen = add_pos(in2, ptr2, out, ptr, maxpos);
+ /* add positions from left tsvector */
+ addlen = add_pos(lex, UNWRAP_ENTRY(in1, ptr1), apos, 0, 0);
+ ptr->npos = addlen;
- if (addlen == 0)
- ptr->haspos = 0;
- else
+ if (npos2)
{
- dataoff = SHORTALIGN(dataoff);
- dataoff += addlen * sizeof(WordEntryPos) + sizeof(uint16);
+ /* add positions from right right tsvector */
+ addlen = add_pos(lex2, UNWRAP_ENTRY(in2, ptr2), apos, addlen, maxpos);
+ ptr->npos += addlen;
}
}
+ else /* npos in second should be > 0 */
+ {
+ /* add positions from right tsvector */
+ addlen = add_pos(lex2, UNWRAP_ENTRY(in2, ptr2), apos, 0, maxpos);
+ ptr->npos = addlen;
+ }
+
+ dataoff = SHORTALIGN(dataoff);
+ dataoff += ptr->npos * sizeof(WordEntryPos);
}
- ptr++;
- ptr1++;
- ptr2++;
+ INCRPTR(in1, ptr1, pos1);
+ INCRPTR(in2, ptr2, pos2);
+ i++;
i1--;
i2--;
}
@@ -1082,45 +1215,44 @@ tsvector_concat(PG_FUNCTION_ARGS)
while (i1)
{
- ptr->haspos = ptr1->haspos;
- ptr->len = ptr1->len;
- memcpy(data + dataoff, data1 + ptr1->pos, ptr1->len);
- ptr->pos = dataoff;
- dataoff += ptr1->len;
- if (ptr->haspos)
- {
- dataoff = SHORTALIGN(dataoff);
- memcpy(data + dataoff, _POSVECPTR(in1, ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16));
- dataoff += POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16);
- }
+ char *lex = STRPTR(in1) + pos1;
+ int lex_len = ENTRY_LEN(in1, ptr1);
- ptr++;
- ptr1++;
+ tsvector_addlexeme(out, i, &dataoff,
+ lex, lex_len,
+ POSDATAPTR(lex, lex_len), ENTRY_NPOS(in1, ptr1));
+
+ INCRPTR(in1, ptr1, pos1);
+ i++;
i1--;
}
while (i2)
{
- ptr->haspos = ptr2->haspos;
- ptr->len = ptr2->len;
- memcpy(data + dataoff, data2 + ptr2->pos, ptr2->len);
- ptr->pos = dataoff;
- dataoff += ptr2->len;
- if (ptr->haspos)
+ char *lex = STRPTR(in2) + pos2,
+ *new_lex;
+ int lex_len = ENTRY_LEN(in2, ptr2),
+ npos = ENTRY_NPOS(in2, ptr2);
+
+ new_lex = tsvector_addlexeme(out, i, &dataoff, lex, lex_len, NULL, 0);
+ if (npos > 0)
{
- int addlen = add_pos(in2, ptr2, out, ptr, maxpos);
+ int addlen;
+ WordEntryPos *apos = POSDATAPTR(new_lex, lex_len);
- if (addlen == 0)
- ptr->haspos = 0;
- else
+ addlen = add_pos(lex, UNWRAP_ENTRY(in2, ptr2), apos, 0, maxpos);
+ if (addlen > 0)
{
+ WordEntry *ptr = UNWRAP_ENTRY(out, ARRPTR(out) + i);
+
+ ptr->npos = addlen;
dataoff = SHORTALIGN(dataoff);
- dataoff += addlen * sizeof(WordEntryPos) + sizeof(uint16);
+ dataoff += npos * sizeof(WordEntryPos);
}
}
- ptr++;
- ptr2++;
+ INCRPTR(in2, ptr2, pos2);
+ i++;
i2--;
}
@@ -1137,12 +1269,10 @@ tsvector_concat(PG_FUNCTION_ARGS)
* Adjust sizes (asserting that we didn't overrun the original estimates)
* and collapse out any unused array entries.
*/
- output_size = ptr - ARRPTR(out);
- Assert(output_size <= out->size);
- out->size = output_size;
+ TS_SETCOUNT(out, i);
if (data != STRPTR(out))
memmove(STRPTR(out), data, dataoff);
- output_bytes = CALCDATASIZE(out->size, dataoff);
+ output_bytes = CALCDATASIZE(TS_COUNT(out), dataoff);
Assert(output_bytes <= VARSIZE(out));
SET_VARSIZE(out, output_bytes);
@@ -1194,35 +1324,26 @@ tsCompareString(char *a, int lena, char *b, int lenb, bool prefix)
* Check weight info or/and fill 'data' with the required positions
*/
static bool
-checkclass_str(CHKVAL *chkval, WordEntry *entry, QueryOperand *val,
+checkclass_str(WordEntryPos *pv, int npos, QueryOperand *val,
ExecPhraseData *data)
{
bool result = false;
- if (entry->haspos && (val->weight || data))
+ if (npos && (val->weight || data))
{
- WordEntryPosVector *posvec;
-
- /*
- * We can't use the _POSVECPTR macro here because the pointer to the
- * tsvector's lexeme storage is already contained in chkval->values.
- */
- posvec = (WordEntryPosVector *)
- (chkval->values + SHORTALIGN(entry->pos + entry->len));
-
if (val->weight && data)
{
- WordEntryPos *posvec_iter = posvec->pos;
+ WordEntryPos *posvec_iter = pv;
WordEntryPos *dptr;
/*
* Filter position information by weights
*/
- dptr = data->pos = palloc(sizeof(WordEntryPos) * posvec->npos);
+ dptr = data->pos = palloc(sizeof(WordEntryPos) * npos);
data->allocated = true;
/* Is there a position with a matching weight? */
- while (posvec_iter < posvec->pos + posvec->npos)
+ while (posvec_iter < (pv + npos))
{
/* If true, append this position to the data->pos */
if (val->weight & (1 << WEP_GETWEIGHT(*posvec_iter)))
@@ -1241,10 +1362,10 @@ checkclass_str(CHKVAL *chkval, WordEntry *entry, QueryOperand *val,
}
else if (val->weight)
{
- WordEntryPos *posvec_iter = posvec->pos;
+ WordEntryPos *posvec_iter = pv;
/* Is there a position with a matching weight? */
- while (posvec_iter < posvec->pos + posvec->npos)
+ while (posvec_iter < (pv + npos))
{
if (val->weight & (1 << WEP_GETWEIGHT(*posvec_iter)))
{
@@ -1257,8 +1378,8 @@ checkclass_str(CHKVAL *chkval, WordEntry *entry, QueryOperand *val,
}
else /* data != NULL */
{
- data->npos = posvec->npos;
- data->pos = posvec->pos;
+ data->npos = npos;
+ data->pos = pv;
data->allocated = false;
result = true;
}
@@ -1311,26 +1432,32 @@ static bool
checkcondition_str(void *checkval, QueryOperand *val, ExecPhraseData *data)
{
CHKVAL *chkval = (CHKVAL *) checkval;
- WordEntry *StopLow = chkval->arrb;
- WordEntry *StopHigh = chkval->arre;
- WordEntry *StopMiddle = StopHigh;
+ int StopLow = chkval->bidx;
+ int StopHigh = chkval->eidx;
+ int StopMiddle = StopHigh;
int difference = -1;
bool res = false;
+ char *lexeme;
+ WordEntry *entry;
/* Loop invariant: StopLow <= val < StopHigh */
while (StopLow < StopHigh)
{
StopMiddle = StopLow + (StopHigh - StopLow) / 2;
+ lexeme = tsvector_getlexeme(chkval->vec, StopMiddle, &entry);
+
+ Assert(!entry->hasoff);
difference = tsCompareString(chkval->operand + val->distance,
val->length,
- chkval->values + StopMiddle->pos,
- StopMiddle->len,
+ lexeme,
+ entry->len,
false);
if (difference == 0)
{
/* Check weight info & fill 'data' with positions */
- res = checkclass_str(chkval, StopMiddle, val, data);
+ res = checkclass_str(POSDATAPTR(lexeme, entry->len),
+ entry->npos, val, data);
break;
}
else if (difference > 0)
@@ -1352,19 +1479,31 @@ checkcondition_str(void *checkval, QueryOperand *val, ExecPhraseData *data)
if (StopLow >= StopHigh)
StopMiddle = StopHigh;
- while ((!res || data) && StopMiddle < chkval->arre &&
- tsCompareString(chkval->operand + val->distance,
- val->length,
- chkval->values + StopMiddle->pos,
- StopMiddle->len,
- true) == 0)
+ while ((!res || data) && StopMiddle < chkval->eidx)
{
+ char *lexeme;
+ int cmp;
+ WordEntryPos *pv;
+
+ lexeme = tsvector_getlexeme(chkval->vec, StopMiddle, &entry);
+
+ Assert(!entry->hasoff);
+ pv = POSDATAPTR(lexeme, entry->len);
+ cmp = tsCompareString(chkval->operand + val->distance,
+ val->length,
+ lexeme,
+ entry->len,
+ true);
+
+ if (cmp != 0)
+ break;
+
if (data)
{
/*
* We need to join position information
*/
- res = checkclass_str(chkval, StopMiddle, val, data);
+ res = checkclass_str(pv, entry->npos, val, data);
if (res)
{
@@ -1388,7 +1527,7 @@ checkcondition_str(void *checkval, QueryOperand *val, ExecPhraseData *data)
}
else
{
- res = checkclass_str(chkval, StopMiddle, val, NULL);
+ res = checkclass_str(pv, entry->npos, val, NULL);
}
StopMiddle++;
@@ -1935,9 +2074,9 @@ ts_match_vq(PG_FUNCTION_ARGS)
PG_RETURN_BOOL(false);
}
- chkval.arrb = ARRPTR(val);
- chkval.arre = chkval.arrb + val->size;
- chkval.values = STRPTR(val);
+ chkval.bidx = 0;
+ chkval.eidx = TS_COUNT(val);
+ chkval.vec = val;
chkval.operand = GETOPERAND(query);
result = TS_execute(GETQUERY(query),
&chkval,
@@ -2001,12 +2140,15 @@ ts_match_tq(PG_FUNCTION_ARGS)
* that have a weight equal to one of the weights in 'weight' bitmask.
*/
static int
-check_weight(TSVector txt, WordEntry *wptr, int8 weight)
+check_weight(char *lexeme, WordEntry *wptr, int8 weight)
{
- int len = POSDATALEN(txt, wptr);
+ int len;
int num = 0;
- WordEntryPos *ptr = POSDATAPTR(txt, wptr);
+ WordEntryPos *ptr;
+ Assert(!wptr->hasoff);
+ len = wptr->len;
+ ptr = POSDATAPTR(lexeme, len);
while (len--)
{
if (weight & (1 << WEP_GETWEIGHT(*ptr)))
@@ -2017,31 +2159,34 @@ check_weight(TSVector txt, WordEntry *wptr, int8 weight)
}
#define compareStatWord(a,e,t) \
- tsCompareString((a)->lexeme, (a)->lenlexeme, \
- STRPTR(t) + (e)->pos, (e)->len, \
- false)
+ (tsCompareString((a)->lexeme, (a)->lenlexeme, \
+ t, (e)->len, false))
static void
insertStatEntry(MemoryContext persistentContext, TSVectorStat *stat, TSVector txt, uint32 off)
{
- WordEntry *we = ARRPTR(txt) + off;
+ WordEntry *we;
StatEntry *node = stat->root,
*pnode = NULL;
int n,
res = 0;
uint32 depth = 1;
+ char *lexeme;
+
+ lexeme = tsvector_getlexeme(txt, off, &we);
+ Assert(!we->hasoff);
if (stat->weight == 0)
- n = (we->haspos) ? POSDATALEN(txt, we) : 1;
+ n = (we->npos) ? we->npos : 1;
else
- n = (we->haspos) ? check_weight(txt, we, stat->weight) : 0;
+ n = (we->npos) ? check_weight(lexeme, we, stat->weight) : 0;
if (n == 0)
return; /* nothing to insert */
while (node)
{
- res = compareStatWord(node, we, txt);
+ res = compareStatWord(node, we, lexeme);
if (res == 0)
{
@@ -2065,7 +2210,7 @@ insertStatEntry(MemoryContext persistentContext, TSVectorStat *stat, TSVector tx
node->ndoc = 1;
node->nentry = n;
node->lenlexeme = we->len;
- memcpy(node->lexeme, STRPTR(txt) + we->pos, node->lenlexeme);
+ memcpy(node->lexeme, lexeme, node->lenlexeme);
if (pnode == NULL)
{
@@ -2092,13 +2237,14 @@ chooseNextStatEntry(MemoryContext persistentContext, TSVectorStat *stat, TSVecto
uint32 low, uint32 high, uint32 offset)
{
uint32 pos;
- uint32 middle = (low + high) >> 1;
+ uint32 middle = (low + high) >> 1,
+ count = TS_COUNT(txt);
pos = (low + middle) >> 1;
- if (low != middle && pos >= offset && pos - offset < txt->size)
+ if (low != middle && pos >= offset && pos - offset < count)
insertStatEntry(persistentContext, stat, txt, pos - offset);
pos = (high + middle + 1) >> 1;
- if (middle + 1 != high && pos >= offset && pos - offset < txt->size)
+ if (middle + 1 != high && pos >= offset && pos - offset < count)
insertStatEntry(persistentContext, stat, txt, pos - offset);
if (low != middle)
@@ -2125,7 +2271,8 @@ ts_accum(MemoryContext persistentContext, TSVectorStat *stat, Datum data)
TSVector txt = DatumGetTSVector(data);
uint32 i,
nbit = 0,
- offset;
+ offset,
+ count = TS_COUNT(txt);
if (stat == NULL)
{ /* Init in first */
@@ -2134,19 +2281,19 @@ ts_accum(MemoryContext persistentContext, TSVectorStat *stat, Datum data)
}
/* simple check of correctness */
- if (txt == NULL || txt->size == 0)
+ if (txt == NULL || count == 0)
{
if (txt && txt != (TSVector) DatumGetPointer(data))
pfree(txt);
return stat;
}
- i = txt->size - 1;
+ i = count - 1;
for (; i > 0; i >>= 1)
nbit++;
nbit = 1 << nbit;
- offset = (nbit - txt->size) / 2;
+ offset = (nbit - count) / 2;
insertStatEntry(persistentContext, stat, txt, (nbit >> 1) - offset);
chooseNextStatEntry(persistentContext, stat, txt, 0, nbit, offset);
@@ -2579,15 +2726,28 @@ tsvector_update_trigger(PG_FUNCTION_ARGS, bool config_column)
}
/* make tsvector value */
- datum = TSVectorGetDatum(make_tsvector(&prs));
- isnull = false;
-
- /* and insert it into tuple */
- rettuple = heap_modify_tuple_by_cols(rettuple, rel->rd_att,
- 1, &tsvector_attr_num,
- &datum, &isnull);
-
- pfree(DatumGetPointer(datum));
+ if (prs.curwords)
+ {
+ datum = PointerGetDatum(make_tsvector(&prs));
+ isnull = false;
+ rettuple = heap_modify_tuple_by_cols(rettuple, rel->rd_att,
+ 1, &tsvector_attr_num,
+ &datum, &isnull);
+ pfree(DatumGetPointer(datum));
+ }
+ else
+ {
+ TSVector out = palloc(CALCDATASIZE(0, 0));
+
+ SET_VARSIZE(out, CALCDATASIZE(0, 0));
+ TS_SETCOUNT(out, 0);
+ datum = PointerGetDatum(out);
+ isnull = false;
+ rettuple = heap_modify_tuple_by_cols(rettuple, rel->rd_att,
+ 1, &tsvector_attr_num,
+ &datum, &isnull);
+ pfree(prs.words);
+ }
return PointerGetDatum(rettuple);
}
diff --git a/src/include/tsearch/ts_type.h b/src/include/tsearch/ts_type.h
index 30d7c4bccd..eb94c595f2 100644
--- a/src/include/tsearch/ts_type.h
+++ b/src/include/tsearch/ts_type.h
@@ -24,30 +24,40 @@
* 2) int32 size - number of lexemes (WordEntry array entries)
* 3) Array of WordEntry - one per lexeme; must be sorted according to
* tsCompareString() (ie, memcmp of lexeme strings).
- * WordEntry->pos gives the number of bytes from end of WordEntry
- * array to start of lexeme's string, which is of length len.
+ * WordEntry have two types: offset or metadata (length of lexeme and number
+ * of positions). If it has offset then metadata will be by this offset.
* 4) Per-lexeme data storage:
- * lexeme string (not null-terminated)
- * if haspos is true:
+ * [4-byte aligned WordEntry] (if its WordEntry has offset)
+ * 2-byte aligned lexeme string (not null-terminated)
+ * if it has positions:
* padding byte if necessary to make the position data 2-byte aligned
- * uint16 number of positions that follow
* WordEntryPos[] positions
*
* The positions for each lexeme must be sorted.
*
- * Note, tsvectorsend/recv believe that sizeof(WordEntry) == 4
+ * Note, tsvector functions believe that sizeof(WordEntry) == 4
*/
-typedef struct
+#define TS_OFFSET_STRIDE 4
+
+typedef union
{
- uint32
- haspos:1,
- len:11, /* MAX 2Kb */
- pos:20; /* MAX 1Mb */
+ struct
+ {
+ uint32 hasoff:1,
+ offset:31;
+ };
+ struct
+ {
+ uint32 hasoff_:1,
+ len:11,
+ npos:16,
+ _unused:4;
+ };
} WordEntry;
#define MAXSTRLEN ( (1<<11) - 1)
-#define MAXSTRPOS ( (1<<20) - 1)
+#define MAXSTRPOS ( (1<<30) - 1)
extern int compareWordEntryPos(const void *a, const void *b);
@@ -62,19 +72,6 @@ extern int compareWordEntryPos(const void *a, const void *b);
typedef uint16 WordEntryPos;
-typedef struct
-{
- uint16 npos;
- WordEntryPos pos[FLEXIBLE_ARRAY_MEMBER];
-} WordEntryPosVector;
-
-/* WordEntryPosVector with exactly 1 entry */
-typedef struct
-{
- uint16 npos;
- WordEntryPos pos[1];
-} WordEntryPosVector1;
-
#define WEP_GETWEIGHT(x) ( (x) >> 14 )
#define WEP_GETPOS(x) ( (x) & 0x3fff )
@@ -90,13 +87,17 @@ typedef struct
typedef struct
{
int32 vl_len_; /* varlena header (do not touch directly!) */
- int32 size;
+ int32 size_; /* flags and lexemes count */
WordEntry entries[FLEXIBLE_ARRAY_MEMBER];
/* lexemes follow the entries[] array */
} TSVectorData;
typedef TSVectorData *TSVector;
+#define TS_FLAG_STRETCHED 0x80000000
+#define TS_COUNT(t) ((t)->size_ & 0x0FFFFFFF)
+#define TS_SETCOUNT(t,c) ((t)->size_ = (c) | TS_FLAG_STRETCHED)
+
#define DATAHDRSIZE (offsetof(TSVectorData, entries))
#define CALCDATASIZE(nentries, lenstr) (DATAHDRSIZE + (nentries) * sizeof(WordEntry) + (lenstr) )
@@ -104,24 +105,65 @@ typedef TSVectorData *TSVector;
#define ARRPTR(x) ( (x)->entries )
/* pointer to start of a tsvector's lexeme storage */
-#define STRPTR(x) ( (char *) &(x)->entries[(x)->size] )
+#define STRPTR(x) ( (char *) &(x)->entries[TS_COUNT(x)] )
-#define _POSVECPTR(x, e) ((WordEntryPosVector *)(STRPTR(x) + SHORTALIGN((e)->pos + (e)->len)))
-#define POSDATALEN(x,e) ( ( (e)->haspos ) ? (_POSVECPTR(x,e)->npos) : 0 )
-#define POSDATAPTR(x,e) (_POSVECPTR(x,e)->pos)
+/* for WordEntry with offset return its WordEntry with other properties */
+#define UNWRAP_ENTRY(x,we) \
+ ((we)->hasoff? (WordEntry *)(STRPTR(x) + (we)->offset): (we))
+
+/*
+ * helpers used when we're not sure that WordEntry
+ * contains ether offset or len
+ */
+#define ENTRY_NPOS(x,we) (UNWRAP_ENTRY(x,we)->npos)
+#define ENTRY_LEN(x,we) (UNWRAP_ENTRY(x,we)->len)
+
+/* pointer to start of positions */
+#define POSDATAPTR(lex, len) ((WordEntryPos *) (lex + SHORTALIGN(len)))
+
+/* set default offset in tsvector data */
+#define INITPOS(p) ((p) = sizeof(WordEntry))
+
+/* increment entry and offset by given WordEntry */
+#define INCRPTR(x,w,p) \
+do { \
+ WordEntry *y = (w); \
+ if ((w)->hasoff) \
+ { \
+ y = (WordEntry *) (STRPTR(x) + (w)->offset); \
+ (p) = (w)->offset + sizeof(WordEntry); \
+ } \
+ (w)++; \
+ Assert(!y->hasoff); \
+ (p) += SHORTALIGN(y->len) + y->npos * sizeof(WordEntryPos); \
+ if ((w) - ARRPTR(x) < TS_COUNT(x) && w->hasoff) \
+ (p) = INTALIGN(p) + sizeof(WordEntry); \
+} while (0);
+
+/* used to calculate tsvector size in in tsvector constructors */
+#define INCRSIZE(s,i,l,n) /* size,index,len,npos */ \
+do { \
+ if ((i) % TS_OFFSET_STRIDE == 0) \
+ (s) = INTALIGN(s) + sizeof(WordEntry); \
+ else \
+ (s) = SHORTALIGN(s); \
+ (s) += (l); \
+ (s) = (n)? SHORTALIGN(s) + (n) * sizeof(WordEntryPos) : (s); \
+} while (0);
/*
* fmgr interface macros
*/
-#define DatumGetTSVector(X) ((TSVector) PG_DETOAST_DATUM(X))
-#define DatumGetTSVectorCopy(X) ((TSVector) PG_DETOAST_DATUM_COPY(X))
+TSVector tsvector_upgrade(Datum orig, bool copy);
+
+#define DatumGetTSVector(X) tsvector_upgrade((X), false)
+#define DatumGetTSVectorCopy(X) tsvector_upgrade((X), true)
#define TSVectorGetDatum(X) PointerGetDatum(X)
#define PG_GETARG_TSVECTOR(n) DatumGetTSVector(PG_GETARG_DATUM(n))
#define PG_GETARG_TSVECTOR_COPY(n) DatumGetTSVectorCopy(PG_GETARG_DATUM(n))
#define PG_RETURN_TSVECTOR(x) return TSVectorGetDatum(x)
-
/*
* TSQuery
*
@@ -239,4 +281,22 @@ typedef TSQueryData *TSQuery;
#define PG_GETARG_TSQUERY_COPY(n) DatumGetTSQueryCopy(PG_GETARG_DATUM(n))
#define PG_RETURN_TSQUERY(x) return TSQueryGetDatum(x)
+int tsvector_getoffset(TSVector vec, int idx, WordEntry **we);
+char *tsvector_addlexeme(TSVector tsv, int idx, int *dataoff,
+ char *lexeme, int lexeme_len, WordEntryPos *pos, int npos);
+
+/* Returns lexeme and its entry by given index from TSVector */
+inline static char *
+tsvector_getlexeme(TSVector vec, int idx, WordEntry **we)
+{
+ Assert(idx >= 0 && idx < TS_COUNT(vec));
+
+ /*
+ * we do not allow we == NULL because returned lexeme is not \0 ended, and
+ * always should be used with we->len
+ */
+ Assert(we != NULL);
+ return STRPTR(vec) + tsvector_getoffset(vec, idx, we);
+}
+
#endif /* _PG_TSTYPE_H_ */
On Thu, 10 Aug 2017 18:06:17 +0300
Alexander Korotkov <a.korotkov@postgrespro.ru> wrote:
On Wed, Aug 9, 2017 at 7:38 PM, Robert Haas <robertmhaas@gmail.com>
wrote:On Tue, Aug 1, 2017 at 4:00 PM, Ildus K
<i.kurbangaliev@postgrespro.ru> wrote:It's a workaround. DatumGetTSVector and
DatumGetTSVectorCopy will upgrade tsvector on the fly if it
has old format.Hmm, that seems like a real fix, not just a workaround. If you can
transparently read the old format, there's no problem. Not sure
about performance, though.+1
Ildus, I think we need to benchmark reading of the old format. There
would be tradeoff between performance of old format reading and
amount of extra code needed. Once we will have benchmarks we can
consider whether this is the solution we would like to buy.
In my benchmarks when database fits into buffers (so it's measurement of
the time required for the tsvectors conversion) it gives me these
results:
Without conversion:
$ ./tsbench2 -database test1 -bench_time 300
2017/08/17 12:04:44 Number of connections: 4
2017/08/17 12:04:44 Database: test1
2017/08/17 12:09:44 Processed: 51419
With conversion:
$ ./tsbench2 -database test1 -bench_time 300
2017/08/17 12:14:31 Number of connections: 4
2017/08/17 12:14:31 Database: test1
2017/08/17 12:19:31 Processed: 43607
I ran a bunch of these tests, and these results are stable on my
machine. So in these specific tests performance regression about 15%.
Same time I think this could be the worst case, because usually data
is on disk and conversion will not affect so much to performance.
--
---
Ildus Kurbangaliev
Postgres Professional: http://www.postgrespro.com
Russian Postgres Company
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
Hi,
On 08/17/2017 12:23 PM, Ildus Kurbangaliev wrote:
In my benchmarks when database fits into buffers (so it's measurement of
the time required for the tsvectors conversion) it gives me these
results:Without conversion:
$ ./tsbench2 -database test1 -bench_time 300
2017/08/17 12:04:44 Number of connections: 4
2017/08/17 12:04:44 Database: test1
2017/08/17 12:09:44 Processed: 51419With conversion:
$ ./tsbench2 -database test1 -bench_time 300
2017/08/17 12:14:31 Number of connections: 4
2017/08/17 12:14:31 Database: test1
2017/08/17 12:19:31 Processed: 43607I ran a bunch of these tests, and these results are stable on my
machine. So in these specific tests performance regression about 15%.Same time I think this could be the worst case, because usually data
is on disk and conversion will not affect so much to performance.
That seems like a fairly significant regression, TBH. I don't quite
agree we can simply assume in-memory workloads don't matter, plenty of
databases have 99% cache hit ratio (particularly when considering not
just shared buffers, but also page cache).
Can you share the benchmarks, so that others can retry running them?
regards
--
Tomas Vondra http://www.2ndQuadrant.com
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
On Thu, 7 Sep 2017 23:08:14 +0200
Tomas Vondra <tomas.vondra@2ndquadrant.com> wrote:
Hi,
On 08/17/2017 12:23 PM, Ildus Kurbangaliev wrote:
In my benchmarks when database fits into buffers (so it's
measurement of the time required for the tsvectors conversion) it
gives me these results:Without conversion:
$ ./tsbench2 -database test1 -bench_time 300
2017/08/17 12:04:44 Number of connections: 4
2017/08/17 12:04:44 Database: test1
2017/08/17 12:09:44 Processed: 51419With conversion:
$ ./tsbench2 -database test1 -bench_time 300
2017/08/17 12:14:31 Number of connections: 4
2017/08/17 12:14:31 Database: test1
2017/08/17 12:19:31 Processed: 43607I ran a bunch of these tests, and these results are stable on my
machine. So in these specific tests performance regression about
15%.Same time I think this could be the worst case, because usually data
is on disk and conversion will not affect so much to performance.That seems like a fairly significant regression, TBH. I don't quite
agree we can simply assume in-memory workloads don't matter, plenty of
databases have 99% cache hit ratio (particularly when considering not
just shared buffers, but also page cache).
I think part of this regression is caused by better compression of new
format. I can't say exact percent here, need to check with perf.
If you care about performace, you create indexes, which means that
tsvector will no longer be used for text search (except for ORDER BY
rank). Index machinery will only peek into tsquery. Moreover, RUM index
stores positions + lexemes, so it doesn't need tsvectors for ranked
search. As a result, tsvector becomes a storage for
building indexes (indexable type), not something that should be used at
runtime. And the change of the format doesn't affect index creation
time.
Can you share the benchmarks, so that others can retry running them?
Benchmarks are published at github:
https://github.com/ildus/tsbench . I'm not sure that they are easy to
use.
Best regards,
Ildus Kurbangaliev
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
On Mon, Sep 11, 2017 at 5:33 AM, Ildus Kurbangaliev
<i.kurbangaliev@postgrespro.ru> wrote:
Moreover, RUM index
stores positions + lexemes, so it doesn't need tsvectors for ranked
search. As a result, tsvector becomes a storage for
building indexes (indexable type), not something that should be used at
runtime. And the change of the format doesn't affect index creation
time.
RUM indexes, though, are not in core.
--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
On 09/11/2017 01:54 PM, Robert Haas wrote:
On Mon, Sep 11, 2017 at 5:33 AM, Ildus Kurbangaliev
<i.kurbangaliev@postgrespro.ru> wrote:Moreover, RUM index
stores positions + lexemes, so it doesn't need tsvectors for ranked
search. As a result, tsvector becomes a storage for
building indexes (indexable type), not something that should be used at
runtime. And the change of the format doesn't affect index creation
time.RUM indexes, though, are not in core.
Yeah, but I think Ildus has a point that this should not really matter
on indexed tsvectors. So the question is how realistic that benchmark
actually is. How likely are we to do queries on fts directly, not
through a GIN/GiST index? Particularly in performance-sensitive cases?
regards
--
Tomas Vondra http://www.2ndQuadrant.com
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
On Mon, Sep 11, 2017 at 9:51 PM, Tomas Vondra
<tomas.vondra@2ndquadrant.com> wrote:
On 09/11/2017 01:54 PM, Robert Haas wrote:
On Mon, Sep 11, 2017 at 5:33 AM, Ildus Kurbangaliev
<i.kurbangaliev@postgrespro.ru> wrote:Moreover, RUM index
stores positions + lexemes, so it doesn't need tsvectors for ranked
search. As a result, tsvector becomes a storage for
building indexes (indexable type), not something that should be used at
runtime. And the change of the format doesn't affect index creation
time.RUM indexes, though, are not in core.
Yeah, but I think Ildus has a point that this should not really matter
on indexed tsvectors. So the question is how realistic that benchmark
actually is. How likely are we to do queries on fts directly, not
through a GIN/GiST index? Particularly in performance-sensitive cases?
So many questions unanswered... I am marking the patch as returned
with feedback as the thread has stalled for two months now.
--
Michael