Tsvector editing functions
Hello.
There is patch that adds some editing routines for tsvector type.
tsvector delete(tsvector, text)
removes entry from tsvector by lexeme name
set unnest(tsvector)
expands a tsvector to a set of rows. Each row has following columns: lexeme, postings, weights.
text[] to_array(tsvector)
converts tsvector to array of lexemes
tsvector to_tsvector(text[])
converts array of lexemes to tsvector
Attachments:
tsvector_funcs.diffapplication/octet-stream; name=tsvector_funcs.diffDownload
diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
new file mode 100644
index e87210d..0990eb4
*** a/doc/src/sgml/func.sgml
--- b/doc/src/sgml/func.sgml
*************** CREATE TYPE rainbow AS ENUM ('red', 'ora
*** 9080,9085 ****
--- 9080,9135 ----
<row>
<entry>
<indexterm>
+ <primary>delete</primary>
+ </indexterm>
+ <literal><function>delete(<type>tsvector</>, <type>text</>)</function></literal>
+ </entry>
+ <entry><type>tsvector</type></entry>
+ <entry>remove entry from <type>tsvector</></entry>
+ <entry><literal>delete('fat:2,4 cat:3 rat:5A'::tsvector, 'fat')</literal></entry>
+ <entry><literal>'cat':3 'rat':5A</literal></entry>
+ </row>
+ <row>
+ <entry>
+ <indexterm>
+ <primary>unnest</primary>
+ </indexterm>
+ <literal><function>unnest(<type>tsvector</>)</function></literal>
+ </entry>
+ <entry><type>setof anyelement</type></entry>
+ <entry>expand a tsvector to a set of rows. Each row has following columns: lexeme, postings, weights.</entry>
+ <entry><literal>unnest('fat:2,4 cat:3 rat:5A'::tsvector)</literal></entry>
+ <entry><literallayout class="monospaced">cat {3} {0}
+ fat {2,4} {0,0}
+ rat {5} {3}</literallayout>(3 rows)</entry>
+ </row>
+ <row>
+ <entry>
+ <indexterm>
+ <primary>to_array</primary>
+ </indexterm>
+ <literal><function>to_array(<type>tsvector</>)</function></literal>
+ </entry>
+ <entry><type>text[]</type></entry>
+ <entry>convert <type>tsvector</> to array of lexemes</entry>
+ <entry><literal>to_array('fat:2,4 cat:3 rat:5A'::tsvector)</literal></entry>
+ <entry><literal>{cat,fat,rat}</literal></entry>
+ </row>
+ <row>
+ <entry>
+ <indexterm>
+ <primary>array_to_tsvector</primary>
+ </indexterm>
+ <literal><function>to_tsvector(<type>text[]</>)</function></literal>
+ </entry>
+ <entry><type>tsvector</type></entry>
+ <entry>convert array of lexemes to <type>tsvector</type></entry>
+ <entry><literal>to_tsvector('{fat,cat,rat}'::text[])</literal></entry>
+ <entry><literal>'fat' 'cat' 'rat'</literal></entry>
+ </row>
+ <row>
+ <entry>
+ <indexterm>
<primary>to_tsquery</primary>
</indexterm>
<literal><function>to_tsquery(<optional> <replaceable class="PARAMETER">config</> <type>regconfig</> , </optional> <replaceable class="PARAMETER">query</> <type>text</type>)</function></literal>
diff --git a/src/backend/utils/adt/tsvector_op.c b/src/backend/utils/adt/tsvector_op.c
new file mode 100644
index e822ba8..cc998c2
*** a/src/backend/utils/adt/tsvector_op.c
--- b/src/backend/utils/adt/tsvector_op.c
*************** add_pos(TSVector src, WordEntry *srcptr,
*** 291,296 ****
--- 291,515 ----
return *clen - startlen;
}
+ Datum
+ tsvector_delete(PG_FUNCTION_ARGS)
+ {
+ TSVector tsin = PG_GETARG_TSVECTOR(0),
+ tsout;
+ WordEntry *arrin = ARRPTR(tsin),
+ *arrout;
+ char *lexin = TextDatumGetCString(PG_GETARG_DATUM(1)),
+ *data,
+ *cur;
+ int i,
+ j,
+ lexin_len = strlen(lexin),
+ shrink_len,
+ skip_index = -1,
+ curoff = 0,
+ len = 0;
+
+ data = STRPTR(tsin);
+ for (i = 0; i < tsin->size; i++)
+ {
+ if ( (arrin[i].len == lexin_len) && !strncmp(lexin, data + arrin[i].pos, arrin[i].len))
+ break;
+ }
+
+ /* nothing to delete */
+ if (i == tsin->size)
+ PG_RETURN_POINTER(tsin);
+
+ /* otherwise we can skip i-th lexeme */
+ skip_index = i;
+ shrink_len = POSDATALEN(tsin, arrin+i) * sizeof(WordEntryPos)
+ + sizeof(WordEntry) + sizeof(uint16);
+
+ tsout = (TSVector) palloc0(VARSIZE(tsin) - shrink_len);
+ SET_VARSIZE(tsout, VARSIZE(tsin) - shrink_len);
+ tsout->size = tsin->size - 1;
+ arrout = ARRPTR(tsout);
+
+ cur = STRPTR(tsout);
+
+ for (i = 0, j = 0; i < tsin->size; i++)
+ {
+ if (i == skip_index)
+ continue;
+
+ memcpy(cur + curoff, data + arrin[i].pos, arrin[i].len);
+ arrout[j].haspos = arrin[i].haspos;
+ arrout[j].len = arrin[i].len;
+ arrout[j].pos = curoff;
+
+ curoff += arrin[i].len;
+
+ if (arrin[i].haspos)
+ {
+ curoff = SHORTALIGN(curoff);
+ len = POSDATALEN(tsin, arrin+i) * sizeof(WordEntryPos) + sizeof(uint16);
+ memcpy(cur + curoff, (STRPTR(tsin) + SHORTALIGN(arrin[i].pos + arrin[i].len)), len);
+ curoff += len;
+ }
+
+ j++;
+ }
+
+ PG_FREE_IF_COPY(tsin, 0);
+ PG_RETURN_POINTER(tsout);
+ }
+
+ Datum
+ tsvector_unnest(PG_FUNCTION_ARGS)
+ {
+ FuncCallContext *funcctx;
+ TSVector tsin = PG_GETARG_TSVECTOR(0);
+ WordEntry *arrin = ARRPTR(tsin);
+ char *data;
+
+ if (SRF_IS_FIRSTCALL())
+ {
+ MemoryContext oldcontext;
+ TupleDesc tupdesc;
+
+ funcctx = SRF_FIRSTCALL_INIT();
+ oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
+
+ tupdesc = CreateTemplateTupleDesc(3, false);
+ TupleDescInitEntry(tupdesc, (AttrNumber) 1, "lexeme",
+ TEXTOID, -1, 0);
+ TupleDescInitEntry(tupdesc, (AttrNumber) 2, "postings",
+ INT4ARRAYOID, -1, 0);
+ TupleDescInitEntry(tupdesc, (AttrNumber) 3, "weights",
+ INT2ARRAYOID, -1, 0);
+
+ funcctx->attinmeta = TupleDescGetAttInMetadata(tupdesc);
+ MemoryContextSwitchTo(oldcontext);
+ }
+
+ funcctx = SRF_PERCALL_SETUP();
+
+ data = STRPTR(tsin);
+ if (funcctx->call_cntr < tsin->size)
+ {
+ char *values[3],
+ *buf;
+ HeapTuple tuple;
+ int i = funcctx->call_cntr,
+ j;
+ WordEntryPosVector *posv;
+ StringInfo postings = makeStringInfo();
+ StringInfo weights = makeStringInfo();
+
+ buf = palloc(sizeof(char) * (arrin[funcctx->call_cntr].len + 1) );
+ memcpy(buf, data + arrin[funcctx->call_cntr].pos, arrin[funcctx->call_cntr].len);
+ buf[arrin[funcctx->call_cntr].len] = '\0';
+ values[0] = buf;
+
+ if (arrin[i].haspos)
+ {
+ posv = (WordEntryPosVector *)(STRPTR(tsin) + SHORTALIGN(arrin[i].len + arrin[i].pos));
+
+ appendStringInfo(postings, "{");
+ appendStringInfo(weights, "{");
+ for (j = 0; j < posv->npos; j++)
+ {
+ appendStringInfo(postings, "%d", WEP_GETPOS(posv->pos[j]));
+ appendStringInfo(weights, "%d", WEP_GETWEIGHT(posv->pos[j]));
+ if (j != posv->npos-1){
+ appendStringInfoChar(postings, ',');
+ appendStringInfoChar(weights, ',');
+ }
+ }
+ appendStringInfo(postings, "}");
+ appendStringInfo(weights, "}");
+
+ values[1] = postings->data;
+ values[2] = weights->data;
+ } else {
+ *values[1] = '\0';
+ *values[2] = '\0';
+ }
+
+ tuple = BuildTupleFromCStrings(funcctx->attinmeta, values);
+
+ PG_FREE_IF_COPY(tsin, 0);
+ SRF_RETURN_NEXT(funcctx, HeapTupleGetDatum(tuple));
+ }
+ else
+ SRF_RETURN_DONE(funcctx);
+ }
+
+ Datum
+ tsvector_to_array(PG_FUNCTION_ARGS)
+ {
+ TSVector tsin = PG_GETARG_TSVECTOR(0);
+ WordEntry *arrin = ARRPTR(tsin);
+ Datum elements[tsin->size];
+ int i;
+ ArrayType *array;
+
+ for (i = 0; i < tsin->size; i++)
+ elements[i] = PointerGetDatum(cstring_to_text_with_len(STRPTR(tsin) + arrin[i].pos,
+ arrin[i].len));
+ array = construct_array(elements, tsin->size, TEXTOID, -1, false, 'i');
+ PG_FREE_IF_COPY(tsin, 0);
+ PG_RETURN_POINTER(array);
+ }
+
+ Datum
+ array_to_tsvector(PG_FUNCTION_ARGS)
+ {
+ ArrayType *v = PG_GETARG_ARRAYTYPE_P(0);
+ Oid element_type = ARR_ELEMTYPE(v);
+ TSVector tsout;
+ Datum *dlexemes;
+ WordEntry *arrout;
+ bool *nulls;
+ int nitems,
+ i,
+ tslen,
+ *lexlens,
+ lexlen = 0;
+ char *cur,
+ **lexemes;
+
+ if (element_type != TEXTOID)
+ ereport(ERROR,
+ (errcode(ERRCODE_DATATYPE_MISMATCH),
+ errmsg("Only arrays of strings can be converted to tsvector")));
+
+ deconstruct_array(v, TEXTOID, -1, false, 'i',
+ &dlexemes, &nulls, &nitems);
+
+ lexemes = (char **) palloc(nitems * sizeof(char *));
+ lexlens = palloc(nitems * sizeof(int));
+ for (i = 0; i < nitems; i++)
+ {
+ lexemes[i] = TextDatumGetCString(dlexemes[i]);
+ lexlens[i] = strlen(lexemes[i]);
+ lexlen += lexlens[i];
+ }
+
+ tslen = CALCDATASIZE(nitems, lexlen);
+ tsout = (TSVector) palloc0(tslen);
+ SET_VARSIZE(tsout, tslen);
+ tsout->size = nitems;
+ arrout = ARRPTR(tsout);
+ cur = STRPTR(tsout);
+
+ for (i = 0; i < nitems; i++)
+ {
+ memcpy(cur, lexemes[i], lexlens[i]);
+ arrout[i].haspos = 0;
+ arrout[i].len = lexlens[i];
+ arrout[i].pos = cur - STRPTR(tsout);
+ cur += lexlens[i];
+ }
+
+ PG_FREE_IF_COPY(v, 0);
+ PG_RETURN_POINTER(tsout);
+ }
Datum
tsvector_concat(PG_FUNCTION_ARGS)
diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h
new file mode 100644
index eb55b3a..f7fb490
*** a/src/include/catalog/pg_proc.h
--- b/src/include/catalog/pg_proc.h
*************** DATA(insert OID = 3624 ( setweight P
*** 4574,4579 ****
--- 4574,4588 ----
DESCR("set weight of lexeme's entries");
DATA(insert OID = 3625 ( tsvector_concat PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 3614 "3614 3614" _null_ _null_ _null_ _null_ _null_ tsvector_concat _null_ _null_ _null_ ));
+ DATA(insert OID = 3315 ( delete PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 3614 "3614 25" _null_ _null_ _null_ _null_ _null_ tsvector_delete _null_ _null_ _null_ ));
+ DESCR("delete lexeme");
+ DATA(insert OID = 3316 ( unnest PGNSP PGUID 12 1 10 0 0 f f f f t t i s 1 0 2249 "3614" "{3614,25,1007,1005}" "{i,o,o,o}" "{tsvector,lexeme,postings,weights}" _null_ _null_ tsvector_unnest _null_ _null_ _null_ ));
+ DESCR("expand tsvector to set of rows");
+ DATA(insert OID = 3317 ( to_array PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 1009 "3614" _null_ _null_ _null_ _null_ _null_ tsvector_to_array _null_ _null_ _null_ ));
+ DESCR("convert to lexeme's array");
+ DATA(insert OID = 3318 ( to_tsvector PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 3614 "1009" _null_ _null_ _null_ _null_ _null_ array_to_tsvector _null_ _null_ _null_ ));
+ DESCR("build tsvector from lexeme's array");
+
DATA(insert OID = 3634 ( ts_match_vq PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 16 "3614 3615" _null_ _null_ _null_ _null_ _null_ ts_match_vq _null_ _null_ _null_ ));
DATA(insert OID = 3635 ( ts_match_qv PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 16 "3615 3614" _null_ _null_ _null_ _null_ _null_ ts_match_qv _null_ _null_ _null_ ));
DATA(insert OID = 3760 ( ts_match_tt PGNSP PGUID 12 100 0 0 0 f f f f t f s s 2 0 16 "25 25" _null_ _null_ _null_ _null_ _null_ ts_match_tt _null_ _null_ _null_ ));
diff --git a/src/include/tsearch/ts_type.h b/src/include/tsearch/ts_type.h
new file mode 100644
index 281cdd6..81e7bbb
*** a/src/include/tsearch/ts_type.h
--- b/src/include/tsearch/ts_type.h
*************** extern Datum tsvector_length(PG_FUNCTION
*** 142,147 ****
--- 142,152 ----
extern Datum tsvector_strip(PG_FUNCTION_ARGS);
extern Datum tsvector_setweight(PG_FUNCTION_ARGS);
extern Datum tsvector_concat(PG_FUNCTION_ARGS);
+ extern Datum tsvector_delete(PG_FUNCTION_ARGS);
+ extern Datum tsvector_unnest(PG_FUNCTION_ARGS);
+ extern Datum tsvector_to_array(PG_FUNCTION_ARGS);
+ extern Datum array_to_tsvector(PG_FUNCTION_ARGS);
+
extern Datum tsvector_update_trigger_byid(PG_FUNCTION_ARGS);
extern Datum tsvector_update_trigger_bycolumn(PG_FUNCTION_ARGS);
diff --git a/src/test/regress/expected/tstypes.out b/src/test/regress/expected/tstypes.out
new file mode 100644
index 6284fb6..faa33e5
*** a/src/test/regress/expected/tstypes.out
--- b/src/test/regress/expected/tstypes.out
*************** SELECT ts_rank_cd(' a:1 s:2 d g'::tsvect
*** 625,627 ****
--- 625,687 ----
0.1
(1 row)
+ SELECT delete(to_tsvector('Rebel spaceships, striking from a hidden base'), 'spaceship');
+ delete
+ ------------------------------------------
+ 'base':7 'hidden':6 'rebel':1 'strike':3
+ (1 row)
+
+ SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, 'base');
+ delete
+ --------------------------------------------------------------
+ 'hidden':6 'rebel':1 'spaceship':2,33A,34B,35C,36 'strike':3
+ (1 row)
+
+ SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, 'spaceship');
+ delete
+ ------------------------------------------
+ 'base':7 'hidden':6 'rebel':1 'strike':3
+ (1 row)
+
+ SELECT unnest('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);
+ unnest
+ ---------------------------------------------
+ (base,{7},{0})
+ (hidden,{6},{0})
+ (rebel,{1},{0})
+ (spaceship,"{2,33,34,35,36}","{0,3,2,1,0}")
+ (strike,{3},{0})
+ (5 rows)
+
+ SELECT * FROM unnest('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);
+ lexeme | postings | weights
+ -----------+-----------------+-------------
+ base | {7} | {0}
+ hidden | {6} | {0}
+ rebel | {1} | {0}
+ spaceship | {2,33,34,35,36} | {0,3,2,1,0}
+ strike | {3} | {0}
+ (5 rows)
+
+ SELECT lexeme, postings[1] from unnest('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);
+ lexeme | postings
+ -----------+----------
+ base | 7
+ hidden | 6
+ rebel | 1
+ spaceship | 2
+ strike | 3
+ (5 rows)
+
+ SELECT to_array('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);
+ to_array
+ --------------------------------------
+ {base,hidden,rebel,spaceship,strike}
+ (1 row)
+
+ SELECT to_tsvector(ARRAY['base','hidden','rebel','spaceship','strike']);
+ to_tsvector
+ ----------------------------------------------
+ 'base' 'hidden' 'rebel' 'spaceship' 'strike'
+ (1 row)
+
diff --git a/src/test/regress/sql/tstypes.sql b/src/test/regress/sql/tstypes.sql
new file mode 100644
index fd7c702..51baf30
*** a/src/test/regress/sql/tstypes.sql
--- b/src/test/regress/sql/tstypes.sql
*************** SELECT ts_rank_cd(' a:1 s:2 d g'::tsvect
*** 115,117 ****
--- 115,126 ----
SELECT ts_rank_cd(' a:1 s:2C d g'::tsvector, 'a & s');
SELECT ts_rank_cd(' a:1 s:2B d g'::tsvector, 'a & s');
SELECT ts_rank_cd(' a:1 s:2 d g'::tsvector, 'a & s');
+
+ SELECT delete(to_tsvector('Rebel spaceships, striking from a hidden base'), 'spaceship');
+ SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, 'base');
+ SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, 'spaceship');
+ SELECT unnest('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);
+ SELECT * FROM unnest('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);
+ SELECT lexeme, postings[1] from unnest('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);
+ SELECT to_array('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);
+ SELECT to_tsvector(ARRAY['base','hidden','rebel','spaceship','strike']);
On Mon, Oct 5, 2015 at 1:29 PM, Stas Kelvich <s.kelvich@postgrespro.ru> wrote:
Hello.
There is patch that adds some editing routines for tsvector type.
tsvector delete(tsvector, text)
removes entry from tsvector by lexeme name
set unnest(tsvector)
expands a tsvector to a set of rows. Each row has following columns: lexeme, postings, weights.
text[] to_array(tsvector)
converts tsvector to array of lexemes
tsvector to_tsvector(text[])
converts array of lexemes to tsvector
When submitting a patch, it's a good idea to explain why someone would
want the feature you are adding. Maybe that's obvious to you, but it
isn't clear to me why we'd want this.
--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
There is patch that adds some editing routines for tsvector type.
...
When submitting a patch, it's a good idea to explain why someone would
want the feature you are adding. Maybe that's obvious to you, but it
isn't clear to me why we'd want this.
Some examples:
tsvector delete(tsvector, text)
remove wronlgy indexed word (may, be a stop word)
text[] to_array(tsvector)
In my practice, I needed it to work with smlar module.
tsvector to_tsvector(text[])
Converts list of tags to tsvector, because search in tsvector is more
flexible and fast than array's equivalents
set unnest(tsvector)
Count some complicated statistics.
That functions mostly needed in utility processing rather in workflow.
--
Teodor Sigaev E-mail: teodor@sigaev.ru
WWW: http://www.sigaev.ru/
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
1 Please, make patch compilable with current master.
cd ../../../src/include/catalog && '/usr/local/bin/perl' ./duplicate_oids
3315
3316
2 lexin = TextDatumGetCString(PG_GETARG_DATUM(1))
lexin_len = strlen(lexin)
Why do you use C-string instead of just text? Suppose, much better:
t = PG_GETARG_TEXT_P(1)
lexin = VARDATA(t)
lexin_len = VARSIZE_ANY_EXHDR(t)
3 Why do you use linear search in tsvector instead of binary search? It could
produce a performance impact
4 Again, using BuildTupleFromCStrings() call is not very optimal
5 printing weights as numbers is not consistent with other usage of weigth's in
FTS. Lexem's weight are mentioned as one of A,B,C,D and default weight is a D.
Teodor Sigaev wrote:
There is patch that adds some editing routines for tsvector type.
...
When submitting a patch, it's a good idea to explain why someone would
want the feature you are adding. Maybe that's obvious to you, but it
isn't clear to me why we'd want this.Some examples:
tsvector delete(tsvector, text)
remove wronlgy indexed word (may, be a stop word)
text[] to_array(tsvector)
In my practice, I needed it to work with smlar module.
tsvector to_tsvector(text[])
Converts list of tags to tsvector, because search in tsvector is more
flexible and fast than array's equivalents
set unnest(tsvector)
Count some complicated statistics.That functions mostly needed in utility processing rather in workflow.
--
Teodor Sigaev E-mail: teodor@sigaev.ru
WWW: http://www.sigaev.ru/
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
Hmm, seems, it will be useful to add two fuctions:
tsvector filter(tsvector, array_of_weigths) - returns tsvector contains lexemes
with given weights
tsvector setweight(tsvector, weigth, array_of_lexemes) - sets given weight for
given lexemes
Stas Kelvich wrote:
Hello.
There is patch that adds some editing routines for tsvector type.
tsvector delete(tsvector, text)
removes entry from tsvector by lexeme name
set unnest(tsvector)
expands a tsvector to a set of rows. Each row has following columns: lexeme, postings, weights.
text[] to_array(tsvector)
converts tsvector to array of lexemes
tsvector to_tsvector(text[])
converts array of lexemes to tsvectorStas Kelvich
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company
--
Teodor Sigaev E-mail: teodor@sigaev.ru
WWW: http://www.sigaev.ru/
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
Hello.
Done with the list of suggestions. Also heavily edit delete function.
Attachments:
tsvector_ops.diffapplication/octet-stream; name=tsvector_ops.diffDownload
diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
new file mode 100644
index 60b9a09..62ac7fb
*** a/doc/src/sgml/func.sgml
--- b/doc/src/sgml/func.sgml
*************** CREATE TYPE rainbow AS ENUM ('red', 'ora
*** 9096,9101 ****
--- 9096,9113 ----
<row>
<entry>
<indexterm>
+ <primary>setweight_by_filter</primary>
+ </indexterm>
+ <literal><function>setweight(<type>tsvector</>, <type>"char"</>, <type>"text"[]</>)</function></literal>
+ </entry>
+ <entry><type>tsvector</type></entry>
+ <entry>assign weight to elements of <type>tsvector</> that are listed in array given as a third argument</entry>
+ <entry><literal>setweight('fat:2,4 cat:3 rat:5B'::tsvector, 'A', '{cat,rat}')</literal></entry>
+ <entry><literal>'cat':3A 'fat':2,4 'rat':5A</literal></entry>
+ </row>
+ <row>
+ <entry>
+ <indexterm>
<primary>strip</primary>
</indexterm>
<literal><function>strip(<type>tsvector</>)</function></literal>
*************** CREATE TYPE rainbow AS ENUM ('red', 'ora
*** 9108,9113 ****
--- 9120,9188 ----
<row>
<entry>
<indexterm>
+ <primary>delete</primary>
+ </indexterm>
+ <literal><function>delete(<type>tsvector</>, <type>text</>)</function></literal>
+ </entry>
+ <entry><type>tsvector</type></entry>
+ <entry>remove entry from <type>tsvector</></entry>
+ <entry><literal>delete('fat:2,4 cat:3 rat:5A'::tsvector, 'fat')</literal></entry>
+ <entry><literal>'cat':3 'rat':5A</literal></entry>
+ </row>
+ <row>
+ <entry>
+ <indexterm>
+ <primary>unnest</primary>
+ </indexterm>
+ <literal><function>unnest(<type>tsvector</>)</function></literal>
+ </entry>
+ <entry><type>setof anyelement</type></entry>
+ <entry>expand a tsvector to a set of rows. Each row has following columns: lexeme, postings, weights.</entry>
+ <entry><literal>unnest('fat:2,4 cat:3 rat:5A'::tsvector)</literal></entry>
+ <entry><literallayout class="monospaced">cat {3} {A}
+ fat {2,4} {D,D}
+ rat {5} {A}
+ (3 rows)</literallayout></entry>
+ </row>
+ <row>
+ <entry>
+ <indexterm>
+ <primary>to_array</primary>
+ </indexterm>
+ <literal><function>to_array(<type>tsvector</>)</function></literal>
+ </entry>
+ <entry><type>text[]</type></entry>
+ <entry>convert <type>tsvector</> to array of lexemes</entry>
+ <entry><literal>to_array('fat:2,4 cat:3 rat:5A'::tsvector)</literal></entry>
+ <entry><literal>{cat,fat,rat}</literal></entry>
+ </row>
+ <row>
+ <entry>
+ <indexterm>
+ <primary>array_to_tsvector</primary>
+ </indexterm>
+ <literal><function>to_tsvector(<type>text[]</>)</function></literal>
+ </entry>
+ <entry><type>tsvector</type></entry>
+ <entry>convert array of lexemes to <type>tsvector</type></entry>
+ <entry><literal>to_tsvector('{fat,cat,rat}'::text[])</literal></entry>
+ <entry><literal>'fat' 'cat' 'rat'</literal></entry>
+ </row>
+ <row>
+ <entry>
+ <indexterm>
+ <primary>filter</primary>
+ </indexterm>
+ <literal><function>filter(<type>tsvector</>, <type>"char"[]</>)</function></literal>
+ </entry>
+ <entry><type>tsvector</type></entry>
+ <entry>Select only elements with given weights from <type>tsvector</type></entry>
+ <entry><literal>filter('fat:2,4 cat:3b rat:5A'::tsvector, '{a,b}')</literal></entry>
+ <entry><literal>'cat':3B 'rat':5A</literal></entry>
+ </row>
+ <row>
+ <entry>
+ <indexterm>
<primary>to_tsquery</primary>
</indexterm>
<literal><function>to_tsquery(<optional> <replaceable class="PARAMETER">config</> <type>regconfig</> , </optional> <replaceable class="PARAMETER">query</> <type>text</type>)</function></literal>
diff --git a/src/backend/utils/adt/tsvector_op.c b/src/backend/utils/adt/tsvector_op.c
new file mode 100644
index e822ba8..05beaa9
*** a/src/backend/utils/adt/tsvector_op.c
--- b/src/backend/utils/adt/tsvector_op.c
***************
*** 14,19 ****
--- 14,20 ----
#include "postgres.h"
+ #include "access/htup_details.h"
#include "catalog/namespace.h"
#include "catalog/pg_type.h"
#include "commands/trigger.h"
*************** tsvector_length(PG_FUNCTION_ARGS)
*** 195,200 ****
--- 196,294 ----
}
Datum
+ tsvector_setweight_by_filter(PG_FUNCTION_ARGS)
+ {
+ TSVector in = PG_GETARG_TSVECTOR(0);
+ char cw = PG_GETARG_CHAR(1);
+ ArrayType *lexarr = NULL;
+ TSVector out;
+ int i,
+ j,
+ nlex = 0,
+ lex_len,
+ w = 0,
+ StopLow, StopHigh, StopMiddle, cmp;;
+ WordEntry *entry;
+ WordEntryPos *p;
+ Datum *dlexemes;
+ bool *nulls;
+ char *data,
+ *lex;
+
+ switch (cw)
+ {
+ case 'A':
+ case 'a':
+ w = 3;
+ break;
+ case 'B':
+ case 'b':
+ w = 2;
+ break;
+ case 'C':
+ case 'c':
+ w = 1;
+ break;
+ case 'D':
+ case 'd':
+ w = 0;
+ break;
+ default:
+ /* internal error */
+ elog(ERROR, "unrecognized weight: %c", cw);
+ }
+
+ out = (TSVector) palloc(VARSIZE(in));
+ memcpy(out, in, VARSIZE(in));
+ entry = ARRPTR(out);
+
+ lexarr = PG_GETARG_ARRAYTYPE_P(2);
+ deconstruct_array(lexarr, TEXTOID, -1, false, 'i',
+ &dlexemes, &nulls, &nlex);
+ data = STRPTR(out);
+
+ /*
+ * Assuming that lexarr is significantly shorter than tsvector
+ * we can iterate through lexarr performing binary search
+ * of each lexeme from lexarr in tsvector.
+ */
+ for (i = 0; i < nlex; i++)
+ {
+ lex = VARDATA(dlexemes[i]);
+ lex_len = VARSIZE_ANY_EXHDR(dlexemes[i]);
+
+ StopLow = 0;
+ StopHigh = out->size;
+ while (StopLow < StopHigh)
+ {
+ StopMiddle = (StopLow + StopHigh)/2;
+ cmp = tsCompareString(lex, lex_len,
+ data + entry[StopMiddle].pos, entry[StopMiddle].len, false);
+
+ if (cmp < 0)
+ StopHigh = StopMiddle;
+ else if (cmp > 0)
+ StopLow = StopMiddle + 1;
+ else /* found it */
+ break;
+ }
+
+ if (StopLow < StopHigh && (j = POSDATALEN(out, entry + StopMiddle)) != 0 )
+ {
+ p = POSDATAPTR(out, entry + StopMiddle);
+ while (j--)
+ {
+ WEP_SETWEIGHT(*p, w);
+ p++;
+ }
+ }
+ }
+
+ PG_FREE_IF_COPY(in, 0);
+ PG_RETURN_POINTER(out);
+ }
+
+ Datum
tsvector_setweight(PG_FUNCTION_ARGS)
{
TSVector in = PG_GETARG_TSVECTOR(0);
*************** tsvector_setweight(PG_FUNCTION_ARGS)
*** 226,232 ****
break;
default:
/* internal error */
! elog(ERROR, "unrecognized weight: %d", cw);
}
out = (TSVector) palloc(VARSIZE(in));
--- 320,326 ----
break;
default:
/* internal error */
! elog(ERROR, "unrecognized weight: %c", cw);
}
out = (TSVector) palloc(VARSIZE(in));
*************** add_pos(TSVector src, WordEntry *srcptr,
*** 291,296 ****
--- 385,719 ----
return *clen - startlen;
}
+ Datum
+ tsvector_delete(PG_FUNCTION_ARGS)
+ {
+ TSVector tsin = PG_GETARG_TSVECTOR(0),
+ tsout;
+ WordEntry *arrin = ARRPTR(tsin),
+ *arrout;
+ char *lexin = VARDATA(PG_GETARG_TEXT_P(1)),
+ *data,
+ *cur;
+ int i,
+ j,
+ StopLow, StopHigh, StopMiddle, cmp,
+ lexin_len = VARSIZE_ANY_EXHDR(PG_GETARG_TEXT_P(1)),
+ shrink_len,
+ skip_index = -1,
+ curoff = 0,
+ len = 0;
+
+ data = STRPTR(tsin);
+
+ /* lexemes are sorted, so we can use binary search */
+ StopLow = 0;
+ StopHigh = tsin->size;
+ while (StopLow < StopHigh)
+ {
+ StopMiddle = (StopLow + StopHigh)/2;
+ cmp = tsCompareString(lexin, lexin_len,
+ data + arrin[StopMiddle].pos, arrin[StopMiddle].len, false);
+
+ if (cmp < 0)
+ StopHigh = StopMiddle;
+ else if (cmp > 0)
+ StopLow = StopMiddle + 1;
+ else /* found it */
+ break;
+ }
+
+ if (StopLow >= StopHigh)
+ PG_RETURN_POINTER(tsin);
+ else
+ skip_index = StopMiddle;
+
+ shrink_len = sizeof(WordEntry) + arrin[skip_index].len;
+
+ if (arrin[skip_index].haspos)
+ shrink_len += sizeof(uint16) +
+ POSDATALEN(tsin, arrin+skip_index) * sizeof(WordEntryPos);
+
+ tsout = (TSVector) palloc0(VARSIZE(tsin) - shrink_len);
+ SET_VARSIZE(tsout, VARSIZE(tsin) - shrink_len);
+ tsout->size = tsin->size - 1;
+ arrout = ARRPTR(tsout);
+
+ cur = STRPTR(tsout);
+ for (i = 0, j = 0; i < tsin->size; i++)
+ {
+ if (i == skip_index)
+ continue;
+
+ memcpy(cur + curoff, data + arrin[i].pos, arrin[i].len);
+ arrout[j].haspos = arrin[i].haspos;
+ arrout[j].len = arrin[i].len;
+ arrout[j].pos = curoff;
+
+ curoff += arrin[i].len;
+
+ if (arrin[i].haspos)
+ {
+ curoff = SHORTALIGN(curoff);
+ len = POSDATALEN(tsin, arrin+i) * sizeof(WordEntryPos) + sizeof(uint16);
+ memcpy(cur + curoff, (STRPTR(tsin) + SHORTALIGN(arrin[i].pos + arrin[i].len)), len);
+ curoff += len;
+ }
+
+ j++;
+ }
+
+ PG_FREE_IF_COPY(tsin, 0);
+ PG_RETURN_POINTER(tsout);
+ }
+
+ Datum
+ tsvector_unnest(PG_FUNCTION_ARGS)
+ {
+ FuncCallContext *funcctx;
+ TSVector tsin = PG_GETARG_TSVECTOR(0);
+ WordEntry *arrin = ARRPTR(tsin);
+ char *data;
+
+ if (SRF_IS_FIRSTCALL())
+ {
+ MemoryContext oldcontext;
+ TupleDesc tupdesc;
+
+ funcctx = SRF_FIRSTCALL_INIT();
+ oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
+
+ tupdesc = CreateTemplateTupleDesc(3, false);
+ TupleDescInitEntry(tupdesc, (AttrNumber) 1, "lexeme",
+ TEXTOID, -1, 0);
+ TupleDescInitEntry(tupdesc, (AttrNumber) 2, "postings",
+ INT2ARRAYOID, -1, 0);
+ TupleDescInitEntry(tupdesc, (AttrNumber) 3, "weights",
+ TEXTARRAYOID, -1, 0);
+
+ funcctx->tuple_desc = BlessTupleDesc(tupdesc);
+ MemoryContextSwitchTo(oldcontext);
+ }
+
+ funcctx = SRF_PERCALL_SETUP();
+
+ data = STRPTR(tsin);
+ if (funcctx->call_cntr < tsin->size)
+ {
+ WordEntryPosVector *posv;
+ HeapTuple tuple;
+ int j,
+ i = funcctx->call_cntr;
+ bool nulls[] = {false, false, false};
+ Datum values[3];
+ Datum *positions;
+ Datum *weights;
+ char weight;
+
+ values[0] = PointerGetDatum(cstring_to_text_with_len(data + arrin[i].pos, arrin[i].len));
+
+ if (arrin[i].haspos)
+ {
+ posv = (WordEntryPosVector *)(STRPTR(tsin) + SHORTALIGN(arrin[i].pos+arrin[i].len));
+
+ positions = palloc(posv->npos * sizeof(Datum));
+ weights = palloc(posv->npos * sizeof(Datum));
+ for (j = 0; j < posv->npos; j++)
+ {
+ positions[j] = Int16GetDatum(WEP_GETPOS(posv->pos[j]));
+ weight = (WEP_GETWEIGHT(posv->pos[j]) >> 2) ?
+ 'D' : 'D' - WEP_GETWEIGHT(posv->pos[j]);
+ weights[j] = PointerGetDatum(cstring_to_text_with_len(&weight, 1));
+ }
+ values[1] = PointerGetDatum(construct_array(positions, posv->npos, INT2OID, 2, true, 's'));
+ values[2] = PointerGetDatum(construct_array(weights, posv->npos, TEXTOID, -1, false, 'i'));
+ }
+ else
+ {
+ values[1] = PointerGetDatum(construct_array(NULL, 0, INT2OID, 2, true, 's'));
+ values[2] = PointerGetDatum(construct_array(NULL, 0, TEXTOID, -1, false, 'i'));
+ }
+
+ tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls);
+ PG_FREE_IF_COPY(tsin, 0);
+ SRF_RETURN_NEXT(funcctx, HeapTupleGetDatum(tuple));
+ }
+ else
+ SRF_RETURN_DONE(funcctx);
+ }
+
+ Datum
+ tsvector_to_array(PG_FUNCTION_ARGS)
+ {
+ TSVector tsin = PG_GETARG_TSVECTOR(0);
+ WordEntry *arrin = ARRPTR(tsin);
+ Datum elements[tsin->size];
+ int i;
+ ArrayType *array;
+
+ for (i = 0; i < tsin->size; i++)
+ {
+ elements[i] = PointerGetDatum(
+ cstring_to_text_with_len(STRPTR(tsin) + arrin[i].pos, arrin[i].len));
+ }
+ array = construct_array(elements, tsin->size, TEXTOID, -1, false, 'i');
+ PG_FREE_IF_COPY(tsin, 0);
+ PG_RETURN_POINTER(array);
+ }
+
+ Datum
+ array_to_tsvector(PG_FUNCTION_ARGS)
+ {
+ ArrayType *v = PG_GETARG_ARRAYTYPE_P(0);
+ TSVector tsout;
+ Datum *dlexemes;
+ WordEntry *arrout;
+ bool *nulls;
+ int nitems,
+ i,
+ tslen,
+ *lexlens,
+ lexlen = 0;
+ char *cur,
+ **lexemes;
+
+ deconstruct_array(v, TEXTOID, -1, false, 'i',
+ &dlexemes, &nulls, &nitems);
+
+ lexemes = (char **) palloc(nitems * sizeof(char *));
+ lexlens = palloc(nitems * sizeof(int));
+
+ for (i = 0; i < nitems; i++)
+ {
+ text *lextext = DatumGetTextP(dlexemes[i]);
+
+ lexemes[i] = VARDATA(lextext);
+ lexlens[i] = VARSIZE(lextext) - VARHDRSZ;
+ lexlen += lexlens[i];
+ }
+
+ tslen = CALCDATASIZE(nitems, lexlen);
+ tsout = (TSVector) palloc0(tslen);
+ SET_VARSIZE(tsout, tslen);
+ tsout->size = nitems;
+ arrout = ARRPTR(tsout);
+ cur = STRPTR(tsout);
+
+ for (i = 0; i < nitems; i++)
+ {
+ memcpy(cur, lexemes[i], lexlens[i]);
+ arrout[i].haspos = 0;
+ arrout[i].len = lexlens[i];
+ arrout[i].pos = cur - STRPTR(tsout);
+ cur += lexlens[i];
+ }
+
+ PG_FREE_IF_COPY(v, 0);
+ PG_RETURN_POINTER(tsout);
+ }
+
+ Datum
+ tsvector_filter(PG_FUNCTION_ARGS)
+ {
+ TSVector tsin = PG_GETARG_TSVECTOR(0);
+ TSVector tsout;
+ ArrayType *weights = PG_GETARG_ARRAYTYPE_P(1);
+ Datum *dweights;
+ bool *nulls;
+ int nweigths,
+ npos;
+ int i, j, k;
+ char cw,
+ mask = 0,
+ cur_pos,
+ *datain,
+ *dataout;
+ WordEntry *arrin,
+ *arrout;
+ WordEntryPosVector *posvin,
+ *posvout;
+ bool lexeme2copy;
+
+ deconstruct_array(weights, CHAROID, 1, true, 'c',
+ &dweights, &nulls, &nweigths);
+
+ for (i = 0; i < nweigths; i++)
+ {
+ cw = DatumGetChar(dweights[i]);
+ switch (cw)
+ {
+ case 'A':
+ case 'a':
+ mask = mask | 8;
+ break;
+ case 'B':
+ case 'b':
+ mask = mask | 4;
+ break;
+ case 'C':
+ case 'c':
+ mask = mask | 2;
+ break;
+ case 'D':
+ case 'd':
+ mask = mask | 1;
+ break;
+ default:
+ /* internal error */
+ elog(ERROR, "unrecognized weight: %c", cw);
+ }
+ }
+
+ arrin = ARRPTR(tsin);
+ datain = STRPTR(tsin);
+
+ tsout = (TSVector) palloc0(VARSIZE(tsin));
+ tsout->size = tsin->size;
+
+ arrout = ARRPTR(tsout);
+ dataout = STRPTR(tsout);
+
+ cur_pos = 0;
+ for (i = 0, k = 0; i < tsin->size; i++)
+ {
+ if (!arrin[i].haspos)
+ continue;
+
+ npos = 0;
+ lexeme2copy = false;
+ posvin = (WordEntryPosVector *)(datain + SHORTALIGN(arrin[i].pos+arrin[i].len));
+ posvout = (WordEntryPosVector *)(dataout + SHORTALIGN(cur_pos + arrin[i].len));
+
+ for (j = 0; j < posvin->npos; j++)
+ {
+ if (mask & (1 << WEP_GETWEIGHT(posvin->pos[j])))
+ posvout->pos[npos++] = posvin->pos[j];
+ }
+
+ if (!npos) /* no satisfactory postings found, so skip that lexeme */
+ continue;
+
+ arrout[k].haspos = true;
+ arrout[k].len = arrin[i].len;
+ arrout[k].pos = cur_pos;
+
+ memcpy(dataout + cur_pos, datain + arrin[i].pos, arrin[i].len);
+ posvout->npos = npos;
+
+ cur_pos += SHORTALIGN(arrin[i].len);
+ cur_pos += POSDATALEN(tsout, arrout+k) * sizeof(WordEntryPos) + sizeof(uint16);
+ k++;
+ }
+
+ tsout->size = k;
+ if (dataout != STRPTR(tsout))
+ memmove(STRPTR(tsout), dataout, cur_pos);
+
+ SET_VARSIZE(tsout, CALCDATASIZE(tsout->size, cur_pos));
+
+ PG_FREE_IF_COPY(tsin, 0);
+ PG_RETURN_POINTER(tsout);
+ }
Datum
tsvector_concat(PG_FUNCTION_ARGS)
diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h
new file mode 100644
index d8640db..28df115
*** a/src/include/catalog/pg_proc.h
--- b/src/include/catalog/pg_proc.h
*************** DATA(insert OID = 3623 ( strip PGNS
*** 4576,4582 ****
DESCR("strip position information");
DATA(insert OID = 3624 ( setweight PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 3614 "3614 18" _null_ _null_ _null_ _null_ _null_ tsvector_setweight _null_ _null_ _null_ ));
DESCR("set weight of lexeme's entries");
! DATA(insert OID = 3625 ( tsvector_concat PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 3614 "3614 3614" _null_ _null_ _null_ _null_ _null_ tsvector_concat _null_ _null_ _null_ ));
DATA(insert OID = 3634 ( ts_match_vq PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 16 "3614 3615" _null_ _null_ _null_ _null_ _null_ ts_match_vq _null_ _null_ _null_ ));
DATA(insert OID = 3635 ( ts_match_qv PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 16 "3615 3614" _null_ _null_ _null_ _null_ _null_ ts_match_qv _null_ _null_ _null_ ));
--- 4576,4594 ----
DESCR("strip position information");
DATA(insert OID = 3624 ( setweight PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 3614 "3614 18" _null_ _null_ _null_ _null_ _null_ tsvector_setweight _null_ _null_ _null_ ));
DESCR("set weight of lexeme's entries");
! DATA(insert OID = 3320 ( setweight PGNSP PGUID 12 1 0 0 0 f f f f t f i s 3 0 3614 "3614 18 1009" _null_ _null_ _null_ _null_ _null_ tsvector_setweight_by_filter _null_ _null_ _null_ ));
! DESCR("set weight of lexeme's entries");
! DATA(insert OID = 3625 ( tsvector_concat PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 3614 "3614 3614" _null_ _null_ _null_ _null_ _null_ tsvector_concat _null_ _null_ _null_ ));
! DATA(insert OID = 3315 ( delete PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 3614 "3614 25" _null_ _null_ _null_ _null_ _null_ tsvector_delete _null_ _null_ _null_ ));
! DESCR("delete lexeme");
! DATA(insert OID = 3316 ( unnest PGNSP PGUID 12 1 10 0 0 f f f f t t i s 1 0 2249 "3614" "{3614,25,1005,1009}" "{i,o,o,o}" "{tsvector,lexeme,postings,weights}" _null_ _null_ tsvector_unnest _null_ _null_ _null_ ));
! DESCR("expand tsvector to set of rows");
! DATA(insert OID = 3317 ( to_array PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 1009 "3614" _null_ _null_ _null_ _null_ _null_ tsvector_to_array _null_ _null_ _null_ ));
! DESCR("convert to lexeme's array");
! DATA(insert OID = 3318 ( to_tsvector PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 3614 "1009" _null_ _null_ _null_ _null_ _null_ array_to_tsvector _null_ _null_ _null_ ));
! DESCR("build tsvector from lexeme's array");
! DATA(insert OID = 3319 ( filter PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 3614 "3614 1002" _null_ _null_ _null_ _null_ _null_ tsvector_filter _null_ _null_ _null_ ));
! DESCR("returns tsvector that contain only postings with given weights");
DATA(insert OID = 3634 ( ts_match_vq PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 16 "3614 3615" _null_ _null_ _null_ _null_ _null_ ts_match_vq _null_ _null_ _null_ ));
DATA(insert OID = 3635 ( ts_match_qv PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 16 "3615 3614" _null_ _null_ _null_ _null_ _null_ ts_match_qv _null_ _null_ _null_ ));
diff --git a/src/include/tsearch/ts_type.h b/src/include/tsearch/ts_type.h
new file mode 100644
index 281cdd6..78d3d0d
*** a/src/include/tsearch/ts_type.h
--- b/src/include/tsearch/ts_type.h
*************** extern Datum tsvector_cmp(PG_FUNCTION_AR
*** 141,147 ****
--- 141,154 ----
extern Datum tsvector_length(PG_FUNCTION_ARGS);
extern Datum tsvector_strip(PG_FUNCTION_ARGS);
extern Datum tsvector_setweight(PG_FUNCTION_ARGS);
+ extern Datum tsvector_setweight_by_filter(PG_FUNCTION_ARGS);
extern Datum tsvector_concat(PG_FUNCTION_ARGS);
+ extern Datum tsvector_delete(PG_FUNCTION_ARGS);
+ extern Datum tsvector_unnest(PG_FUNCTION_ARGS);
+ extern Datum tsvector_to_array(PG_FUNCTION_ARGS);
+ extern Datum array_to_tsvector(PG_FUNCTION_ARGS);
+ extern Datum tsvector_filter(PG_FUNCTION_ARGS);
+
extern Datum tsvector_update_trigger_byid(PG_FUNCTION_ARGS);
extern Datum tsvector_update_trigger_bycolumn(PG_FUNCTION_ARGS);
diff --git a/src/test/regress/expected/tstypes.out b/src/test/regress/expected/tstypes.out
new file mode 100644
index 6284fb6..a02a56b
*** a/src/test/regress/expected/tstypes.out
--- b/src/test/regress/expected/tstypes.out
*************** SELECT 'a:3A b:2a'::tsvector || 'ba:1234
*** 83,100 ****
'a':3A,4B 'b':2A 'ba':1237
(1 row)
- SELECT setweight('w:12B w:13* w:12,5,6 a:1,3* a:3 w asd:1dc asd zxc:81,567,222A'::tsvector, 'c');
- setweight
- ----------------------------------------------------------
- 'a':1C,3C 'asd':1C 'w':5C,6C,12C,13C 'zxc':81C,222C,567C
- (1 row)
-
- SELECT strip('w:12B w:13* w:12,5,6 a:1,3* a:3 w asd:1dc asd'::tsvector);
- strip
- ---------------
- 'a' 'asd' 'w'
- (1 row)
-
--Base tsquery test
SELECT '1'::tsquery;
tsquery
--- 83,88 ----
*************** SELECT ts_rank_cd(' a:1 s:2 d g'::tsvect
*** 625,627 ****
--- 613,774 ----
0.1
(1 row)
+ -- tsvector editing operations
+ SELECT strip('w:12B w:13* w:12,5,6 a:1,3* a:3 w asd:1dc asd'::tsvector);
+ strip
+ ---------------
+ 'a' 'asd' 'w'
+ (1 row)
+
+ SELECT strip('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);
+ strip
+ ----------------------------------------------
+ 'base' 'hidden' 'rebel' 'spaceship' 'strike'
+ (1 row)
+
+ SELECT strip('base hidden rebel spaceship strike'::tsvector);
+ strip
+ ----------------------------------------------
+ 'base' 'hidden' 'rebel' 'spaceship' 'strike'
+ (1 row)
+
+ SELECT delete(to_tsvector('english', 'Rebel spaceships, striking from a hidden base'), 'spaceship');
+ delete
+ ------------------------------------------
+ 'base':7 'hidden':6 'rebel':1 'strike':3
+ (1 row)
+
+ SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, 'base');
+ delete
+ --------------------------------------------------------------
+ 'hidden':6 'rebel':1 'spaceship':2,33A,34B,35C,36 'strike':3
+ (1 row)
+
+ SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, 'spaceship');
+ delete
+ ------------------------------------------
+ 'base':7 'hidden':6 'rebel':1 'strike':3
+ (1 row)
+
+ SELECT delete('base hidden rebel spaceship strike'::tsvector, 'spaceship');
+ delete
+ ----------------------------------
+ 'base' 'hidden' 'rebel' 'strike'
+ (1 row)
+
+ SELECT unnest('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);
+ unnest
+ ---------------------------------------------
+ (base,{7},{D})
+ (hidden,{6},{D})
+ (rebel,{1},{D})
+ (spaceship,"{2,33,34,35,36}","{D,A,B,C,D}")
+ (strike,{3},{D})
+ (5 rows)
+
+ SELECT unnest('base hidden rebel spaceship strike'::tsvector);
+ unnest
+ -------------------
+ (base,{},{})
+ (hidden,{},{})
+ (rebel,{},{})
+ (spaceship,{},{})
+ (strike,{},{})
+ (5 rows)
+
+ SELECT * FROM unnest('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);
+ lexeme | postings | weights
+ -----------+-----------------+-------------
+ base | {7} | {D}
+ hidden | {6} | {D}
+ rebel | {1} | {D}
+ spaceship | {2,33,34,35,36} | {D,A,B,C,D}
+ strike | {3} | {D}
+ (5 rows)
+
+ SELECT * FROM unnest('base hidden rebel spaceship strike'::tsvector);
+ lexeme | postings | weights
+ -----------+----------+---------
+ base | {} | {}
+ hidden | {} | {}
+ rebel | {} | {}
+ spaceship | {} | {}
+ strike | {} | {}
+ (5 rows)
+
+ SELECT lexeme, postings[1] from unnest('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);
+ lexeme | postings
+ -----------+----------
+ base | 7
+ hidden | 6
+ rebel | 1
+ spaceship | 2
+ strike | 3
+ (5 rows)
+
+ SELECT to_array('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);
+ to_array
+ --------------------------------------
+ {base,hidden,rebel,spaceship,strike}
+ (1 row)
+
+ SELECT to_array('base hidden rebel spaceship strike'::tsvector);
+ to_array
+ --------------------------------------
+ {base,hidden,rebel,spaceship,strike}
+ (1 row)
+
+ SELECT to_tsvector(ARRAY['base','hidden','rebel','spaceship','strike']);
+ to_tsvector
+ ----------------------------------------------
+ 'base' 'hidden' 'rebel' 'spaceship' 'strike'
+ (1 row)
+
+ SELECT setweight('w:12B w:13* w:12,5,6 a:1,3* a:3 w asd:1dc asd zxc:81,567,222A'::tsvector, 'c');
+ setweight
+ ----------------------------------------------------------
+ 'a':1C,3C 'asd':1C 'w':5C,6C,12C,13C 'zxc':81C,222C,567C
+ (1 row)
+
+ SELECT setweight('a:1,3A asd:1C w:5,6,12B,13A zxc:81,222A,567'::tsvector, 'c');
+ setweight
+ ----------------------------------------------------------
+ 'a':1C,3C 'asd':1C 'w':5C,6C,12C,13C 'zxc':81C,222C,567C
+ (1 row)
+
+ SELECT setweight('a:1,3A asd:1C w:5,6,12B,13A zxc:81,222A,567'::tsvector, 'c', '{a}');
+ setweight
+ ------------------------------------------------------
+ 'a':1C,3C 'asd':1C 'w':5,6,12B,13A 'zxc':81,222A,567
+ (1 row)
+
+ SELECT setweight('a:1,3A asd:1C w:5,6,12B,13A zxc:81,222A,567'::tsvector, 'c', '{a}');
+ setweight
+ ------------------------------------------------------
+ 'a':1C,3C 'asd':1C 'w':5,6,12B,13A 'zxc':81,222A,567
+ (1 row)
+
+ SELECT setweight('a:1,3A asd:1C w:5,6,12B,13A zxc:81,222A,567'::tsvector, 'c', '{a,zxc}');
+ setweight
+ --------------------------------------------------------
+ 'a':1C,3C 'asd':1C 'w':5,6,12B,13A 'zxc':81C,222C,567C
+ (1 row)
+
+ SELECT setweight('a asd w:5,6,12B,13A zxc'::tsvector, 'c', '{a,zxc}');
+ setweight
+ ---------------------------------
+ 'a' 'asd' 'w':5,6,12B,13A 'zxc'
+ (1 row)
+
+ SELECT filter('base:7A empir:17 evil:15 first:11 galact:16 hidden:6A rebel:1A spaceship:2A strike:3A victori:12 won:9'::tsvector, '{a}');
+ filter
+ -------------------------------------------------------------
+ 'base':7A 'hidden':6A 'rebel':1A 'spaceship':2A 'strike':3A
+ (1 row)
+
+ SELECT filter('base hidden rebel spaceship strike'::tsvector, '{a}');
+ filter
+ --------
+
+ (1 row)
+
diff --git a/src/test/regress/sql/tstypes.sql b/src/test/regress/sql/tstypes.sql
new file mode 100644
index fd7c702..2288bbd
*** a/src/test/regress/sql/tstypes.sql
--- b/src/test/regress/sql/tstypes.sql
*************** SELECT $$'\\as' ab\c ab\\c AB\\\c ab\\\\
*** 14,21 ****
SELECT tsvectorin(tsvectorout($$'\\as' ab\c ab\\c AB\\\c ab\\\\c$$::tsvector));
SELECT '''w'':4A,3B,2C,1D,5 a:8';
SELECT 'a:3A b:2a'::tsvector || 'ba:1234 a:1B';
- SELECT setweight('w:12B w:13* w:12,5,6 a:1,3* a:3 w asd:1dc asd zxc:81,567,222A'::tsvector, 'c');
- SELECT strip('w:12B w:13* w:12,5,6 a:1,3* a:3 w asd:1dc asd'::tsvector);
--Base tsquery test
SELECT '1'::tsquery;
--- 14,19 ----
*************** SELECT ts_rank_cd(' a:1 s:2 d g'::tsvect
*** 115,117 ****
--- 113,148 ----
SELECT ts_rank_cd(' a:1 s:2C d g'::tsvector, 'a & s');
SELECT ts_rank_cd(' a:1 s:2B d g'::tsvector, 'a & s');
SELECT ts_rank_cd(' a:1 s:2 d g'::tsvector, 'a & s');
+
+ -- tsvector editing operations
+
+ SELECT strip('w:12B w:13* w:12,5,6 a:1,3* a:3 w asd:1dc asd'::tsvector);
+ SELECT strip('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);
+ SELECT strip('base hidden rebel spaceship strike'::tsvector);
+
+ SELECT delete(to_tsvector('english', 'Rebel spaceships, striking from a hidden base'), 'spaceship');
+ SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, 'base');
+ SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, 'spaceship');
+ SELECT delete('base hidden rebel spaceship strike'::tsvector, 'spaceship');
+
+ SELECT unnest('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);
+ SELECT unnest('base hidden rebel spaceship strike'::tsvector);
+ SELECT * FROM unnest('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);
+ SELECT * FROM unnest('base hidden rebel spaceship strike'::tsvector);
+ SELECT lexeme, postings[1] from unnest('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);
+
+ SELECT to_array('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);
+ SELECT to_array('base hidden rebel spaceship strike'::tsvector);
+
+ SELECT to_tsvector(ARRAY['base','hidden','rebel','spaceship','strike']);
+
+ SELECT setweight('w:12B w:13* w:12,5,6 a:1,3* a:3 w asd:1dc asd zxc:81,567,222A'::tsvector, 'c');
+ SELECT setweight('a:1,3A asd:1C w:5,6,12B,13A zxc:81,222A,567'::tsvector, 'c');
+ SELECT setweight('a:1,3A asd:1C w:5,6,12B,13A zxc:81,222A,567'::tsvector, 'c', '{a}');
+ SELECT setweight('a:1,3A asd:1C w:5,6,12B,13A zxc:81,222A,567'::tsvector, 'c', '{a}');
+ SELECT setweight('a:1,3A asd:1C w:5,6,12B,13A zxc:81,222A,567'::tsvector, 'c', '{a,zxc}');
+ SELECT setweight('a asd w:5,6,12B,13A zxc'::tsvector, 'c', '{a,zxc}');
+
+ SELECT filter('base:7A empir:17 evil:15 first:11 galact:16 hidden:6A rebel:1A spaceship:2A strike:3A victori:12 won:9'::tsvector, '{a}');
+ SELECT filter('base hidden rebel spaceship strike'::tsvector, '{a}');
+
Hi,
On 12/07/2015 03:05 PM, Stas Kelvich wrote:
Hello.
Done with the list of suggestions. Also heavily edit delete function.
I did a quick review of the updated patch. I'm not a tsvector-expert so
hopefully my comments won't be entirely bogus.
1) It's a bit difficult to judge the usefulness of the API, as I've
always been a mere user of full-text search, and I never had a need
(or courage) to mess with the tsvectors. OTOH I don't see a good
reason no to have such API, when there's a need for it.
The API seems to be reasonably complete, with one exception - when
looking at editing function we have for 'hstore', we do have these
variants for delete()
delete(hstore,text)
delete(hstore,text[])
delete(hstore,hstore)
while this patch only adds delete(tsvector,text). Would it make
sense to add variants similar to hstore? It probably does not make
much sense to add delete(tsvector,tsvector), right? But being able
to delete a bunch of lexemes in one go seems like a good thing.
What do you think?
2) tsvector_op.c needs a bit of love, to eliminate the two warnings it
currently triggers:
tsvector_op.c:211:2: warning: ISO C90 forbids mixed ...
tsvector_op.c:635:9: warning: variable ‘lexeme2copy’ set but ...
3) the patch also touches tsvector_setweight(), only to do change:
elog(ERROR, "unrecognized weight: %d", cw);
to
elog(ERROR, "unrecognized weight: %c", cw);
That should probably go get committed separately, as a bugfix.
4) I find it rather annoying that there are pretty much no comments in
the code. Granted, there are pretty much no comments in the
surrounding code, but I doubt that's a good reason for not having
any comments in new code. It makes reviews unnecessarily difficult.
5) tsvector_concat() is not mentioned in docs at all
6) Docs don't mention names of the new parameters in function
signatures, just data types. The functions with a single parameter
probably don't need to do that, but multi-parameter ones should.
7) Some of the functions use intexterm that does not match the function
name. I see two such cases - to_tsvector and setweight. Is there a
reason for that?
regards
--
Tomas Vondra http://www.2ndQuadrant.com
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
On Tue, Dec 15, 2015 at 12:07 PM, Tomas Vondra
<tomas.vondra@2ndquadrant.com> wrote:
Hi,
On 12/07/2015 03:05 PM, Stas Kelvich wrote:
Hello.
Done with the list of suggestions. Also heavily edit delete function.
I did a quick review of the updated patch. I'm not a tsvector-expert so
hopefully my comments won't be entirely bogus.1) It's a bit difficult to judge the usefulness of the API, as I've
always been a mere user of full-text search, and I never had a need
(or courage) to mess with the tsvectors. OTOH I don't see a good
reason no to have such API, when there's a need for it.The API seems to be reasonably complete, with one exception - when
looking at editing function we have for 'hstore', we do have these
variants for delete()delete(hstore,text)
delete(hstore,text[])
delete(hstore,hstore)while this patch only adds delete(tsvector,text). Would it make
sense to add variants similar to hstore? It probably does not make
much sense to add delete(tsvector,tsvector), right? But being able
to delete a bunch of lexemes in one go seems like a good thing.What do you think?
2) tsvector_op.c needs a bit of love, to eliminate the two warnings it
currently triggers:tsvector_op.c:211:2: warning: ISO C90 forbids mixed ...
tsvector_op.c:635:9: warning: variable ‘lexeme2copy’ set but ...3) the patch also touches tsvector_setweight(), only to do change:
elog(ERROR, "unrecognized weight: %d", cw);
to
elog(ERROR, "unrecognized weight: %c", cw);
That should probably go get committed separately, as a bugfix.
4) I find it rather annoying that there are pretty much no comments in
the code. Granted, there are pretty much no comments in the
surrounding code, but I doubt that's a good reason for not having
any comments in new code. It makes reviews unnecessarily difficult.5) tsvector_concat() is not mentioned in docs at all
6) Docs don't mention names of the new parameters in function
signatures, just data types. The functions with a single parameter
probably don't need to do that, but multi-parameter ones should.7) Some of the functions use intexterm that does not match the function
name. I see two such cases - to_tsvector and setweight. Is there a
reason for that?
I have marked this patch as returned with feedback based on the
presence of a review and a lack of replies from the author. Stas, if
you are still working on the patch, please feel free to move it to the
next commit fest.
--
Michael
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
Hi, Tomáš! Thanks for comprehensive review.
On 15 Dec 2015, at 06:07, Tomas Vondra <tomas.vondra@2ndquadrant.com> wrote:
1) It's a bit difficult to judge the usefulness of the API, as I've
always been a mere user of full-text search, and I never had a need
(or courage) to mess with the tsvectors. OTOH I don't see a good
reason no to have such API, when there's a need for it.The API seems to be reasonably complete, with one exception - when
looking at editing function we have for 'hstore', we do have these
variants for delete()delete(hstore,text)
delete(hstore,text[])
delete(hstore,hstore)while this patch only adds delete(tsvector,text). Would it make
sense to add variants similar to hstore? It probably does not make
much sense to add delete(tsvector,tsvector), right? But being able
to delete a bunch of lexemes in one go seems like a good thing.What do you think?
That’s a good idea and actually deleting tsvector from tsvector makes perfect sense. In delete function I used exact string match between string and lexemes in tsvector, but if somebody wants to delete for example “Cats” from tsvector, then he should downcase and singularize this word. Easiest way to do it is to just use to_tsvector() function. Also we can use this function to delete specific positions: like delete('cat:3 fat:2,4'::tsvector, 'fat:2'::tsvector) -> 'cat:3 fat:4'::tsvector.
So in attached patch I’ve implemented following:
delete(tsin tsvector, lexarrtext[]) — remove any occurence of lexemes inlexarr from tsin
delete(tsin tsvector, tsv_filter tsvector) — Delete lexemes and/or positions of tsv_filter from tsin. When lexeme in tsv_filter has no positions function will delete any occurrence of same lexeme in tsin. When tsv_filter lexeme have positions function will delete them from positions of matching lexeme in tsin. If after such removal resulting positions set is empty then function will delete that lexeme from resulting tsvector.
Also if we want some level of completeness of API and taking into account that concat() function shift positions on second argument I thought that it can be useful to also add function that can shift all positions of specific value. This helps to undo concatenation: delete one of concatenating tsvectors and then shift positions in resulting tsvector. So I also wrote one another small function:
shift(tsin tsvector,offset int16) — Shift all positions in tsin by given offset
2) tsvector_op.c needs a bit of love, to eliminate the two warnings it
currently triggers:tsvector_op.c:211:2: warning: ISO C90 forbids mixed ...
tsvector_op.c:635:9: warning: variable ‘lexeme2copy’ set but …
fixed
3) the patch also touches tsvector_setweight(), only to do change:
elog(ERROR, "unrecognized weight: %d", cw);
to
elog(ERROR, "unrecognized weight: %c", cw);
That should probably go get committed separately, as a bugfix.
Okay, i’ll submit that as a separate patch.
4) I find it rather annoying that there are pretty much no comments in
the code. Granted, there are pretty much no comments in the
surrounding code, but I doubt that's a good reason for not having
any comments in new code. It makes reviews unnecessarily difficult.
Fixed, I think.
5) tsvector_concat() is not mentioned in docs at all
Concat mentioned in docs as an operator ||.
6) Docs don't mention names of the new parameters in function
signatures, just data types. The functions with a single parameter
probably don't need to do that, but multi-parameter ones should.
Fixed.
7) Some of the functions use intexterm that does not match the function
name. I see two such cases - to_tsvector and setweight. Is there a
reason for that?
Because sgml compiler wants unique indexterm. Both functions that you mentioned use overloading of arguments and have non-unique name.
Attachments:
tsvector_ops.diffapplication/octet-stream; name=tsvector_ops.diffDownload
diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index e08bf60..fe93058 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -9086,16 +9086,28 @@ CREATE TYPE rainbow AS ENUM ('red', 'orange', 'yellow', 'green', 'blue', 'purple
<indexterm>
<primary>setweight</primary>
</indexterm>
- <literal><function>setweight(<type>tsvector</>, <type>"char"</>)</function></literal>
+ <literal><function>setweight(<replaceable class="PARAMETER">tsin</replaceable> <type>tsvector</>, <replaceable class="PARAMETER">w</replaceable> <type>"char"</>)</function></literal>
</entry>
<entry><type>tsvector</type></entry>
- <entry>assign weight to each element of <type>tsvector</></entry>
+ <entry>assign weight <replaceable class="PARAMETER">w</replaceable> to each element of <replaceable class="PARAMETER">tsin</replaceable></entry>
<entry><literal>setweight('fat:2,4 cat:3 rat:5B'::tsvector, 'A')</literal></entry>
<entry><literal>'cat':3A 'fat':2A,4A 'rat':5A</literal></entry>
</row>
<row>
<entry>
<indexterm>
+ <primary>setweight_by_filter</primary>
+ </indexterm>
+ <literal><function>setweight(<replaceable class="PARAMETER">tsin</replaceable> <type>tsvector</>, <replaceable class="PARAMETER">w</replaceable> <type>"char"</>, <replaceable class="PARAMETER">lexarr</replaceable> <type>"text"[]</>)</function></literal>
+ </entry>
+ <entry><type>tsvector</type></entry>
+ <entry>assign weight <replaceable class="PARAMETER">w</replaceable> to elements of <replaceable class="PARAMETER">tsin</replaceable> that are listed in lexemes array <replaceable class="PARAMETER">lexarr</replaceable></entry>
+ <entry><literal>setweight('fat:2,4 cat:3 rat:5B'::tsvector, 'A', '{cat,rat}')</literal></entry>
+ <entry><literal>'cat':3A 'fat':2,4 'rat':5A</literal></entry>
+ </row>
+ <row>
+ <entry>
+ <indexterm>
<primary>strip</primary>
</indexterm>
<literal><function>strip(<type>tsvector</>)</function></literal>
@@ -9108,6 +9120,108 @@ CREATE TYPE rainbow AS ENUM ('red', 'orange', 'yellow', 'green', 'blue', 'purple
<row>
<entry>
<indexterm>
+ <primary>delete_str</primary>
+ </indexterm>
+ <literal><function>delete(<replaceable class="PARAMETER">tsin</replaceable> <type>tsvector</>, <replaceable class="PARAMETER">lexeme</replaceable> <type>text</>)</function></literal>
+ </entry>
+ <entry><type>tsvector</type></entry>
+ <entry>remove given <replaceable class="PARAMETER">lexeme</replaceable> from <replaceable class="PARAMETER">tsin</replaceable></entry>
+ <entry><literal>delete('fat:2,4 cat:3 rat:5A'::tsvector, 'fat')</literal></entry>
+ <entry><literal>'cat':3 'rat':5A</literal></entry>
+ </row>
+ <row>
+ <entry>
+ <indexterm>
+ <primary>delete_arr</primary>
+ </indexterm>
+ <literal><function>delete(<replaceable class="PARAMETER">tsin</replaceable> <type>tsvector</>, <replaceable class="PARAMETER">lexarr</replaceable> <type>text[]</>)</function></literal>
+ </entry>
+ <entry><type>tsvector</type></entry>
+ <entry>remove any occurrence of lexemes in <replaceable class="PARAMETER">lexarr</replaceable> from <replaceable class="PARAMETER">tsin</replaceable></entry>
+ <entry><literal>delete('fat:2,4 cat:3 rat:5A'::tsvector, ARRAY['fat','rat'])</literal></entry>
+ <entry><literal>'cat':3</literal></entry>
+ </row>
+ <row>
+ <entry>
+ <indexterm>
+ <primary>delete_tsvector</primary>
+ </indexterm>
+ <literal><function>delete(<replaceable class="PARAMETER">tsin</replaceable> <type>tsvector</>, <replaceable class="PARAMETER">tsv_filter</replaceable> <type>tsvector</>)</function></literal>
+ </entry>
+ <entry><type>tsvector</type></entry>
+ <entry>
+ Delete lexemes and/or positions of <replaceable class="PARAMETER">tsv_filter</replaceable> from <replaceable class="PARAMETER">tsin</replaceable>.
+ When lexeme in <replaceable class="PARAMETER">tsv_filter</replaceable> has no positions function will delete any occurence of same lexeme in <replaceable class="PARAMETER">tsin</replaceable>. When <replaceable class="PARAMETER">tsv_filter</replaceable> lexeme have positions function will delete them from positions of matching lexeme in <replaceable class="PARAMETER">tsin</replaceable>. If after such removal resulting positions set is empty then function will delete that lexeme from resulting tsvector.
+ </entry>
+ <entry><literal>delete('fat:2,4 cat:3 rat:5A'::tsvector, 'fat:2,6 rat'::tsvector)</literal></entry>
+ <entry><literal>'cat':3 'fat':4</literal></entry>
+ </row>
+ <row>
+ <entry>
+ <indexterm>
+ <primary>unnest</primary>
+ </indexterm>
+ <literal><function>unnest(<type>tsvector</>)</function></literal>
+ </entry>
+ <entry><type>setof anyelement</type></entry>
+ <entry>expand a tsvector to a set of rows. Each row has following columns: lexeme, postings, weights.</entry>
+ <entry><literal>unnest('fat:2,4 cat:3 rat:5A'::tsvector)</literal></entry>
+ <entry><literallayout class="monospaced">cat {3} {A}
+fat {2,4} {D,D}
+rat {5} {A}
+(3 rows)</literallayout></entry>
+ </row>
+ <row>
+ <entry>
+ <indexterm>
+ <primary>to_array</primary>
+ </indexterm>
+ <literal><function>to_array(<type>tsvector</>)</function></literal>
+ </entry>
+ <entry><type>text[]</type></entry>
+ <entry>convert <type>tsvector</> to array of lexemes</entry>
+ <entry><literal>to_array('fat:2,4 cat:3 rat:5A'::tsvector)</literal></entry>
+ <entry><literal>{cat,fat,rat}</literal></entry>
+ </row>
+ <row>
+ <entry>
+ <indexterm>
+ <primary>array_to_tsvector</primary>
+ </indexterm>
+ <literal><function>to_tsvector(<type>text[]</>)</function></literal>
+ </entry>
+ <entry><type>tsvector</type></entry>
+ <entry>convert array of lexemes to <type>tsvector</type></entry>
+ <entry><literal>to_tsvector('{fat,cat,rat}'::text[])</literal></entry>
+ <entry><literal>'fat' 'cat' 'rat'</literal></entry>
+ </row>
+ <row>
+ <entry>
+ <indexterm>
+ <primary>filter</primary>
+ </indexterm>
+ <literal><function>filter(<replaceable class="PARAMETER">tsin</replaceable> <type>tsvector</>, <replaceable class="PARAMETER">weights</replaceable> <type>"char"[]</>)</function></literal>
+ </entry>
+ <entry><type>tsvector</type></entry>
+ <entry>Select only elements with given <replaceable class="PARAMETER">weights</replaceable> from <replaceable class="PARAMETER">tsin</replaceable></entry>
+ <entry><literal>filter('fat:2,4 cat:3b rat:5A'::tsvector, '{a,b}')</literal></entry>
+ <entry><literal>'cat':3B 'rat':5A</literal></entry>
+ </row>
+ <row>
+ <entry>
+ <indexterm>
+ <primary>shift</primary>
+ </indexterm>
+ <literal><function>shift(<replaceable class="PARAMETER">tsin</replaceable> <type>tsvector</>, <replaceable class="PARAMETER">offset</replaceable> <type>int16</>)</function></literal>
+ </entry>
+ <entry><type>tsvector</type></entry>
+ <entry>Shift all positions in <replaceable class="PARAMETER">tsin</replaceable> by given <replaceable class="PARAMETER">offset</replaceable></entry>
+ <entry><literal>shift('fat:2,4 cat:3b rat:5A'::tsvector, 10)</literal></entry>
+ <entry><literal>'cat':13B 'fat':12,14 'rat':15A</literal></entry>
+ </row>
+ <row>
+ <entry>
+ <indexterm>
<primary>to_tsquery</primary>
</indexterm>
<literal><function>to_tsquery(<optional> <replaceable class="PARAMETER">config</> <type>regconfig</> , </optional> <replaceable class="PARAMETER">query</> <type>text</type>)</function></literal>
diff --git a/doc/src/sgml/textsearch.sgml b/doc/src/sgml/textsearch.sgml
index d66b4d5..32033fa 100644
--- a/doc/src/sgml/textsearch.sgml
+++ b/doc/src/sgml/textsearch.sgml
@@ -1326,6 +1326,10 @@ FROM (SELECT id, body, q, ts_rank_cd(ti, q) AS rank
</variablelist>
+ <para>
+ Full list of <type>tsvector</>-related functions available in <xref linkend="textsearch-functions-table">.
+ </para>
+
</sect2>
<sect2 id="textsearch-manipulate-tsquery">
diff --git a/src/backend/utils/adt/tsvector_op.c b/src/backend/utils/adt/tsvector_op.c
index e822ba8..adc2128 100644
--- a/src/backend/utils/adt/tsvector_op.c
+++ b/src/backend/utils/adt/tsvector_op.c
@@ -14,6 +14,7 @@
#include "postgres.h"
+#include "access/htup_details.h"
#include "catalog/namespace.h"
#include "catalog/pg_type.h"
#include "commands/trigger.h"
@@ -65,6 +66,7 @@ typedef struct
#define STATHDRSIZE (offsetof(TSVectorStat, data))
static Datum tsvector_update_trigger(PG_FUNCTION_ARGS, bool config_column);
+static int tsvector_bsearch(TSVector tsin, char *lexin, int lexin_len);
/*
* Order: haspos, len, word, for all positions (pos, weight)
@@ -194,6 +196,83 @@ tsvector_length(PG_FUNCTION_ARGS)
PG_RETURN_INT32(ret);
}
+/*
+ * setweight(tsin tsvector, w "char", lexarr "text"[])
+ *
+ * Assign weight w to elements of tsin that are listed in lexarr.
+ */
+Datum
+tsvector_setweight_by_filter(PG_FUNCTION_ARGS)
+{
+ TSVector in = PG_GETARG_TSVECTOR(0);
+ char cw = PG_GETARG_CHAR(1);
+ ArrayType *lexarr = NULL;
+ TSVector out;
+ int i,
+ j,
+ nlex = 0,
+ lex_len,
+ w = 0,
+ lex_pos;
+ WordEntry *entry;
+ WordEntryPos *p;
+ Datum *dlexemes;
+ bool *nulls;
+ char *lex;
+
+ switch (cw)
+ {
+ case 'A': case 'a':
+ w = 3;
+ break;
+ case 'B': case 'b':
+ w = 2;
+ break;
+ case 'C': case 'c':
+ w = 1;
+ break;
+ case 'D': case 'd':
+ w = 0;
+ break;
+ default:
+ /* internal error */
+ elog(ERROR, "unrecognized weight: %d", cw);
+ }
+
+ out = (TSVector) palloc(VARSIZE(in));
+ memcpy(out, in, VARSIZE(in));
+ entry = ARRPTR(out);
+
+ lexarr = PG_GETARG_ARRAYTYPE_P(2);
+ deconstruct_array(lexarr, TEXTOID, -1, false, 'i',
+ &dlexemes, &nulls, &nlex);
+
+ /*
+ * Assuming that lexarr is significantly shorter than tsvector
+ * we can iterate through lexarr performing binary search
+ * of each lexeme from lexarr in tsvector.
+ */
+ for (i = 0; i < nlex; i++)
+ {
+ lex = VARDATA(dlexemes[i]);
+ lex_len = VARSIZE_ANY_EXHDR(dlexemes[i]);
+ lex_pos = tsvector_bsearch(out, lex, lex_len);
+
+ if (lex_pos >= 0 && (j = POSDATALEN(out, entry + lex_pos)) != 0 )
+ {
+ p = POSDATAPTR(out, entry + lex_pos);
+ while (j--)
+ {
+ WEP_SETWEIGHT(*p, w);
+ p++;
+ }
+ }
+ }
+
+ PG_FREE_IF_COPY(in, 0);
+ PG_RETURN_POINTER(out);
+}
+
Datum
tsvector_setweight(PG_FUNCTION_ARGS)
{
@@ -291,6 +370,635 @@ add_pos(TSVector src, WordEntry *srcptr,
return *clen - startlen;
}
+/*
+ * Perform binary search of given lexeme in TSVector.
+ * Returns lexeme position in TSVector's entry array or -1 if lexeme wasn't
+ * found.
+ */
+static int
+tsvector_bsearch(const TSVector tsv, char *lexeme, int lexeme_len)
+{
+ WordEntry *arrin = ARRPTR(tsv);
+ int StopLow = 0,
+ StopHigh = tsv->size,
+ StopMiddle,
+ cmp;
+
+ while (StopLow < StopHigh)
+ {
+ StopMiddle = (StopLow + StopHigh)/2;
+
+ cmp = tsCompareString(lexeme, lexeme_len,
+ STRPTR(tsv) + arrin[StopMiddle].pos,
+ arrin[StopMiddle].len,
+ false);
+
+ if (cmp < 0)
+ StopHigh = StopMiddle;
+ else if (cmp > 0)
+ StopLow = StopMiddle + 1;
+ else /* found it */
+ return StopMiddle;
+ }
+
+ return -1;
+}
+
+static int
+compareint(const void *va, const void *vb)
+{
+ int32 a = *((const int32 *) va);
+ int32 b = *((const int32 *) vb);
+
+ if (a == b)
+ return 0;
+ return (a > b) ? 1 : -1;
+}
+
+/*
+ * Internal routine to delete lexemes from TSVector by array of offsets.
+ *
+ * int *indices_to_delete -- array of lexeme offsets to delete
+ * int indices_count -- size of that array
+ *
+ * Returns new TSVector without given lexemes along with their positions
+ * and weights.
+ */
+static TSVector
+_tsvector_delete(TSVector tsv, int *indices_to_delete, int indices_count)
+{
+ TSVector tsout;
+ WordEntry *arrin = ARRPTR(tsv),
+ *arrout;
+ char *data = STRPTR(tsv),
+ *dataout;
+ int i, j, k,
+ curoff;
+
+ /*
+ * Here we overestimates tsout size, since we don't know exact size
+ * occupied by positions and weights. We will set exact size later
+ * after a pass through TSVector.
+ */
+ tsout = (TSVector) palloc0(VARSIZE(tsv));
+ arrout = ARRPTR(tsout);
+ tsout->size = tsv->size - indices_count;
+
+ /* Sort our filter array to simplify membership check later. */
+ qsort(indices_to_delete, indices_count, sizeof(int), compareint);
+
+ /*
+ * Copy tsv to tsout skipping lexemes that enlisted in indices_to_delete.
+ */
+ curoff = 0;
+ dataout = STRPTR(tsout);
+ for (i = j = k = 0; i < tsv->size; i++)
+ {
+ /*
+ * Here we should check whether current i is present in indices_to_delete
+ * or not. Since indices_to_delete is already sorted we can advance
+ * it index only when we have match.
+ */
+ if (k < indices_count && i == indices_to_delete[k]){
+ k++;
+ continue;
+ }
+
+ /* Copy lexeme, it's positions and weights */
+ memcpy(dataout + curoff, data + arrin[i].pos, arrin[i].len);
+ arrout[j].haspos = arrin[i].haspos;
+ arrout[j].len = arrin[i].len;
+ arrout[j].pos = curoff;
+ curoff += arrin[i].len;
+ if (arrin[i].haspos)
+ {
+ int len = POSDATALEN(tsv, arrin+i) * sizeof(WordEntryPos) + sizeof(uint16);
+ curoff = SHORTALIGN(curoff);
+ memcpy(dataout + curoff, (STRPTR(tsv) + SHORTALIGN(arrin[i].pos + arrin[i].len)), len);
+ curoff += len;
+ }
+
+ j++;
+ }
+
+ /*
+ * After the pass through TSVector k should equals exactly to indices_count.
+ * If it isn't then the caller provided us with indices outside of [0, tsv->size)
+ * range and estimation of tsout's size is wrong.
+ */
+ Assert(k == indices_count);
+
+ SET_VARSIZE(tsout, CALCDATASIZE(tsout->size, curoff));
+ return tsout;
+}
+
+/*
+ * Delete given lexeme from tsvector.
+ * Implementation of user-level delete(tsvector, text).
+ */
+Datum
+tsvector_delete_str(PG_FUNCTION_ARGS)
+{
+ TSVector tsin = PG_GETARG_TSVECTOR(0),
+ tsout;
+ char *lexin = VARDATA(PG_GETARG_TEXT_P(1));
+ int lexin_len = VARSIZE_ANY_EXHDR(PG_GETARG_TEXT_P(1)),
+ skip_index;
+
+ if ((skip_index = tsvector_bsearch(tsin, lexin, lexin_len)) == -1)
+ PG_RETURN_POINTER(tsin);
+
+ tsout = _tsvector_delete(tsin, &skip_index, 1);
+
+ PG_FREE_IF_COPY(tsin, 0);
+ PG_RETURN_POINTER(tsout);
+}
+
+/*
+ * Delete given array of lexemes from tsvector.
+ * Implementation of user-level delete(tsvector, text[]).
+ */
+Datum
+tsvector_delete_arr(PG_FUNCTION_ARGS)
+{
+ TSVector tsin = PG_GETARG_TSVECTOR(0),
+ tsout;
+ ArrayType *lexarr = PG_GETARG_ARRAYTYPE_P(1);
+ int i, nlex,
+ skip_count,
+ *skip_indices;
+ Datum *dlexemes;
+ bool *nulls;
+
+ deconstruct_array(lexarr, TEXTOID, -1, false, 'i',
+ &dlexemes, &nulls, &nlex);
+
+ /*
+ * In typical use case array of lexemes to delete is relatively small.
+ * So here we optimizing things for that scenario: iterate through lexarr
+ * performing binary search of each lexeme from lexarr in tsvector.
+ */
+ skip_indices = palloc0(nlex*sizeof(int));
+ for (i = skip_count = 0; i < nlex; i++)
+ {
+ char *lex = VARDATA(dlexemes[i]);
+ int lex_len = VARSIZE_ANY_EXHDR(dlexemes[i]);
+ int lex_pos = tsvector_bsearch(tsin, lex, lex_len);
+
+ if (lex_pos >= 0)
+ skip_indices[skip_count++] = lex_pos;
+ }
+
+ tsout = _tsvector_delete(tsin, skip_indices, skip_count);
+
+ pfree(skip_indices);
+ PG_FREE_IF_COPY(tsin, 0);
+ PG_RETURN_POINTER(tsout);
+}
+
+/*
+ * Delete lexemes and/or positions of second tsvector from first tsvector.
+ * Implementation of user-level delete(tsvector, tsvector).
+ *
+ * When lexeme in second tsvector (ts2) has no positions function will
+ * delete any occurence of same lexeme in first tsvector (ts1).
+ * When ts2 lexeme have positions function will delete them from positions
+ * of matching lexeme in ts1. If after such removal resulting positions set is
+ * empty then function will delete that lexeme from resulting tsvector.
+ */
+Datum
+tsvector_delete_tsvector(PG_FUNCTION_ARGS)
+{
+ TSVector ts1 = PG_GETARG_TSVECTOR(0),
+ ts2 = PG_GETARG_TSVECTOR(1),
+ tsout;
+ WordEntry *arr1 = ARRPTR(ts1),
+ *arr2 = ARRPTR(ts2),
+ *arrout;
+ char *data1 = STRPTR(ts1),
+ *data2 = STRPTR(ts2),
+ *dataout;
+ int i_out, i1, i2,
+ curoff = 0,
+ match_count = 0;
+
+ /*
+ * As in the tsvector_delete_arr() we optimize things for case with ts2
+ * significantly smaller than ts1, so at first we find all occurences of
+ * ts2 lexemes in ts1.
+ */
+ int *matched_indices = palloc0(ts2->size*sizeof(int));
+ int *matched_indices_ts2 = palloc0(ts2->size*sizeof(int));
+ for (i2 = 0; i2 < ts2->size; i2++)
+ {
+ char *lex = data2 + arr2[i2].pos;
+ int lex_len = arr2[i2].len;
+ int lex_pos = tsvector_bsearch(ts1, lex, lex_len);
+ int m;
+
+ /*
+ * Since ts2 can contain lexemes, that are not present in ts1
+ * we need to store both indices.
+ * Also both those arrays should be sorted because of lexemes sorting
+ * in tsvector.
+ */
+ if (lex_pos >= 0){
+ m = match_count++;
+ matched_indices[m] = lex_pos;
+ matched_indices_ts2[m] = i2;
+ }
+ }
+
+ /*
+ * In contrast to tsvector_delete_arr() and tsvector_delete_str()
+ * here we should have logic about deleting ts2 positions info from
+ * ts1 position. So here the same logic as in _tsvector_delete(),
+ * but also with positions handling inside main loop.
+ */
+ tsout = (TSVector) palloc0(VARSIZE(ts1));
+ tsout->size = ts1->size;
+ arrout = ARRPTR(tsout);
+ dataout = STRPTR(tsout);
+
+ /*
+ * Here we use indices starting with i for iterating through tsvectors and
+ * indices starting from j for iterating through positions.
+ */
+ for (i_out = i1 = i2 = 0; i1 < ts1->size; i1++)
+ {
+ if (i2 < match_count && i1 == matched_indices[i2])
+ {
+ /*
+ * Lexeme matched. If filtering vector has some positions on that
+ * lexeme than we should delete them from filtered vector.
+ */
+ if (arr2[matched_indices_ts2[i2]].haspos && arr1[i1].haspos)
+ {
+ WordEntryPosVector *posv1 = _POSVECPTR(ts1, arr1 + i1);
+ WordEntryPosVector *posv2 = _POSVECPTR(ts2, arr2 + matched_indices_ts2[i2]);
+ int *pos_out = palloc0(posv1->npos*sizeof(int));
+ int j1, j2, j_out;
+
+ /*
+ * Substract (as a set) ts2 lexeme positions from ts1 lexeme positions.
+ * Here we used the fact that both positions arrays already sorted.
+ */
+ for(j1 = j2 = j_out = 0; j1 < posv1->npos; j1++)
+ {
+ while (j2 < posv2->npos &&
+ WEP_GETPOS(posv1->pos[j1]) > WEP_GETPOS(posv2->pos[j2]))
+ j2++;
+
+ if (WEP_GETPOS(posv1->pos[j1]) < WEP_GETPOS(posv2->pos[j2]))
+ pos_out[j_out++] = posv1->pos[j1];
+ }
+
+ /*
+ * If result is not empty than we can copy lexeme and positions.
+ */
+ if (j_out > 0)
+ {
+ WordEntryPosVector *posv_out;
+ int k;
+
+ memcpy(dataout + curoff, data1 + arr1[i1].pos, arr1[i1].len);
+ arrout[i_out].haspos = arr1[i1].haspos;
+ arrout[i_out].len = arr1[i1].len;
+ arrout[i_out].pos = curoff;
+
+ curoff += arr1[i1].len;
+
+ posv_out = _POSVECPTR(tsout, arrout + i_out);
+ posv_out->npos = j_out;
+ for (k = 0; k < posv_out->npos; k++)
+ posv_out->pos[k] = pos_out[k];
+ curoff = SHORTALIGN(curoff);
+ curoff += j_out * sizeof(WordEntryPos) + sizeof(uint16);
+
+ i_out++;
+ }
+
+ pfree(pos_out);
+ }
+
+ i2++;
+ }
+ else
+ {
+ /*
+ * Not a match, we can copy lexeme and positions unchanged.
+ */
+ memcpy(dataout + curoff, data1 + arr1[i1].pos, arr1[i1].len);
+ arrout[i_out].haspos = arr1[i1].haspos;
+ arrout[i_out].len = arr1[i1].len;
+ arrout[i_out].pos = curoff;
+ curoff += arr1[i1].len;
+ if (arr1[i1].haspos)
+ {
+ int len = POSDATALEN(ts1, arr1 + i1) * sizeof(WordEntryPos) + sizeof(uint16);
+ curoff = SHORTALIGN(curoff);
+ memcpy(dataout + curoff,
+ STRPTR(ts1) + SHORTALIGN(arr1[i1].pos + arr1[i1].len), len);
+ curoff += len;
+ }
+
+ i_out++;
+ }
+ }
+
+ tsout->size = i_out;
+ if (dataout != STRPTR(tsout))
+ memmove(STRPTR(tsout), dataout, curoff);
+
+ SET_VARSIZE(tsout, CALCDATASIZE(tsout->size, curoff));
+
+ pfree(matched_indices);
+ pfree(matched_indices_ts2);
+ PG_FREE_IF_COPY(ts1, 0);
+ PG_FREE_IF_COPY(ts2, 0);
+ PG_RETURN_POINTER(tsout);
+}
+
+/*
+ * Expand tsvector as table with following columns:
+ * lexeme: lexeme text
+ * positions: integer array of lexeme positions
+ * weights: char array of weights corresponding to positions
+ */
+Datum
+tsvector_unnest(PG_FUNCTION_ARGS)
+{
+ FuncCallContext *funcctx;
+ TSVector tsin = PG_GETARG_TSVECTOR(0);
+ WordEntry *arrin = ARRPTR(tsin);
+ char *data = STRPTR(tsin);
+
+ if (SRF_IS_FIRSTCALL())
+ {
+ MemoryContext oldcontext;
+ TupleDesc tupdesc;
+
+ funcctx = SRF_FIRSTCALL_INIT();
+ oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
+
+ tupdesc = CreateTemplateTupleDesc(3, false);
+ TupleDescInitEntry(tupdesc, (AttrNumber) 1, "lexeme",
+ TEXTOID, -1, 0);
+ TupleDescInitEntry(tupdesc, (AttrNumber) 2, "positions",
+ INT2ARRAYOID, -1, 0);
+ TupleDescInitEntry(tupdesc, (AttrNumber) 3, "weights",
+ TEXTARRAYOID, -1, 0);
+ funcctx->tuple_desc = BlessTupleDesc(tupdesc);
+
+ MemoryContextSwitchTo(oldcontext);
+ }
+
+ funcctx = SRF_PERCALL_SETUP();
+
+ if (funcctx->call_cntr < tsin->size)
+ {
+ HeapTuple tuple;
+ int j,
+ i = funcctx->call_cntr;
+ bool nulls[] = {false, false, false};
+ Datum values[3];
+
+ values[0] = PointerGetDatum(cstring_to_text_with_len(data + arrin[i].pos, arrin[i].len));
+
+ if (arrin[i].haspos)
+ {
+ WordEntryPosVector *posv;
+ Datum *positions;
+ Datum *weights;
+ char weight;
+
+ /*
+ * Internally tsvector stores position and weight in the same
+ * uint16 (2 bits for weight, 14 for position). Here we extract that
+ * in two separate arrays.
+ */
+ posv = _POSVECPTR(tsin, arrin + i);
+ positions = palloc(posv->npos * sizeof(Datum));
+ weights = palloc(posv->npos * sizeof(Datum));
+ for (j = 0; j < posv->npos; j++)
+ {
+ positions[j] = Int16GetDatum(WEP_GETPOS(posv->pos[j]));
+ weight = (WEP_GETWEIGHT(posv->pos[j]) >> 2) ?
+ 'D' : 'D' - WEP_GETWEIGHT(posv->pos[j]);
+ weights[j] = PointerGetDatum(cstring_to_text_with_len(&weight, 1));
+ }
+
+ values[1] = PointerGetDatum(
+ construct_array(positions, posv->npos, INT2OID, 2, true, 's'));
+ values[2] = PointerGetDatum(
+ construct_array(weights, posv->npos, TEXTOID, -1, false, 'i'));
+ }
+ else
+ {
+ values[1] = PointerGetDatum(construct_array(NULL, 0, INT2OID, 2, true, 's'));
+ values[2] = PointerGetDatum(construct_array(NULL, 0, TEXTOID, -1, false, 'i'));
+ }
+
+ tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls);
+ PG_FREE_IF_COPY(tsin, 0);
+ SRF_RETURN_NEXT(funcctx, HeapTupleGetDatum(tuple));
+ }
+ else
+ SRF_RETURN_DONE(funcctx);
+}
+
+/*
+ * Convert tsvector to array of lexemes.
+ */
+Datum
+tsvector_to_array(PG_FUNCTION_ARGS)
+{
+ TSVector tsin = PG_GETARG_TSVECTOR(0);
+ WordEntry *arrin = ARRPTR(tsin);
+ Datum elements[tsin->size];
+ int i;
+ ArrayType *array;
+
+ for (i = 0; i < tsin->size; i++)
+ {
+ elements[i] = PointerGetDatum(
+ cstring_to_text_with_len(STRPTR(tsin) + arrin[i].pos, arrin[i].len));
+ }
+ array = construct_array(elements, tsin->size, TEXTOID, -1, false, 'i');
+ PG_FREE_IF_COPY(tsin, 0);
+ PG_RETURN_POINTER(array);
+}
+
+/*
+ * Build tsvector from array of lexemes.
+ */
+Datum
+array_to_tsvector(PG_FUNCTION_ARGS)
+{
+ ArrayType *v = PG_GETARG_ARRAYTYPE_P(0);
+ TSVector tsout;
+ Datum *dlexemes;
+ WordEntry *arrout;
+ bool *nulls;
+ int nitems,
+ i,
+ tslen,
+ datalen = 0;
+ char *cur;
+
+ deconstruct_array(v, TEXTOID, -1, false, 'i', &dlexemes, &nulls, &nitems);
+
+ for (i = 0; i < nitems; i++)
+ datalen += VARSIZE_ANY_EXHDR(dlexemes[i]);
+
+ tslen = CALCDATASIZE(nitems, datalen);
+ tsout = (TSVector) palloc0(tslen);
+ SET_VARSIZE(tsout, tslen);
+ tsout->size = nitems;
+ arrout = ARRPTR(tsout);
+ cur = STRPTR(tsout);
+
+ for (i = 0; i < nitems; i++)
+ {
+ char *lex = VARDATA(dlexemes[i]);
+ int lex_len = VARSIZE_ANY_EXHDR(dlexemes[i]);
+
+ memcpy(cur, lex, lex_len);
+ arrout[i].haspos = 0;
+ arrout[i].len = lex_len;
+ arrout[i].pos = cur - STRPTR(tsout);
+ cur += lex_len;
+ }
+
+ PG_FREE_IF_COPY(v, 0);
+ PG_RETURN_POINTER(tsout);
+}
+
+/*
+ * Leave only elements with given weights from tsvector.
+ */
+Datum
+tsvector_filter(PG_FUNCTION_ARGS)
+{
+ TSVector tsin = PG_GETARG_TSVECTOR(0),
+ tsout;
+ ArrayType *weights = PG_GETARG_ARRAYTYPE_P(1);
+ WordEntry *arrin = ARRPTR(tsin),
+ *arrout;
+ char *datain = STRPTR(tsin),
+ *dataout;
+ Datum *dweights;
+ bool *nulls;
+ int nweigths;
+ int i, j;
+ char mask = 0,
+ cur_pos = 0;
+
+ deconstruct_array(weights, CHAROID, 1, true, 'c',
+ &dweights, &nulls, &nweigths);
+
+ for (i = 0; i < nweigths; i++)
+ {
+ char cw = DatumGetChar(dweights[i]);
+ switch (cw)
+ {
+ case 'A': case 'a':
+ mask = mask | 8;
+ break;
+ case 'B': case 'b':
+ mask = mask | 4;
+ break;
+ case 'C': case 'c':
+ mask = mask | 2;
+ break;
+ case 'D': case 'd':
+ mask = mask | 1;
+ break;
+ default:
+ /* internal error */
+ elog(ERROR, "unrecognized weight: %c", cw);
+ }
+ }
+
+ tsout = (TSVector) palloc0(VARSIZE(tsin));
+ tsout->size = tsin->size;
+ arrout = ARRPTR(tsout);
+ dataout = STRPTR(tsout);
+
+ for (i = j = 0; i < tsin->size; i++)
+ {
+ WordEntryPosVector *posvin,
+ *posvout;
+ int npos = 0;
+ int k;
+
+ if (!arrin[i].haspos)
+ continue;
+
+ posvin = _POSVECPTR(tsin, arrin + i);
+ posvout = (WordEntryPosVector *)(dataout + SHORTALIGN(cur_pos + arrin[i].len));
+
+ for (k = 0; k < posvin->npos; k++)
+ {
+ if (mask & (1 << WEP_GETWEIGHT(posvin->pos[k])))
+ posvout->pos[npos++] = posvin->pos[k];
+ }
+
+ if (!npos) /* no satisfactory positions found, so skip that lexeme */
+ continue;
+
+ arrout[j].haspos = true;
+ arrout[j].len = arrin[i].len;
+ arrout[j].pos = cur_pos;
+
+ memcpy(dataout + cur_pos, datain + arrin[i].pos, arrin[i].len);
+ posvout->npos = npos;
+ cur_pos += SHORTALIGN(arrin[i].len);
+ cur_pos += POSDATALEN(tsout, arrout+j) * sizeof(WordEntryPos) + sizeof(uint16);
+ j++;
+ }
+
+ tsout->size = j;
+ if (dataout != STRPTR(tsout))
+ memmove(STRPTR(tsout), dataout, cur_pos);
+
+ SET_VARSIZE(tsout, CALCDATASIZE(tsout->size, cur_pos));
+
+ PG_FREE_IF_COPY(tsin, 0);
+ PG_RETURN_POINTER(tsout);
+}
+
+/*
+ * Shift all positions in tsvector by given value.
+ */
+Datum
+tsvector_shift(PG_FUNCTION_ARGS)
+{
+ TSVector tsin = PG_GETARG_TSVECTOR(0),
+ tsout;
+ int16 offset = PG_GETARG_INT16(1);
+ WordEntry *arrout;
+ int i;
+
+ tsout = (TSVector) palloc0(VARSIZE(tsin));
+ memcpy(tsout, tsin, VARSIZE(tsin));
+ arrout = ARRPTR(tsout);
+
+ for (i = 0; i < tsout->size; i++)
+ {
+ int j, newpos;
+ WordEntryPosVector *posvout = _POSVECPTR(tsout, arrout + i);
+
+ for (j = 0; j < posvout->npos; j++)
+ {
+ newpos = LIMITPOS(WEP_GETPOS(posvout->pos[j]) + offset);
+ WEP_SETPOS(posvout->pos[j], newpos > 0 ? newpos : 0);
+ }
+ }
+
+ PG_FREE_IF_COPY(tsin, 0);
+ PG_RETURN_POINTER(tsout);
+}
Datum
tsvector_concat(PG_FUNCTION_ARGS)
diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h
index d8640db..84604dc 100644
--- a/src/include/catalog/pg_proc.h
+++ b/src/include/catalog/pg_proc.h
@@ -4575,8 +4575,26 @@ DESCR("number of lexemes");
DATA(insert OID = 3623 ( strip PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 3614 "3614" _null_ _null_ _null_ _null_ _null_ tsvector_strip _null_ _null_ _null_ ));
DESCR("strip position information");
DATA(insert OID = 3624 ( setweight PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 3614 "3614 18" _null_ _null_ _null_ _null_ _null_ tsvector_setweight _null_ _null_ _null_ ));
-DESCR("set weight of lexeme's entries");
-DATA(insert OID = 3625 ( tsvector_concat PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 3614 "3614 3614" _null_ _null_ _null_ _null_ _null_ tsvector_concat _null_ _null_ _null_ ));
+DESCR("set given weight for whole tsvector");
+DATA(insert OID = 3320 ( setweight PGNSP PGUID 12 1 0 0 0 f f f f t f i s 3 0 3614 "3614 18 1009" _null_ _null_ _null_ _null_ _null_ tsvector_setweight_by_filter _null_ _null_ _null_ ));
+DESCR("set given weight for given lexemes");
+DATA(insert OID = 3625 ( tsvector_concat PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 3614 "3614 3614" _null_ _null_ _null_ _null_ _null_ tsvector_concat _null_ _null_ _null_ ));
+DATA(insert OID = 3321 ( delete PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 3614 "3614 25" _null_ _null_ _null_ _null_ _null_ tsvector_delete_str _null_ _null_ _null_ ));
+DESCR("delete lexeme");
+DATA(insert OID = 3323 ( delete PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 3614 "3614 1009" _null_ _null_ _null_ _null_ _null_ tsvector_delete_arr _null_ _null_ _null_ ));
+DESCR("delete given lexemes");
+DATA(insert OID = 3324 ( delete PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 3614 "3614 3614" _null_ _null_ _null_ _null_ _null_ tsvector_delete_tsvector _null_ _null_ _null_ ));
+DESCR("delete lexemes that given as tsvector");
+DATA(insert OID = 3322 ( unnest PGNSP PGUID 12 1 10 0 0 f f f f t t i s 1 0 2249 "3614" "{3614,25,1005,1009}" "{i,o,o,o}" "{tsvector,lexeme,positions,weights}" _null_ _null_ tsvector_unnest _null_ _null_ _null_ ));
+DESCR("expand tsvector to set of rows");
+DATA(insert OID = 3317 ( to_array PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 1009 "3614" _null_ _null_ _null_ _null_ _null_ tsvector_to_array _null_ _null_ _null_ ));
+DESCR("convert to lexeme's array");
+DATA(insert OID = 3318 ( to_tsvector PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 3614 "1009" _null_ _null_ _null_ _null_ _null_ array_to_tsvector _null_ _null_ _null_ ));
+DESCR("build tsvector from lexeme's array");
+DATA(insert OID = 3319 ( filter PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 3614 "3614 1002" _null_ _null_ _null_ _null_ _null_ tsvector_filter _null_ _null_ _null_ ));
+DESCR("returns tsvector that contain only postings with given weights");
+DATA(insert OID = 3325 ( shift PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 3614 "3614 23" _null_ _null_ _null_ _null_ _null_ tsvector_shift _null_ _null_ _null_ ));
+DESCR("shift all positions in tsvector by given value");
DATA(insert OID = 3634 ( ts_match_vq PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 16 "3614 3615" _null_ _null_ _null_ _null_ _null_ ts_match_vq _null_ _null_ _null_ ));
DATA(insert OID = 3635 ( ts_match_qv PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 16 "3615 3614" _null_ _null_ _null_ _null_ _null_ ts_match_qv _null_ _null_ _null_ ));
diff --git a/src/include/tsearch/ts_type.h b/src/include/tsearch/ts_type.h
index 281cdd6..f72a718 100644
--- a/src/include/tsearch/ts_type.h
+++ b/src/include/tsearch/ts_type.h
@@ -141,7 +141,16 @@ extern Datum tsvector_cmp(PG_FUNCTION_ARGS);
extern Datum tsvector_length(PG_FUNCTION_ARGS);
extern Datum tsvector_strip(PG_FUNCTION_ARGS);
extern Datum tsvector_setweight(PG_FUNCTION_ARGS);
+extern Datum tsvector_setweight_by_filter(PG_FUNCTION_ARGS);
extern Datum tsvector_concat(PG_FUNCTION_ARGS);
+extern Datum tsvector_delete_str(PG_FUNCTION_ARGS);
+extern Datum tsvector_delete_arr(PG_FUNCTION_ARGS);
+extern Datum tsvector_delete_tsvector(PG_FUNCTION_ARGS);
+extern Datum tsvector_unnest(PG_FUNCTION_ARGS);
+extern Datum tsvector_to_array(PG_FUNCTION_ARGS);
+extern Datum array_to_tsvector(PG_FUNCTION_ARGS);
+extern Datum tsvector_filter(PG_FUNCTION_ARGS);
+extern Datum tsvector_shift(PG_FUNCTION_ARGS);
extern Datum tsvector_update_trigger_byid(PG_FUNCTION_ARGS);
extern Datum tsvector_update_trigger_bycolumn(PG_FUNCTION_ARGS);
diff --git a/src/test/regress/expected/tstypes.out b/src/test/regress/expected/tstypes.out
index 6284fb6..9c55664 100644
--- a/src/test/regress/expected/tstypes.out
+++ b/src/test/regress/expected/tstypes.out
@@ -83,18 +83,6 @@ SELECT 'a:3A b:2a'::tsvector || 'ba:1234 a:1B';
'a':3A,4B 'b':2A 'ba':1237
(1 row)
-SELECT setweight('w:12B w:13* w:12,5,6 a:1,3* a:3 w asd:1dc asd zxc:81,567,222A'::tsvector, 'c');
- setweight
-----------------------------------------------------------
- 'a':1C,3C 'asd':1C 'w':5C,6C,12C,13C 'zxc':81C,222C,567C
-(1 row)
-
-SELECT strip('w:12B w:13* w:12,5,6 a:1,3* a:3 w asd:1dc asd'::tsvector);
- strip
----------------
- 'a' 'asd' 'w'
-(1 row)
-
--Base tsquery test
SELECT '1'::tsquery;
tsquery
@@ -625,3 +613,276 @@ SELECT ts_rank_cd(' a:1 s:2 d g'::tsvector, 'a & s');
0.1
(1 row)
+-- tsvector editing operations
+SELECT strip('w:12B w:13* w:12,5,6 a:1,3* a:3 w asd:1dc asd'::tsvector);
+ strip
+---------------
+ 'a' 'asd' 'w'
+(1 row)
+
+SELECT strip('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);
+ strip
+----------------------------------------------
+ 'base' 'hidden' 'rebel' 'spaceship' 'strike'
+(1 row)
+
+SELECT strip('base hidden rebel spaceship strike'::tsvector);
+ strip
+----------------------------------------------
+ 'base' 'hidden' 'rebel' 'spaceship' 'strike'
+(1 row)
+
+SELECT delete(to_tsvector('english', 'Rebel spaceships, striking from a hidden base'), 'spaceship');
+ delete
+------------------------------------------
+ 'base':7 'hidden':6 'rebel':1 'strike':3
+(1 row)
+
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, 'base');
+ delete
+--------------------------------------------------------------
+ 'hidden':6 'rebel':1 'spaceship':2,33A,34B,35C,36 'strike':3
+(1 row)
+
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, 'bas');
+ delete
+-----------------------------------------------------------------------
+ 'base':7 'hidden':6 'rebel':1 'spaceship':2,33A,34B,35C,36 'strike':3
+(1 row)
+
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, 'bases');
+ delete
+-----------------------------------------------------------------------
+ 'base':7 'hidden':6 'rebel':1 'spaceship':2,33A,34B,35C,36 'strike':3
+(1 row)
+
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, 'spaceship');
+ delete
+------------------------------------------
+ 'base':7 'hidden':6 'rebel':1 'strike':3
+(1 row)
+
+SELECT delete('base hidden rebel spaceship strike'::tsvector, 'spaceship');
+ delete
+----------------------------------
+ 'base' 'hidden' 'rebel' 'strike'
+(1 row)
+
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, ARRAY['spaceship','rebel']);
+ delete
+--------------------------------
+ 'base':7 'hidden':6 'strike':3
+(1 row)
+
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, ARRAY['spaceships','rebel']);
+ delete
+-------------------------------------------------------------
+ 'base':7 'hidden':6 'spaceship':2,33A,34B,35C,36 'strike':3
+(1 row)
+
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, ARRAY['spaceshi','rebel']);
+ delete
+-------------------------------------------------------------
+ 'base':7 'hidden':6 'spaceship':2,33A,34B,35C,36 'strike':3
+(1 row)
+
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, ARRAY['spaceship','leya','rebel']);
+ delete
+--------------------------------
+ 'base':7 'hidden':6 'strike':3
+(1 row)
+
+SELECT delete('base hidden rebel spaceship strike'::tsvector, ARRAY['spaceship','leya','rebel']);
+ delete
+--------------------------
+ 'base' 'hidden' 'strike'
+(1 row)
+
+SELECT delete('base:7,8,9 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, 'base'::tsvector);
+ delete
+--------------------------------------------------------------
+ 'hidden':6 'rebel':1 'spaceship':2,33A,34B,35C,36 'strike':3
+(1 row)
+
+SELECT delete('base:7,8,9 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, 'base:7,9'::tsvector);
+ delete
+-----------------------------------------------------------------------
+ 'base':8 'hidden':6 'rebel':1 'spaceship':2,33A,34B,35C,36 'strike':3
+(1 row)
+
+SELECT delete('base:7,8,9 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, 'base:7,8,9'::tsvector);
+ delete
+--------------------------------------------------------------
+ 'hidden':6 'rebel':1 'spaceship':2,33A,34B,35C,36 'strike':3
+(1 row)
+
+SELECT delete('base:7,8,9 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, 'base:1,7,8,9,10'::tsvector);
+ delete
+--------------------------------------------------------------
+ 'hidden':6 'rebel':1 'spaceship':2,33A,34B,35C,36 'strike':3
+(1 row)
+
+SELECT delete('base:7,8,9 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, 'spaceship'::tsvector);
+ delete
+----------------------------------------------
+ 'base':7,8,9 'hidden':6 'rebel':1 'strike':3
+(1 row)
+
+SELECT delete('base:7,8,9 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, 'spaceship:2,33'::tsvector);
+ delete
+---------------------------------------------------------------------
+ 'base':7,8,9 'hidden':6 'rebel':1 'spaceship':34B,35C,36 'strike':3
+(1 row)
+
+SELECT delete('base:7,8,9 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, 'spaceship:2,33,34,35,36'::tsvector);
+ delete
+----------------------------------------------
+ 'base':7,8,9 'hidden':6 'rebel':1 'strike':3
+(1 row)
+
+SELECT delete('base:7,8,9 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, 'spaceship:2B,33C,34B,35C,36B'::tsvector);
+ delete
+----------------------------------------------
+ 'base':7,8,9 'hidden':6 'rebel':1 'strike':3
+(1 row)
+
+SELECT delete('base hidden rebel spaceship strike'::tsvector, 'spaceship:2 rebel'::tsvector);
+ delete
+--------------------------
+ 'base' 'hidden' 'strike'
+(1 row)
+
+SELECT delete('cat:3 fat:2,4 rat:5A'::tsvector, 'aaa fat:2,6'::tsvector);
+ delete
+--------------------------
+ 'cat':3 'fat':4 'rat':5A
+(1 row)
+
+SELECT unnest('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);
+ unnest
+---------------------------------------------
+ (base,{7},{D})
+ (hidden,{6},{D})
+ (rebel,{1},{D})
+ (spaceship,"{2,33,34,35,36}","{D,A,B,C,D}")
+ (strike,{3},{D})
+(5 rows)
+
+SELECT unnest('base hidden rebel spaceship strike'::tsvector);
+ unnest
+-------------------
+ (base,{},{})
+ (hidden,{},{})
+ (rebel,{},{})
+ (spaceship,{},{})
+ (strike,{},{})
+(5 rows)
+
+SELECT * FROM unnest('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);
+ lexeme | positions | weights
+-----------+-----------------+-------------
+ base | {7} | {D}
+ hidden | {6} | {D}
+ rebel | {1} | {D}
+ spaceship | {2,33,34,35,36} | {D,A,B,C,D}
+ strike | {3} | {D}
+(5 rows)
+
+SELECT * FROM unnest('base hidden rebel spaceship strike'::tsvector);
+ lexeme | positions | weights
+-----------+-----------+---------
+ base | {} | {}
+ hidden | {} | {}
+ rebel | {} | {}
+ spaceship | {} | {}
+ strike | {} | {}
+(5 rows)
+
+SELECT lexeme, positions[1] from unnest('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);
+ lexeme | positions
+-----------+-----------
+ base | 7
+ hidden | 6
+ rebel | 1
+ spaceship | 2
+ strike | 3
+(5 rows)
+
+SELECT to_array('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);
+ to_array
+--------------------------------------
+ {base,hidden,rebel,spaceship,strike}
+(1 row)
+
+SELECT to_array('base hidden rebel spaceship strike'::tsvector);
+ to_array
+--------------------------------------
+ {base,hidden,rebel,spaceship,strike}
+(1 row)
+
+SELECT to_tsvector(ARRAY['base','hidden','rebel','spaceship','strike']);
+ to_tsvector
+----------------------------------------------
+ 'base' 'hidden' 'rebel' 'spaceship' 'strike'
+(1 row)
+
+SELECT setweight('w:12B w:13* w:12,5,6 a:1,3* a:3 w asd:1dc asd zxc:81,567,222A'::tsvector, 'c');
+ setweight
+----------------------------------------------------------
+ 'a':1C,3C 'asd':1C 'w':5C,6C,12C,13C 'zxc':81C,222C,567C
+(1 row)
+
+SELECT setweight('a:1,3A asd:1C w:5,6,12B,13A zxc:81,222A,567'::tsvector, 'c');
+ setweight
+----------------------------------------------------------
+ 'a':1C,3C 'asd':1C 'w':5C,6C,12C,13C 'zxc':81C,222C,567C
+(1 row)
+
+SELECT setweight('a:1,3A asd:1C w:5,6,12B,13A zxc:81,222A,567'::tsvector, 'c', '{a}');
+ setweight
+------------------------------------------------------
+ 'a':1C,3C 'asd':1C 'w':5,6,12B,13A 'zxc':81,222A,567
+(1 row)
+
+SELECT setweight('a:1,3A asd:1C w:5,6,12B,13A zxc:81,222A,567'::tsvector, 'c', '{a}');
+ setweight
+------------------------------------------------------
+ 'a':1C,3C 'asd':1C 'w':5,6,12B,13A 'zxc':81,222A,567
+(1 row)
+
+SELECT setweight('a:1,3A asd:1C w:5,6,12B,13A zxc:81,222A,567'::tsvector, 'c', '{a,zxc}');
+ setweight
+--------------------------------------------------------
+ 'a':1C,3C 'asd':1C 'w':5,6,12B,13A 'zxc':81C,222C,567C
+(1 row)
+
+SELECT setweight('a asd w:5,6,12B,13A zxc'::tsvector, 'c', '{a,zxc}');
+ setweight
+---------------------------------
+ 'a' 'asd' 'w':5,6,12B,13A 'zxc'
+(1 row)
+
+SELECT filter('base:7A empir:17 evil:15 first:11 galact:16 hidden:6A rebel:1A spaceship:2A strike:3A victori:12 won:9'::tsvector, '{a}');
+ filter
+-------------------------------------------------------------
+ 'base':7A 'hidden':6A 'rebel':1A 'spaceship':2A 'strike':3A
+(1 row)
+
+SELECT filter('base hidden rebel spaceship strike'::tsvector, '{a}');
+ filter
+--------
+
+(1 row)
+
+SELECT shift('fat:2,4 cat:3 rat:5A'::tsvector, 10);
+ shift
+--------------------------------
+ 'cat':13 'fat':12,14 'rat':15A
+(1 row)
+
+SELECT shift('fat:2,4 cat:3 rat:5A'::tsvector, -3);
+ shift
+----------------------------
+ 'cat':0 'fat':0,1 'rat':2A
+(1 row)
+
diff --git a/src/test/regress/sql/tstypes.sql b/src/test/regress/sql/tstypes.sql
index fd7c702..52a1923 100644
--- a/src/test/regress/sql/tstypes.sql
+++ b/src/test/regress/sql/tstypes.sql
@@ -14,8 +14,6 @@ SELECT $$'\\as' ab\c ab\\c AB\\\c ab\\\\c$$::tsvector;
SELECT tsvectorin(tsvectorout($$'\\as' ab\c ab\\c AB\\\c ab\\\\c$$::tsvector));
SELECT '''w'':4A,3B,2C,1D,5 a:8';
SELECT 'a:3A b:2a'::tsvector || 'ba:1234 a:1B';
-SELECT setweight('w:12B w:13* w:12,5,6 a:1,3* a:3 w asd:1dc asd zxc:81,567,222A'::tsvector, 'c');
-SELECT strip('w:12B w:13* w:12,5,6 a:1,3* a:3 w asd:1dc asd'::tsvector);
--Base tsquery test
SELECT '1'::tsquery;
@@ -115,3 +113,58 @@ SELECT ts_rank_cd(' a:1 s:2 d g'::tsvector, 'a | s');
SELECT ts_rank_cd(' a:1 s:2C d g'::tsvector, 'a & s');
SELECT ts_rank_cd(' a:1 s:2B d g'::tsvector, 'a & s');
SELECT ts_rank_cd(' a:1 s:2 d g'::tsvector, 'a & s');
+
+-- tsvector editing operations
+
+SELECT strip('w:12B w:13* w:12,5,6 a:1,3* a:3 w asd:1dc asd'::tsvector);
+SELECT strip('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);
+SELECT strip('base hidden rebel spaceship strike'::tsvector);
+
+SELECT delete(to_tsvector('english', 'Rebel spaceships, striking from a hidden base'), 'spaceship');
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, 'base');
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, 'bas');
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, 'bases');
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, 'spaceship');
+SELECT delete('base hidden rebel spaceship strike'::tsvector, 'spaceship');
+
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, ARRAY['spaceship','rebel']);
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, ARRAY['spaceships','rebel']);
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, ARRAY['spaceshi','rebel']);
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, ARRAY['spaceship','leya','rebel']);
+SELECT delete('base hidden rebel spaceship strike'::tsvector, ARRAY['spaceship','leya','rebel']);
+
+SELECT delete('base:7,8,9 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, 'base'::tsvector);
+SELECT delete('base:7,8,9 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, 'base:7,9'::tsvector);
+SELECT delete('base:7,8,9 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, 'base:7,8,9'::tsvector);
+SELECT delete('base:7,8,9 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, 'base:1,7,8,9,10'::tsvector);
+SELECT delete('base:7,8,9 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, 'spaceship'::tsvector);
+SELECT delete('base:7,8,9 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, 'spaceship:2,33'::tsvector);
+SELECT delete('base:7,8,9 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, 'spaceship:2,33,34,35,36'::tsvector);
+SELECT delete('base:7,8,9 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, 'spaceship:2B,33C,34B,35C,36B'::tsvector);
+SELECT delete('base hidden rebel spaceship strike'::tsvector, 'spaceship:2 rebel'::tsvector);
+SELECT delete('cat:3 fat:2,4 rat:5A'::tsvector, 'aaa fat:2,6'::tsvector);
+
+SELECT unnest('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);
+SELECT unnest('base hidden rebel spaceship strike'::tsvector);
+SELECT * FROM unnest('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);
+SELECT * FROM unnest('base hidden rebel spaceship strike'::tsvector);
+SELECT lexeme, positions[1] from unnest('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);
+
+SELECT to_array('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);
+SELECT to_array('base hidden rebel spaceship strike'::tsvector);
+
+SELECT to_tsvector(ARRAY['base','hidden','rebel','spaceship','strike']);
+
+SELECT setweight('w:12B w:13* w:12,5,6 a:1,3* a:3 w asd:1dc asd zxc:81,567,222A'::tsvector, 'c');
+SELECT setweight('a:1,3A asd:1C w:5,6,12B,13A zxc:81,222A,567'::tsvector, 'c');
+SELECT setweight('a:1,3A asd:1C w:5,6,12B,13A zxc:81,222A,567'::tsvector, 'c', '{a}');
+SELECT setweight('a:1,3A asd:1C w:5,6,12B,13A zxc:81,222A,567'::tsvector, 'c', '{a}');
+SELECT setweight('a:1,3A asd:1C w:5,6,12B,13A zxc:81,222A,567'::tsvector, 'c', '{a,zxc}');
+SELECT setweight('a asd w:5,6,12B,13A zxc'::tsvector, 'c', '{a,zxc}');
+
+SELECT filter('base:7A empir:17 evil:15 first:11 galact:16 hidden:6A rebel:1A spaceship:2A strike:3A victori:12 won:9'::tsvector, '{a}');
+SELECT filter('base hidden rebel spaceship strike'::tsvector, '{a}');
+
+SELECT shift('fat:2,4 cat:3 rat:5A'::tsvector, 10);
+SELECT shift('fat:2,4 cat:3 rat:5A'::tsvector, -3);
+
So, Tomas, Teodor, did you like this new version of the patch?
Stas Kelvich wrote:
7) Some of the functions use intexterm that does not match the function
name. I see two such cases - to_tsvector and setweight. Is there a
reason for that?Because sgml compiler wants unique indexterm. Both functions that you
mentioned use overloading of arguments and have non-unique name.
This sounds wrong. I think what you should really do is use
<indexterm>
<primary>foo</primary>
<secondary>bar</secondary>
</indexterm>
to distinguish the two entries.
It's a bit funny that you reintroduce the "unrecognized weight: %d"
(instead of %c) in tsvector_setweight_by_filter.
--
�lvaro Herrera http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
On 12/30/2015 06:49 PM, Stas Kelvich wrote:
Hi, Tom�! Thanks for comprehensive review.
On 15 Dec 2015, at 06:07, Tomas Vondra <tomas.vondra@2ndquadrant.com> wrote:
1) It's a bit difficult to judge the usefulness of the API, as I've
always been a mere user of full-text search, and I never had a need
(or courage) to mess with the tsvectors. OTOH I don't see a good
reason no to have such API, when there's a need for it.The API seems to be reasonably complete, with one exception - when
looking at editing function we have for 'hstore', we do have these
variants for delete()delete(hstore,text)
delete(hstore,text[])
delete(hstore,hstore)while this patch only adds delete(tsvector,text). Would it make
sense to add variants similar to hstore? It probably does not make
much sense to add delete(tsvector,tsvector), right? But being able
to delete a bunch of lexemes in one go seems like a good thing.What do you think?
That�s a good idea and actually deleting tsvector from tsvector makes perfect sense. In delete function I used exact string match between string and lexemes in tsvector, but if somebody wants to delete for example �Cats� from tsvector, then he should downcase and singularize this word. Easiest way to do it is to just use to_tsvector() function. Also we can use this function to delete specific positions: like delete('cat:3 fat:2,4'::tsvector, 'fat:2'::tsvector) -> 'cat:3 fat:4'::tsvector.
So in attached patch I�ve implemented following:
delete(tsin tsvector, lexarrtext[]) � remove any occurence of lexemes inlexarr from tsin
OK, although I do recommend using more sensible variable names, i.e. why
how to use 'lexemes' instead of 'lexarr' for example? Similarly for the
other functions.
delete(tsin tsvector, tsv_filter tsvector) � Delete lexemes and/or positions of tsv_filter from tsin. When lexeme in tsv_filter has no positions function will delete any occurrence of same lexeme in tsin. When tsv_filter lexeme have positions function will delete them from positions of matching lexeme in tsin. If after such removal resulting positions set is empty then function will delete that lexeme from resulting tsvector.
I can't really imagine situation in which I'd need this, but if you do
have a use case for it ... although in the initial paragraph you say
"... but if somebody wants to delete for example ..." which suggests you
may not have such use case.
Based on bad experience with extending API based on vague ideas, I
recommend only really adding functions with existing need. It's easy to
add a function later, much more difficult to remove it or change the
signature.
Also if we want some level of completeness of API and taking into account that concat() function shift positions on second argument I thought that it can be useful to also add function that can shift all positions of specific value. This helps to undo concatenation: delete one of concatenating tsvectors and then shift positions in resulting tsvector. So I also wrote one another small function:
shift(tsin tsvector,offset int16) � Shift all positions in tsin by given offset
That seems rather too low-level. Shouldn't it be really built into
delete() directly somehow?
4) I find it rather annoying that there are pretty much no comments in
the code. Granted, there are pretty much no comments in the
surrounding code, but I doubt that's a good reason for not having
any comments in new code. It makes reviews unnecessarily difficult.Fixed, I think.
Yep, much better now.
5) tsvector_concat() is not mentioned in docs at all
Concat mentioned in docs as an operator ||.
Ah, OK.
6) Docs don't mention names of the new parameters in function
signatures, just data types. The functions with a single parameter
probably don't need to do that, but multi-parameter ones should.Fixed.
OK, but please let's use variable names clearly identifying the meaning.
So not 'w' but 'weight' and so on.
7) Some of the functions use intexterm that does not match the function
name. I see two such cases - to_tsvector and setweight. Is there a
reason for that?Because sgml compiler wants unique indexterm. Both functions that
youmentioned use overloading of arguments and have non-unique name.
As Michael pointed out, that should probably be handled by using
<primary> and <secondary> tags.
regards
--
Tomas Vondra http://www.2ndQuadrant.com
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
Hi
On 22 Jan 2016, at 19:03, Tomas Vondra <tomas.vondra@2ndquadrant.com> wrote:
OK, although I do recommend using more sensible variable names, i.e. why how to use 'lexemes' instead of 'lexarr' for example? Similarly for the other functions.
Changed. With old names I tried to follow conventions in surrounding code, but probably that is a good idea to switch to more meaningful names in new code.
delete(tsin tsvector, tsv_filter tsvector) — Delete lexemes and/or positions of tsv_filter from tsin. When lexeme in tsv_filter has no positions function will delete any occurrence of same lexeme in tsin. When tsv_filter lexeme have positions function will delete them from positions of matching lexeme in tsin. If after such removal resulting positions set is empty then function will delete that lexeme from resulting tsvector.
I can't really imagine situation in which I'd need this, but if you do have a use case for it ... although in the initial paragraph you say "... but if somebody wants to delete for example ..." which suggests you may not have such use case.
Based on bad experience with extending API based on vague ideas, I recommend only really adding functions with existing need. It's easy to add a function later, much more difficult to remove it or change the signature.
I tried to create more or less self-contained api, e.g. have ability to negate effect of concatenation. But i’ve also asked people around what they think about extending API and everybody convinced that it is better to stick to smaller API. So let’s drop it. At least that functions exists in mail list in case if somebody will google for such kind of behaviour.
Also if we want some level of completeness of API and taking into account that concat() function shift positions on second argument I thought that it can be useful to also add function that can shift all positions of specific value. This helps to undo concatenation: delete one of concatenating tsvectors and then shift positions in resulting tsvector. So I also wrote one another small function:
shift(tsin tsvector,offset int16) — Shift all positions in tsin by given offset
That seems rather too low-level. Shouldn't it be really built into delete() directly somehow?
I think it is ambiguous task on delete. But if we are dropping support of delete(tsvector, tsvector) I don’t see points in keeping that functions.
7) Some of the functions use intexterm that does not match the function
name. I see two such cases - to_tsvector and setweight. Is there a
reason for that?Because sgml compiler wants unique indexterm. Both functions that
youmentioned use overloading of arguments and have non-unique name.As Michael pointed out, that should probably be handled by using <primary> and <secondary> tags.
Done.
On 19 Jan 2016, at 00:21, Alvaro Herrera <alvherre@2ndquadrant.com> wrote:
It's a bit funny that you reintroduce the "unrecognized weight: %d"
(instead of %c) in tsvector_setweight_by_filter.
Ah, I was thinking about moving it to separate diff and messed. Fixed and attaching diff with same fix for old tsvector_setweight.
Attachments:
tsvector_ops-v2.1.diffapplication/octet-stream; name=tsvector_ops-v2.1.diffDownload
diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index 9c143b2..54601bf 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -9168,16 +9168,29 @@ CREATE TYPE rainbow AS ENUM ('red', 'orange', 'yellow', 'green', 'blue', 'purple
<indexterm>
<primary>setweight</primary>
</indexterm>
- <literal><function>setweight(<type>tsvector</>, <type>"char"</>)</function></literal>
+ <literal><function>setweight(<replaceable class="PARAMETER">tsin</replaceable> <type>tsvector</>, <replaceable class="PARAMETER">weight</replaceable> <type>"char"</>)</function></literal>
</entry>
<entry><type>tsvector</type></entry>
- <entry>assign weight to each element of <type>tsvector</></entry>
+ <entry>assign <replaceable class="PARAMETER">weight</replaceable> to each element of <replaceable class="PARAMETER">tsin</replaceable></entry>
<entry><literal>setweight('fat:2,4 cat:3 rat:5B'::tsvector, 'A')</literal></entry>
<entry><literal>'cat':3A 'fat':2A,4A 'rat':5A</literal></entry>
</row>
<row>
<entry>
<indexterm>
+ <primary>setweight</primary>
+ <secondary>setweight by filter</secondary>
+ </indexterm>
+ <literal><function>setweight(<replaceable class="PARAMETER">tsin</replaceable> <type>tsvector</>, <replaceable class="PARAMETER">weight</replaceable> <type>"char"</>, <replaceable class="PARAMETER">lexemes</replaceable> <type>"text"[]</>)</function></literal>
+ </entry>
+ <entry><type>tsvector</type></entry>
+ <entry>assign <replaceable class="PARAMETER">weight</replaceable> to elements of <replaceable class="PARAMETER">tsin</replaceable> that are listed in <replaceable class="PARAMETER">lexemes</replaceable> array</entry>
+ <entry><literal>setweight('fat:2,4 cat:3 rat:5B'::tsvector, 'A', '{cat,rat}')</literal></entry>
+ <entry><literal>'cat':3A 'fat':2,4 'rat':5A</literal></entry>
+ </row>
+ <row>
+ <entry>
+ <indexterm>
<primary>strip</primary>
</indexterm>
<literal><function>strip(<type>tsvector</>)</function></literal>
@@ -9190,6 +9203,84 @@ CREATE TYPE rainbow AS ENUM ('red', 'orange', 'yellow', 'green', 'blue', 'purple
<row>
<entry>
<indexterm>
+ <primary>delete</primary>
+ <secondary>delete lemexeme</secondary>
+ </indexterm>
+ <literal><function>delete(<replaceable class="PARAMETER">tsin</replaceable> <type>tsvector</>, <replaceable class="PARAMETER">lexeme</replaceable> <type>text</>)</function></literal>
+ </entry>
+ <entry><type>tsvector</type></entry>
+ <entry>remove given <replaceable class="PARAMETER">lexeme</replaceable> from <replaceable class="PARAMETER">tsin</replaceable></entry>
+ <entry><literal>delete('fat:2,4 cat:3 rat:5A'::tsvector, 'fat')</literal></entry>
+ <entry><literal>'cat':3 'rat':5A</literal></entry>
+ </row>
+ <row>
+ <entry>
+ <indexterm>
+ <primary>delete</primary>
+ <secondary>delete lemexemes array</secondary>
+ </indexterm>
+ <literal><function>delete(<replaceable class="PARAMETER">tsin</replaceable> <type>tsvector</>, <replaceable class="PARAMETER">lexemes</replaceable> <type>text[]</>)</function></literal>
+ </entry>
+ <entry><type>tsvector</type></entry>
+ <entry>remove any occurrence of lexemes in <replaceable class="PARAMETER">lexemes</replaceable> array from <replaceable class="PARAMETER">tsin</replaceable></entry>
+ <entry><literal>delete('fat:2,4 cat:3 rat:5A'::tsvector, ARRAY['fat','rat'])</literal></entry>
+ <entry><literal>'cat':3</literal></entry>
+ </row>
+ <row>
+ <entry>
+ <indexterm>
+ <primary>unnest</primary>
+ </indexterm>
+ <literal><function>unnest(<type>tsvector</>)</function></literal>
+ </entry>
+ <entry><type>setof anyelement</type></entry>
+ <entry>expand a tsvector to a set of rows. Each row has following columns: lexeme, postings, weights.</entry>
+ <entry><literal>unnest('fat:2,4 cat:3 rat:5A'::tsvector)</literal></entry>
+ <entry><literallayout class="monospaced">cat {3} {A}
+fat {2,4} {D,D}
+rat {5} {A}
+(3 rows)</literallayout></entry>
+ </row>
+ <row>
+ <entry>
+ <indexterm>
+ <primary>to_array</primary>
+ </indexterm>
+ <literal><function>to_array(<type>tsvector</>)</function></literal>
+ </entry>
+ <entry><type>text[]</type></entry>
+ <entry>convert <type>tsvector</> to array of lexemes</entry>
+ <entry><literal>to_array('fat:2,4 cat:3 rat:5A'::tsvector)</literal></entry>
+ <entry><literal>{cat,fat,rat}</literal></entry>
+ </row>
+ <row>
+ <entry>
+ <indexterm>
+ <primary>to_tsvector</primary>
+ <secondary>array to tsvector</secondary>
+ </indexterm>
+ <literal><function>to_tsvector(<type>text[]</>)</function></literal>
+ </entry>
+ <entry><type>tsvector</type></entry>
+ <entry>convert array of lexemes to <type>tsvector</type></entry>
+ <entry><literal>to_tsvector('{fat,cat,rat}'::text[])</literal></entry>
+ <entry><literal>'fat' 'cat' 'rat'</literal></entry>
+ </row>
+ <row>
+ <entry>
+ <indexterm>
+ <primary>filter</primary>
+ </indexterm>
+ <literal><function>filter(<replaceable class="PARAMETER">tsin</replaceable> <type>tsvector</>, <replaceable class="PARAMETER">weights</replaceable> <type>"char"[]</>)</function></literal>
+ </entry>
+ <entry><type>tsvector</type></entry>
+ <entry>Select only elements with given <replaceable class="PARAMETER">weights</replaceable> from <replaceable class="PARAMETER">tsin</replaceable></entry>
+ <entry><literal>filter('fat:2,4 cat:3b rat:5A'::tsvector, '{a,b}')</literal></entry>
+ <entry><literal>'cat':3B 'rat':5A</literal></entry>
+ </row>
+ <row>
+ <entry>
+ <indexterm>
<primary>to_tsquery</primary>
</indexterm>
<literal><function>to_tsquery(<optional> <replaceable class="PARAMETER">config</> <type>regconfig</> , </optional> <replaceable class="PARAMETER">query</> <type>text</type>)</function></literal>
diff --git a/doc/src/sgml/textsearch.sgml b/doc/src/sgml/textsearch.sgml
index d66b4d5..32033fa 100644
--- a/doc/src/sgml/textsearch.sgml
+++ b/doc/src/sgml/textsearch.sgml
@@ -1326,6 +1326,10 @@ FROM (SELECT id, body, q, ts_rank_cd(ti, q) AS rank
</variablelist>
+ <para>
+ Full list of <type>tsvector</>-related functions available in <xref linkend="textsearch-functions-table">.
+ </para>
+
</sect2>
<sect2 id="textsearch-manipulate-tsquery">
diff --git a/src/backend/utils/adt/tsvector_op.c b/src/backend/utils/adt/tsvector_op.c
index a3f1c361..cb4acb1 100644
--- a/src/backend/utils/adt/tsvector_op.c
+++ b/src/backend/utils/adt/tsvector_op.c
@@ -14,6 +14,7 @@
#include "postgres.h"
+#include "access/htup_details.h"
#include "catalog/namespace.h"
#include "catalog/pg_type.h"
#include "commands/trigger.h"
@@ -65,6 +66,7 @@ typedef struct
#define STATHDRSIZE (offsetof(TSVectorStat, data))
static Datum tsvector_update_trigger(PG_FUNCTION_ARGS, bool config_column);
+static int tsvector_bsearch(TSVector tsin, char *lexin, int lexin_len);
/*
* Order: haspos, len, word, for all positions (pos, weight)
@@ -251,6 +253,80 @@ tsvector_setweight(PG_FUNCTION_ARGS)
PG_RETURN_POINTER(out);
}
+/*
+ * setweight(tsin tsvector, char_weight "char", lexemes "text"[])
+ *
+ * Assign weight w to elements of tsin that are listed in lexarr.
+ */
+Datum
+tsvector_setweight_by_filter(PG_FUNCTION_ARGS)
+{
+ TSVector tsin = PG_GETARG_TSVECTOR(0);
+ char char_weight = PG_GETARG_CHAR(1);
+ ArrayType *lexemes = NULL;
+
+ TSVector tsout;
+ int i,
+ j,
+ nlexemes,
+ weight;
+ WordEntry *entry;
+ Datum *dlexemes;
+ bool *nulls;
+
+ switch (char_weight)
+ {
+ case 'A': case 'a':
+ weight = 3;
+ break;
+ case 'B': case 'b':
+ weight = 2;
+ break;
+ case 'C': case 'c':
+ weight = 1;
+ break;
+ case 'D': case 'd':
+ weight = 0;
+ break;
+ default:
+ /* internal error */
+ elog(ERROR, "unrecognized weight: %c", char_weight);
+ }
+
+ tsout = (TSVector) palloc(VARSIZE(tsin));
+ memcpy(tsout, tsin, VARSIZE(tsin));
+ entry = ARRPTR(tsout);
+
+ lexemes = PG_GETARG_ARRAYTYPE_P(2);
+ deconstruct_array(lexemes, TEXTOID, -1, false, 'i',
+ &dlexemes, &nulls, &nlexemes);
+
+ /*
+ * Assuming that lexemes array is significantly shorter than tsvector
+ * we can iterate through lexemes performing binary search
+ * of each lexeme from lexemes in tsvector.
+ */
+ for (i = 0; i < nlexemes; i++)
+ {
+ char *lex = VARDATA(dlexemes[i]);
+ int lex_len = VARSIZE_ANY_EXHDR(dlexemes[i]);
+ int lex_pos = tsvector_bsearch(tsout, lex, lex_len);
+
+ if (lex_pos >= 0 && (j = POSDATALEN(tsout, entry + lex_pos)) != 0)
+ {
+ WordEntryPos *p = POSDATAPTR(tsout, entry + lex_pos);
+ while (j--)
+ {
+ WEP_SETWEIGHT(*p, weight);
+ p++;
+ }
+ }
+ }
+
+ PG_FREE_IF_COPY(tsin, 0);
+ PG_RETURN_POINTER(tsout);
+}
+
#define compareEntry(pa, a, pb, b) \
tsCompareString((pa) + (a)->pos, (a)->len, \
(pb) + (b)->pos, (b)->len, \
@@ -291,6 +367,440 @@ add_pos(TSVector src, WordEntry *srcptr,
return *clen - startlen;
}
+/*
+ * Perform binary search of given lexeme in TSVector.
+ * Returns lexeme position in TSVector's entry array or -1 if lexeme wasn't
+ * found.
+ */
+static int
+tsvector_bsearch(const TSVector tsv, char *lexeme, int lexeme_len)
+{
+ WordEntry *arrin = ARRPTR(tsv);
+ int StopLow = 0,
+ StopHigh = tsv->size,
+ StopMiddle,
+ cmp;
+
+ while (StopLow < StopHigh)
+ {
+ StopMiddle = (StopLow + StopHigh)/2;
+
+ cmp = tsCompareString(lexeme, lexeme_len,
+ STRPTR(tsv) + arrin[StopMiddle].pos,
+ arrin[StopMiddle].len,
+ false);
+
+ if (cmp < 0)
+ StopHigh = StopMiddle;
+ else if (cmp > 0)
+ StopLow = StopMiddle + 1;
+ else /* found it */
+ return StopMiddle;
+ }
+
+ return -1;
+}
+
+static int
+compareint(const void *va, const void *vb)
+{
+ int32 a = *((const int32 *) va);
+ int32 b = *((const int32 *) vb);
+
+ if (a == b)
+ return 0;
+ return (a > b) ? 1 : -1;
+}
+
+/*
+ * Internal routine to delete lexemes from TSVector by array of offsets.
+ *
+ * int *indices_to_delete -- array of lexeme offsets to delete
+ * int indices_count -- size of that array
+ *
+ * Returns new TSVector without given lexemes along with their positions
+ * and weights.
+ */
+static TSVector
+_tsvector_delete(TSVector tsv, int *indices_to_delete, int indices_count)
+{
+ TSVector tsout;
+ WordEntry *arrin = ARRPTR(tsv),
+ *arrout;
+ char *data = STRPTR(tsv),
+ *dataout;
+ int i, j, k,
+ curoff;
+
+ /*
+ * Here we overestimates tsout size, since we don't know exact size
+ * occupied by positions and weights. We will set exact size later
+ * after a pass through TSVector.
+ */
+ tsout = (TSVector) palloc0(VARSIZE(tsv));
+ arrout = ARRPTR(tsout);
+ tsout->size = tsv->size - indices_count;
+
+ /* Sort our filter array to simplify membership check later. */
+ qsort(indices_to_delete, indices_count, sizeof(int), compareint);
+
+ /*
+ * Copy tsv to tsout skipping lexemes that enlisted in indices_to_delete.
+ */
+ curoff = 0;
+ dataout = STRPTR(tsout);
+ for (i = j = k = 0; i < tsv->size; i++)
+ {
+ /*
+ * Here we should check whether current i is present in indices_to_delete
+ * or not. Since indices_to_delete is already sorted we can advance
+ * it index only when we have match.
+ */
+ if (k < indices_count && i == indices_to_delete[k]){
+ k++;
+ continue;
+ }
+
+ /* Copy lexeme, it's positions and weights */
+ memcpy(dataout + curoff, data + arrin[i].pos, arrin[i].len);
+ arrout[j].haspos = arrin[i].haspos;
+ arrout[j].len = arrin[i].len;
+ arrout[j].pos = curoff;
+ curoff += arrin[i].len;
+ if (arrin[i].haspos)
+ {
+ int len = POSDATALEN(tsv, arrin+i) * sizeof(WordEntryPos) + sizeof(uint16);
+ curoff = SHORTALIGN(curoff);
+ memcpy(dataout + curoff, (STRPTR(tsv) + SHORTALIGN(arrin[i].pos + arrin[i].len)), len);
+ curoff += len;
+ }
+
+ j++;
+ }
+
+ /*
+ * After the pass through TSVector k should equals exactly to indices_count.
+ * If it isn't then the caller provided us with indices outside of
+ * [0, tsv->size) range and estimation of tsout's size is wrong.
+ */
+ Assert(k == indices_count);
+
+ SET_VARSIZE(tsout, CALCDATASIZE(tsout->size, curoff));
+ return tsout;
+}
+
+/*
+ * Delete given lexeme from tsvector.
+ * Implementation of user-level delete(tsvector, text).
+ */
+Datum
+tsvector_delete_str(PG_FUNCTION_ARGS)
+{
+ TSVector tsin = PG_GETARG_TSVECTOR(0),
+ tsout;
+ char *lexeme = VARDATA(PG_GETARG_TEXT_P(1));
+ int lexeme_len = VARSIZE_ANY_EXHDR(PG_GETARG_TEXT_P(1)),
+ skip_index;
+
+ if ((skip_index = tsvector_bsearch(tsin, lexeme, lexeme_len)) == -1)
+ PG_RETURN_POINTER(tsin);
+
+ tsout = _tsvector_delete(tsin, &skip_index, 1);
+
+ PG_FREE_IF_COPY(tsin, 0);
+ PG_RETURN_POINTER(tsout);
+}
+
+/*
+ * Delete given array of lexemes from tsvector.
+ * Implementation of user-level delete(tsvector, text[]).
+ */
+Datum
+tsvector_delete_arr(PG_FUNCTION_ARGS)
+{
+ TSVector tsin = PG_GETARG_TSVECTOR(0),
+ tsout;
+ ArrayType *lexemes = PG_GETARG_ARRAYTYPE_P(1);
+ int i, nlex,
+ skip_count,
+ *skip_indices;
+ Datum *dlexemes;
+ bool *nulls;
+
+ deconstruct_array(lexemes, TEXTOID, -1, false, 'i',
+ &dlexemes, &nulls, &nlex);
+
+ /*
+ * In typical use case array of lexemes to delete is relatively small.
+ * So here we optimizing things for that scenario: iterate through lexarr
+ * performing binary search of each lexeme from lexarr in tsvector.
+ */
+ skip_indices = palloc0(nlex*sizeof(int));
+ for (i = skip_count = 0; i < nlex; i++)
+ {
+ char *lex = VARDATA(dlexemes[i]);
+ int lex_len = VARSIZE_ANY_EXHDR(dlexemes[i]);
+ int lex_pos = tsvector_bsearch(tsin, lex, lex_len);
+
+ if (lex_pos >= 0)
+ skip_indices[skip_count++] = lex_pos;
+ }
+
+ tsout = _tsvector_delete(tsin, skip_indices, skip_count);
+
+ pfree(skip_indices);
+ PG_FREE_IF_COPY(tsin, 0);
+ PG_RETURN_POINTER(tsout);
+}
+
+/*
+ * Expand tsvector as table with following columns:
+ * lexeme: lexeme text
+ * positions: integer array of lexeme positions
+ * weights: char array of weights corresponding to positions
+ */
+Datum
+tsvector_unnest(PG_FUNCTION_ARGS)
+{
+ FuncCallContext *funcctx;
+ TSVector tsin = PG_GETARG_TSVECTOR(0);
+ WordEntry *arrin = ARRPTR(tsin);
+ char *data = STRPTR(tsin);
+
+ if (SRF_IS_FIRSTCALL())
+ {
+ MemoryContext oldcontext;
+ TupleDesc tupdesc;
+
+ funcctx = SRF_FIRSTCALL_INIT();
+ oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
+
+ tupdesc = CreateTemplateTupleDesc(3, false);
+ TupleDescInitEntry(tupdesc, (AttrNumber) 1, "lexeme",
+ TEXTOID, -1, 0);
+ TupleDescInitEntry(tupdesc, (AttrNumber) 2, "positions",
+ INT2ARRAYOID, -1, 0);
+ TupleDescInitEntry(tupdesc, (AttrNumber) 3, "weights",
+ TEXTARRAYOID, -1, 0);
+ funcctx->tuple_desc = BlessTupleDesc(tupdesc);
+
+ MemoryContextSwitchTo(oldcontext);
+ }
+
+ funcctx = SRF_PERCALL_SETUP();
+
+ if (funcctx->call_cntr < tsin->size)
+ {
+ HeapTuple tuple;
+ int j,
+ i = funcctx->call_cntr;
+ bool nulls[] = {false, false, false};
+ Datum values[3];
+
+ values[0] = PointerGetDatum(cstring_to_text_with_len(data + arrin[i].pos, arrin[i].len));
+
+ if (arrin[i].haspos)
+ {
+ WordEntryPosVector *posv;
+ Datum *positions;
+ Datum *weights;
+ char weight;
+
+ /*
+ * Internally tsvector stores position and weight in the same
+ * uint16 (2 bits for weight, 14 for position). Here we extract that
+ * in two separate arrays.
+ */
+ posv = _POSVECPTR(tsin, arrin + i);
+ positions = palloc(posv->npos * sizeof(Datum));
+ weights = palloc(posv->npos * sizeof(Datum));
+ for (j = 0; j < posv->npos; j++)
+ {
+ positions[j] = Int16GetDatum(WEP_GETPOS(posv->pos[j]));
+ weight = (WEP_GETWEIGHT(posv->pos[j]) >> 2) ?
+ 'D' : 'D' - WEP_GETWEIGHT(posv->pos[j]);
+ weights[j] = PointerGetDatum(cstring_to_text_with_len(&weight, 1));
+ }
+
+ values[1] = PointerGetDatum(
+ construct_array(positions, posv->npos, INT2OID, 2, true, 's'));
+ values[2] = PointerGetDatum(
+ construct_array(weights, posv->npos, TEXTOID, -1, false, 'i'));
+ }
+ else
+ {
+ values[1] = PointerGetDatum(construct_array(NULL, 0, INT2OID, 2, true, 's'));
+ values[2] = PointerGetDatum(construct_array(NULL, 0, TEXTOID, -1, false, 'i'));
+ }
+
+ tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls);
+ PG_FREE_IF_COPY(tsin, 0);
+ SRF_RETURN_NEXT(funcctx, HeapTupleGetDatum(tuple));
+ }
+ else
+ SRF_RETURN_DONE(funcctx);
+}
+
+/*
+ * Convert tsvector to array of lexemes.
+ */
+Datum
+tsvector_to_array(PG_FUNCTION_ARGS)
+{
+ TSVector tsin = PG_GETARG_TSVECTOR(0);
+ WordEntry *arrin = ARRPTR(tsin);
+ Datum elements[tsin->size];
+ int i;
+ ArrayType *array;
+
+ for (i = 0; i < tsin->size; i++)
+ {
+ elements[i] = PointerGetDatum(
+ cstring_to_text_with_len(STRPTR(tsin) + arrin[i].pos, arrin[i].len));
+ }
+ array = construct_array(elements, tsin->size, TEXTOID, -1, false, 'i');
+ PG_FREE_IF_COPY(tsin, 0);
+ PG_RETURN_POINTER(array);
+}
+
+/*
+ * Build tsvector from array of lexemes.
+ */
+Datum
+array_to_tsvector(PG_FUNCTION_ARGS)
+{
+ ArrayType *v = PG_GETARG_ARRAYTYPE_P(0);
+ TSVector tsout;
+ Datum *dlexemes;
+ WordEntry *arrout;
+ bool *nulls;
+ int nitems,
+ i,
+ tslen,
+ datalen = 0;
+ char *cur;
+
+ deconstruct_array(v, TEXTOID, -1, false, 'i', &dlexemes, &nulls, &nitems);
+
+ for (i = 0; i < nitems; i++)
+ datalen += VARSIZE_ANY_EXHDR(dlexemes[i]);
+
+ tslen = CALCDATASIZE(nitems, datalen);
+ tsout = (TSVector) palloc0(tslen);
+ SET_VARSIZE(tsout, tslen);
+ tsout->size = nitems;
+ arrout = ARRPTR(tsout);
+ cur = STRPTR(tsout);
+
+ for (i = 0; i < nitems; i++)
+ {
+ char *lex = VARDATA(dlexemes[i]);
+ int lex_len = VARSIZE_ANY_EXHDR(dlexemes[i]);
+
+ memcpy(cur, lex, lex_len);
+ arrout[i].haspos = 0;
+ arrout[i].len = lex_len;
+ arrout[i].pos = cur - STRPTR(tsout);
+ cur += lex_len;
+ }
+
+ PG_FREE_IF_COPY(v, 0);
+ PG_RETURN_POINTER(tsout);
+}
+
+/*
+ * Leave only elements with given weights from tsvector.
+ */
+Datum
+tsvector_filter(PG_FUNCTION_ARGS)
+{
+ TSVector tsin = PG_GETARG_TSVECTOR(0),
+ tsout;
+ ArrayType *weights = PG_GETARG_ARRAYTYPE_P(1);
+ WordEntry *arrin = ARRPTR(tsin),
+ *arrout;
+ char *datain = STRPTR(tsin),
+ *dataout;
+ Datum *dweights;
+ bool *nulls;
+ int nweigths;
+ int i, j;
+ char mask = 0,
+ cur_pos = 0;
+
+ deconstruct_array(weights, CHAROID, 1, true, 'c',
+ &dweights, &nulls, &nweigths);
+
+ for (i = 0; i < nweigths; i++)
+ {
+ char char_weight = DatumGetChar(dweights[i]);
+ switch (char_weight)
+ {
+ case 'A': case 'a':
+ mask = mask | 8;
+ break;
+ case 'B': case 'b':
+ mask = mask | 4;
+ break;
+ case 'C': case 'c':
+ mask = mask | 2;
+ break;
+ case 'D': case 'd':
+ mask = mask | 1;
+ break;
+ default:
+ /* internal error */
+ elog(ERROR, "unrecognized weight: %c", char_weight);
+ }
+ }
+
+ tsout = (TSVector) palloc0(VARSIZE(tsin));
+ tsout->size = tsin->size;
+ arrout = ARRPTR(tsout);
+ dataout = STRPTR(tsout);
+
+ for (i = j = 0; i < tsin->size; i++)
+ {
+ WordEntryPosVector *posvin,
+ *posvout;
+ int npos = 0;
+ int k;
+
+ if (!arrin[i].haspos)
+ continue;
+
+ posvin = _POSVECPTR(tsin, arrin + i);
+ posvout = (WordEntryPosVector *)(dataout + SHORTALIGN(cur_pos + arrin[i].len));
+
+ for (k = 0; k < posvin->npos; k++)
+ {
+ if (mask & (1 << WEP_GETWEIGHT(posvin->pos[k])))
+ posvout->pos[npos++] = posvin->pos[k];
+ }
+
+ if (!npos) /* no satisfactory positions found, so skip that lexeme */
+ continue;
+
+ arrout[j].haspos = true;
+ arrout[j].len = arrin[i].len;
+ arrout[j].pos = cur_pos;
+
+ memcpy(dataout + cur_pos, datain + arrin[i].pos, arrin[i].len);
+ posvout->npos = npos;
+ cur_pos += SHORTALIGN(arrin[i].len);
+ cur_pos += POSDATALEN(tsout, arrout+j) * sizeof(WordEntryPos) + sizeof(uint16);
+ j++;
+ }
+
+ tsout->size = j;
+ if (dataout != STRPTR(tsout))
+ memmove(STRPTR(tsout), dataout, cur_pos);
+
+ SET_VARSIZE(tsout, CALCDATASIZE(tsout->size, cur_pos));
+
+ PG_FREE_IF_COPY(tsin, 0);
+ PG_RETURN_POINTER(tsout);
+}
Datum
tsvector_concat(PG_FUNCTION_ARGS)
diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h
index 79e92ff..cacf8fe 100644
--- a/src/include/catalog/pg_proc.h
+++ b/src/include/catalog/pg_proc.h
@@ -4476,8 +4476,22 @@ DESCR("number of lexemes");
DATA(insert OID = 3623 ( strip PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 3614 "3614" _null_ _null_ _null_ _null_ _null_ tsvector_strip _null_ _null_ _null_ ));
DESCR("strip position information");
DATA(insert OID = 3624 ( setweight PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 3614 "3614 18" _null_ _null_ _null_ _null_ _null_ tsvector_setweight _null_ _null_ _null_ ));
-DESCR("set weight of lexeme's entries");
-DATA(insert OID = 3625 ( tsvector_concat PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 3614 "3614 3614" _null_ _null_ _null_ _null_ _null_ tsvector_concat _null_ _null_ _null_ ));
+DESCR("set given weight for whole tsvector");
+DATA(insert OID = 3320 ( setweight PGNSP PGUID 12 1 0 0 0 f f f f t f i s 3 0 3614 "3614 18 1009" _null_ _null_ _null_ _null_ _null_ tsvector_setweight_by_filter _null_ _null_ _null_ ));
+DESCR("set given weight for given lexemes");
+DATA(insert OID = 3625 ( tsvector_concat PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 3614 "3614 3614" _null_ _null_ _null_ _null_ _null_ tsvector_concat _null_ _null_ _null_ ));
+DATA(insert OID = 3321 ( delete PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 3614 "3614 25" _null_ _null_ _null_ _null_ _null_ tsvector_delete_str _null_ _null_ _null_ ));
+DESCR("delete lexeme");
+DATA(insert OID = 3323 ( delete PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 3614 "3614 1009" _null_ _null_ _null_ _null_ _null_ tsvector_delete_arr _null_ _null_ _null_ ));
+DESCR("delete given lexemes");
+DATA(insert OID = 3322 ( unnest PGNSP PGUID 12 1 10 0 0 f f f f t t i s 1 0 2249 "3614" "{3614,25,1005,1009}" "{i,o,o,o}" "{tsvector,lexeme,positions,weights}" _null_ _null_ tsvector_unnest _null_ _null_ _null_ ));
+DESCR("expand tsvector to set of rows");
+DATA(insert OID = 3326 ( to_array PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 1009 "3614" _null_ _null_ _null_ _null_ _null_ tsvector_to_array _null_ _null_ _null_ ));
+DESCR("convert to lexeme's array");
+DATA(insert OID = 3318 ( to_tsvector PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 3614 "1009" _null_ _null_ _null_ _null_ _null_ array_to_tsvector _null_ _null_ _null_ ));
+DESCR("build tsvector from lexeme's array");
+DATA(insert OID = 3319 ( filter PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 3614 "3614 1002" _null_ _null_ _null_ _null_ _null_ tsvector_filter _null_ _null_ _null_ ));
+DESCR("returns tsvector that contain only postings with given weights");
DATA(insert OID = 3634 ( ts_match_vq PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 16 "3614 3615" _null_ _null_ _null_ _null_ _null_ ts_match_vq _null_ _null_ _null_ ));
DATA(insert OID = 3635 ( ts_match_qv PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 16 "3615 3614" _null_ _null_ _null_ _null_ _null_ ts_match_qv _null_ _null_ _null_ ));
diff --git a/src/include/tsearch/ts_type.h b/src/include/tsearch/ts_type.h
index dc6067a..e70a303 100644
--- a/src/include/tsearch/ts_type.h
+++ b/src/include/tsearch/ts_type.h
@@ -141,7 +141,16 @@ extern Datum tsvector_cmp(PG_FUNCTION_ARGS);
extern Datum tsvector_length(PG_FUNCTION_ARGS);
extern Datum tsvector_strip(PG_FUNCTION_ARGS);
extern Datum tsvector_setweight(PG_FUNCTION_ARGS);
+extern Datum tsvector_setweight_by_filter(PG_FUNCTION_ARGS);
extern Datum tsvector_concat(PG_FUNCTION_ARGS);
+extern Datum tsvector_delete_str(PG_FUNCTION_ARGS);
+extern Datum tsvector_delete_arr(PG_FUNCTION_ARGS);
+extern Datum tsvector_delete_tsvector(PG_FUNCTION_ARGS);
+extern Datum tsvector_unnest(PG_FUNCTION_ARGS);
+extern Datum tsvector_to_array(PG_FUNCTION_ARGS);
+extern Datum array_to_tsvector(PG_FUNCTION_ARGS);
+extern Datum tsvector_filter(PG_FUNCTION_ARGS);
+extern Datum tsvector_shift(PG_FUNCTION_ARGS);
extern Datum tsvector_update_trigger_byid(PG_FUNCTION_ARGS);
extern Datum tsvector_update_trigger_bycolumn(PG_FUNCTION_ARGS);
diff --git a/src/test/regress/expected/tstypes.out b/src/test/regress/expected/tstypes.out
index 6284fb6..9986ce4 100644
--- a/src/test/regress/expected/tstypes.out
+++ b/src/test/regress/expected/tstypes.out
@@ -83,18 +83,6 @@ SELECT 'a:3A b:2a'::tsvector || 'ba:1234 a:1B';
'a':3A,4B 'b':2A 'ba':1237
(1 row)
-SELECT setweight('w:12B w:13* w:12,5,6 a:1,3* a:3 w asd:1dc asd zxc:81,567,222A'::tsvector, 'c');
- setweight
-----------------------------------------------------------
- 'a':1C,3C 'asd':1C 'w':5C,6C,12C,13C 'zxc':81C,222C,567C
-(1 row)
-
-SELECT strip('w:12B w:13* w:12,5,6 a:1,3* a:3 w asd:1dc asd'::tsvector);
- strip
----------------
- 'a' 'asd' 'w'
-(1 row)
-
--Base tsquery test
SELECT '1'::tsquery;
tsquery
@@ -625,3 +613,204 @@ SELECT ts_rank_cd(' a:1 s:2 d g'::tsvector, 'a & s');
0.1
(1 row)
+-- tsvector editing operations
+SELECT strip('w:12B w:13* w:12,5,6 a:1,3* a:3 w asd:1dc asd'::tsvector);
+ strip
+---------------
+ 'a' 'asd' 'w'
+(1 row)
+
+SELECT strip('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);
+ strip
+----------------------------------------------
+ 'base' 'hidden' 'rebel' 'spaceship' 'strike'
+(1 row)
+
+SELECT strip('base hidden rebel spaceship strike'::tsvector);
+ strip
+----------------------------------------------
+ 'base' 'hidden' 'rebel' 'spaceship' 'strike'
+(1 row)
+
+SELECT delete(to_tsvector('english', 'Rebel spaceships, striking from a hidden base'), 'spaceship');
+ delete
+------------------------------------------
+ 'base':7 'hidden':6 'rebel':1 'strike':3
+(1 row)
+
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, 'base');
+ delete
+--------------------------------------------------------------
+ 'hidden':6 'rebel':1 'spaceship':2,33A,34B,35C,36 'strike':3
+(1 row)
+
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, 'bas');
+ delete
+-----------------------------------------------------------------------
+ 'base':7 'hidden':6 'rebel':1 'spaceship':2,33A,34B,35C,36 'strike':3
+(1 row)
+
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, 'bases');
+ delete
+-----------------------------------------------------------------------
+ 'base':7 'hidden':6 'rebel':1 'spaceship':2,33A,34B,35C,36 'strike':3
+(1 row)
+
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, 'spaceship');
+ delete
+------------------------------------------
+ 'base':7 'hidden':6 'rebel':1 'strike':3
+(1 row)
+
+SELECT delete('base hidden rebel spaceship strike'::tsvector, 'spaceship');
+ delete
+----------------------------------
+ 'base' 'hidden' 'rebel' 'strike'
+(1 row)
+
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, ARRAY['spaceship','rebel']);
+ delete
+--------------------------------
+ 'base':7 'hidden':6 'strike':3
+(1 row)
+
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, ARRAY['spaceships','rebel']);
+ delete
+-------------------------------------------------------------
+ 'base':7 'hidden':6 'spaceship':2,33A,34B,35C,36 'strike':3
+(1 row)
+
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, ARRAY['spaceshi','rebel']);
+ delete
+-------------------------------------------------------------
+ 'base':7 'hidden':6 'spaceship':2,33A,34B,35C,36 'strike':3
+(1 row)
+
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, ARRAY['spaceship','leya','rebel']);
+ delete
+--------------------------------
+ 'base':7 'hidden':6 'strike':3
+(1 row)
+
+SELECT delete('base hidden rebel spaceship strike'::tsvector, ARRAY['spaceship','leya','rebel']);
+ delete
+--------------------------
+ 'base' 'hidden' 'strike'
+(1 row)
+
+SELECT unnest('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);
+ unnest
+---------------------------------------------
+ (base,{7},{D})
+ (hidden,{6},{D})
+ (rebel,{1},{D})
+ (spaceship,"{2,33,34,35,36}","{D,A,B,C,D}")
+ (strike,{3},{D})
+(5 rows)
+
+SELECT unnest('base hidden rebel spaceship strike'::tsvector);
+ unnest
+-------------------
+ (base,{},{})
+ (hidden,{},{})
+ (rebel,{},{})
+ (spaceship,{},{})
+ (strike,{},{})
+(5 rows)
+
+SELECT * FROM unnest('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);
+ lexeme | positions | weights
+-----------+-----------------+-------------
+ base | {7} | {D}
+ hidden | {6} | {D}
+ rebel | {1} | {D}
+ spaceship | {2,33,34,35,36} | {D,A,B,C,D}
+ strike | {3} | {D}
+(5 rows)
+
+SELECT * FROM unnest('base hidden rebel spaceship strike'::tsvector);
+ lexeme | positions | weights
+-----------+-----------+---------
+ base | {} | {}
+ hidden | {} | {}
+ rebel | {} | {}
+ spaceship | {} | {}
+ strike | {} | {}
+(5 rows)
+
+SELECT lexeme, positions[1] from unnest('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);
+ lexeme | positions
+-----------+-----------
+ base | 7
+ hidden | 6
+ rebel | 1
+ spaceship | 2
+ strike | 3
+(5 rows)
+
+SELECT to_array('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);
+ to_array
+--------------------------------------
+ {base,hidden,rebel,spaceship,strike}
+(1 row)
+
+SELECT to_array('base hidden rebel spaceship strike'::tsvector);
+ to_array
+--------------------------------------
+ {base,hidden,rebel,spaceship,strike}
+(1 row)
+
+SELECT to_tsvector(ARRAY['base','hidden','rebel','spaceship','strike']);
+ to_tsvector
+----------------------------------------------
+ 'base' 'hidden' 'rebel' 'spaceship' 'strike'
+(1 row)
+
+SELECT setweight('w:12B w:13* w:12,5,6 a:1,3* a:3 w asd:1dc asd zxc:81,567,222A'::tsvector, 'c');
+ setweight
+----------------------------------------------------------
+ 'a':1C,3C 'asd':1C 'w':5C,6C,12C,13C 'zxc':81C,222C,567C
+(1 row)
+
+SELECT setweight('a:1,3A asd:1C w:5,6,12B,13A zxc:81,222A,567'::tsvector, 'c');
+ setweight
+----------------------------------------------------------
+ 'a':1C,3C 'asd':1C 'w':5C,6C,12C,13C 'zxc':81C,222C,567C
+(1 row)
+
+SELECT setweight('a:1,3A asd:1C w:5,6,12B,13A zxc:81,222A,567'::tsvector, 'c', '{a}');
+ setweight
+------------------------------------------------------
+ 'a':1C,3C 'asd':1C 'w':5,6,12B,13A 'zxc':81,222A,567
+(1 row)
+
+SELECT setweight('a:1,3A asd:1C w:5,6,12B,13A zxc:81,222A,567'::tsvector, 'c', '{a}');
+ setweight
+------------------------------------------------------
+ 'a':1C,3C 'asd':1C 'w':5,6,12B,13A 'zxc':81,222A,567
+(1 row)
+
+SELECT setweight('a:1,3A asd:1C w:5,6,12B,13A zxc:81,222A,567'::tsvector, 'c', '{a,zxc}');
+ setweight
+--------------------------------------------------------
+ 'a':1C,3C 'asd':1C 'w':5,6,12B,13A 'zxc':81C,222C,567C
+(1 row)
+
+SELECT setweight('a asd w:5,6,12B,13A zxc'::tsvector, 'c', '{a,zxc}');
+ setweight
+---------------------------------
+ 'a' 'asd' 'w':5,6,12B,13A 'zxc'
+(1 row)
+
+SELECT filter('base:7A empir:17 evil:15 first:11 galact:16 hidden:6A rebel:1A spaceship:2A strike:3A victori:12 won:9'::tsvector, '{a}');
+ filter
+-------------------------------------------------------------
+ 'base':7A 'hidden':6A 'rebel':1A 'spaceship':2A 'strike':3A
+(1 row)
+
+SELECT filter('base hidden rebel spaceship strike'::tsvector, '{a}');
+ filter
+--------
+
+(1 row)
+
diff --git a/src/test/regress/sql/tstypes.sql b/src/test/regress/sql/tstypes.sql
index fd7c702..cef5f46 100644
--- a/src/test/regress/sql/tstypes.sql
+++ b/src/test/regress/sql/tstypes.sql
@@ -14,8 +14,6 @@ SELECT $$'\\as' ab\c ab\\c AB\\\c ab\\\\c$$::tsvector;
SELECT tsvectorin(tsvectorout($$'\\as' ab\c ab\\c AB\\\c ab\\\\c$$::tsvector));
SELECT '''w'':4A,3B,2C,1D,5 a:8';
SELECT 'a:3A b:2a'::tsvector || 'ba:1234 a:1B';
-SELECT setweight('w:12B w:13* w:12,5,6 a:1,3* a:3 w asd:1dc asd zxc:81,567,222A'::tsvector, 'c');
-SELECT strip('w:12B w:13* w:12,5,6 a:1,3* a:3 w asd:1dc asd'::tsvector);
--Base tsquery test
SELECT '1'::tsquery;
@@ -115,3 +113,44 @@ SELECT ts_rank_cd(' a:1 s:2 d g'::tsvector, 'a | s');
SELECT ts_rank_cd(' a:1 s:2C d g'::tsvector, 'a & s');
SELECT ts_rank_cd(' a:1 s:2B d g'::tsvector, 'a & s');
SELECT ts_rank_cd(' a:1 s:2 d g'::tsvector, 'a & s');
+
+-- tsvector editing operations
+
+SELECT strip('w:12B w:13* w:12,5,6 a:1,3* a:3 w asd:1dc asd'::tsvector);
+SELECT strip('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);
+SELECT strip('base hidden rebel spaceship strike'::tsvector);
+
+SELECT delete(to_tsvector('english', 'Rebel spaceships, striking from a hidden base'), 'spaceship');
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, 'base');
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, 'bas');
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, 'bases');
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, 'spaceship');
+SELECT delete('base hidden rebel spaceship strike'::tsvector, 'spaceship');
+
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, ARRAY['spaceship','rebel']);
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, ARRAY['spaceships','rebel']);
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, ARRAY['spaceshi','rebel']);
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, ARRAY['spaceship','leya','rebel']);
+SELECT delete('base hidden rebel spaceship strike'::tsvector, ARRAY['spaceship','leya','rebel']);
+
+SELECT unnest('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);
+SELECT unnest('base hidden rebel spaceship strike'::tsvector);
+SELECT * FROM unnest('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);
+SELECT * FROM unnest('base hidden rebel spaceship strike'::tsvector);
+SELECT lexeme, positions[1] from unnest('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);
+
+SELECT to_array('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);
+SELECT to_array('base hidden rebel spaceship strike'::tsvector);
+
+SELECT to_tsvector(ARRAY['base','hidden','rebel','spaceship','strike']);
+
+SELECT setweight('w:12B w:13* w:12,5,6 a:1,3* a:3 w asd:1dc asd zxc:81,567,222A'::tsvector, 'c');
+SELECT setweight('a:1,3A asd:1C w:5,6,12B,13A zxc:81,222A,567'::tsvector, 'c');
+SELECT setweight('a:1,3A asd:1C w:5,6,12B,13A zxc:81,222A,567'::tsvector, 'c', '{a}');
+SELECT setweight('a:1,3A asd:1C w:5,6,12B,13A zxc:81,222A,567'::tsvector, 'c', '{a}');
+SELECT setweight('a:1,3A asd:1C w:5,6,12B,13A zxc:81,222A,567'::tsvector, 'c', '{a,zxc}');
+SELECT setweight('a asd w:5,6,12B,13A zxc'::tsvector, 'c', '{a,zxc}');
+
+SELECT filter('base:7A empir:17 evil:15 first:11 galact:16 hidden:6A rebel:1A spaceship:2A strike:3A victori:12 won:9'::tsvector, '{a}');
+SELECT filter('base hidden rebel spaceship strike'::tsvector, '{a}');
+
tsvector_ops-v2.2.diffapplication/octet-stream; name=tsvector_ops-v2.2.diffDownload
diff --git a/src/backend/utils/adt/tsvector_op.c b/src/backend/utils/adt/tsvector_op.c
index cb4acb1..42dfb5f 100644
--- a/src/backend/utils/adt/tsvector_op.c
+++ b/src/backend/utils/adt/tsvector_op.c
@@ -228,7 +228,7 @@ tsvector_setweight(PG_FUNCTION_ARGS)
break;
default:
/* internal error */
- elog(ERROR, "unrecognized weight: %d", cw);
+ elog(ERROR, "unrecognized weight: %c", cw);
}
out = (TSVector) palloc(VARSIZE(in));
Some notices:
1 tsin in documentation doesn't look like a good name. Changed to vector similar
to other places.
2 I did some editorization about freeing memory/forgotten names etc
3 It seems to me that tsvector_unnest() could be seriously optimized for
large tsvectors: with current coding it detoasts/decompresses tsvector value on
each call. Much better to do it once in
multi_call_memory_ctx context at first call init
4 It seems debatable returning empty array for position/weight if they are absent:
=# select * from unnest('a:1 b'::tsvector);
lexeme | positions | weights
--------+-----------+---------
a | {1} | {D}
b | {} | {}
I think, it's better to return NULL in this case
5
array_to_tsvector/tsvector_setweight_by_filter/tsvector_delete_arr/tsvector_filter
doesn't check or pay attention to NULL elements in input arrays
--
Teodor Sigaev E-mail: teodor@sigaev.ru
WWW: http://www.sigaev.ru/
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
Some notices:
1 tsin in documentation doesn't look like a good name. Changed to vector similar
to other places.2 I did some editorization about freeing memory/forgotten names etc
Ooops, forgot to attach
--
Teodor Sigaev E-mail: teodor@sigaev.ru
WWW: http://www.sigaev.ru/
Attachments:
tsvector_ops-v3.1.difftext/x-patch; name=tsvector_ops-v3.1.diffDownload
diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index 139aa2b..9c294e3 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -9168,16 +9168,29 @@ CREATE TYPE rainbow AS ENUM ('red', 'orange', 'yellow', 'green', 'blue', 'purple
<indexterm>
<primary>setweight</primary>
</indexterm>
- <literal><function>setweight(<type>tsvector</>, <type>"char"</>)</function></literal>
+ <literal><function>setweight(<replaceable class="PARAMETER">vector</replaceable> <type>tsvector</>, <replaceable class="PARAMETER">weight</replaceable> <type>"char"</>)</function></literal>
</entry>
<entry><type>tsvector</type></entry>
- <entry>assign weight to each element of <type>tsvector</></entry>
+ <entry>assign <replaceable class="PARAMETER">weight</replaceable> to each element of <replaceable class="PARAMETER">vector</replaceable></entry>
<entry><literal>setweight('fat:2,4 cat:3 rat:5B'::tsvector, 'A')</literal></entry>
<entry><literal>'cat':3A 'fat':2A,4A 'rat':5A</literal></entry>
</row>
<row>
<entry>
<indexterm>
+ <primary>setweight</primary>
+ <secondary>setweight by filter</secondary>
+ </indexterm>
+ <literal><function>setweight(<replaceable class="PARAMETER">vector</replaceable> <type>tsvector</>, <replaceable class="PARAMETER">weight</replaceable> <type>"char"</>, <replaceable class="PARAMETER">lexemes</replaceable> <type>"text"[]</>)</function></literal>
+ </entry>
+ <entry><type>tsvector</type></entry>
+ <entry>assign <replaceable class="PARAMETER">weight</replaceable> to elements of <replaceable class="PARAMETER">vector</replaceable> that are listed in <replaceable class="PARAMETER">lexemes</replaceable> array</entry>
+ <entry><literal>setweight('fat:2,4 cat:3 rat:5B'::tsvector, 'A', '{cat,rat}')</literal></entry>
+ <entry><literal>'cat':3A 'fat':2,4 'rat':5A</literal></entry>
+ </row>
+ <row>
+ <entry>
+ <indexterm>
<primary>strip</primary>
</indexterm>
<literal><function>strip(<type>tsvector</>)</function></literal>
@@ -9190,6 +9203,84 @@ CREATE TYPE rainbow AS ENUM ('red', 'orange', 'yellow', 'green', 'blue', 'purple
<row>
<entry>
<indexterm>
+ <primary>delete</primary>
+ <secondary>delete lemexeme</secondary>
+ </indexterm>
+ <literal><function>delete(<replaceable class="PARAMETER">vector</replaceable> <type>tsvector</>, <replaceable class="PARAMETER">lexeme</replaceable> <type>text</>)</function></literal>
+ </entry>
+ <entry><type>tsvector</type></entry>
+ <entry>remove given <replaceable class="PARAMETER">lexeme</replaceable> from <replaceable class="PARAMETER">vector</replaceable></entry>
+ <entry><literal>delete('fat:2,4 cat:3 rat:5A'::tsvector, 'fat')</literal></entry>
+ <entry><literal>'cat':3 'rat':5A</literal></entry>
+ </row>
+ <row>
+ <entry>
+ <indexterm>
+ <primary>delete</primary>
+ <secondary>delete lemexemes array</secondary>
+ </indexterm>
+ <literal><function>delete(<replaceable class="PARAMETER">vector</replaceable> <type>tsvector</>, <replaceable class="PARAMETER">lexemes</replaceable> <type>text[]</>)</function></literal>
+ </entry>
+ <entry><type>tsvector</type></entry>
+ <entry>remove any occurrence of lexemes in <replaceable class="PARAMETER">lexemes</replaceable> array from <replaceable class="PARAMETER">vector</replaceable></entry>
+ <entry><literal>delete('fat:2,4 cat:3 rat:5A'::tsvector, ARRAY['fat','rat'])</literal></entry>
+ <entry><literal>'cat':3</literal></entry>
+ </row>
+ <row>
+ <entry>
+ <indexterm>
+ <primary>unnest</primary>
+ </indexterm>
+ <literal><function>unnest(<type>tsvector</>)</function></literal>
+ </entry>
+ <entry><type>setof anyelement</type></entry>
+ <entry>expand a tsvector to a set of rows. Each row has following columns: lexeme, postings, weights.</entry>
+ <entry><literal>unnest('fat:2,4 cat:3 rat:5A'::tsvector)</literal></entry>
+ <entry><literallayout class="monospaced">cat {3} {A}
+fat {2,4} {D,D}
+rat {5} {A}
+(3 rows)</literallayout></entry>
+ </row>
+ <row>
+ <entry>
+ <indexterm>
+ <primary>to_array</primary>
+ </indexterm>
+ <literal><function>to_array(<type>tsvector</>)</function></literal>
+ </entry>
+ <entry><type>text[]</type></entry>
+ <entry>convert <type>tsvector</> to array of lexemes</entry>
+ <entry><literal>to_array('fat:2,4 cat:3 rat:5A'::tsvector)</literal></entry>
+ <entry><literal>{cat,fat,rat}</literal></entry>
+ </row>
+ <row>
+ <entry>
+ <indexterm>
+ <primary>to_tsvector</primary>
+ <secondary>array to tsvector</secondary>
+ </indexterm>
+ <literal><function>to_tsvector(<type>text[]</>)</function></literal>
+ </entry>
+ <entry><type>tsvector</type></entry>
+ <entry>convert array of lexemes to <type>tsvector</type></entry>
+ <entry><literal>to_tsvector('{fat,cat,rat}'::text[])</literal></entry>
+ <entry><literal>'fat' 'cat' 'rat'</literal></entry>
+ </row>
+ <row>
+ <entry>
+ <indexterm>
+ <primary>filter</primary>
+ </indexterm>
+ <literal><function>filter(<replaceable class="PARAMETER">vector</replaceable> <type>tsvector</>, <replaceable class="PARAMETER">weights</replaceable> <type>"char"[]</>)</function></literal>
+ </entry>
+ <entry><type>tsvector</type></entry>
+ <entry>Select only elements with given <replaceable class="PARAMETER">weights</replaceable> from <replaceable class="PARAMETER">vector</replaceable></entry>
+ <entry><literal>filter('fat:2,4 cat:3b rat:5A'::tsvector, '{a,b}')</literal></entry>
+ <entry><literal>'cat':3B 'rat':5A</literal></entry>
+ </row>
+ <row>
+ <entry>
+ <indexterm>
<primary>to_tsquery</primary>
</indexterm>
<literal><function>to_tsquery(<optional> <replaceable class="PARAMETER">config</> <type>regconfig</> , </optional> <replaceable class="PARAMETER">query</> <type>text</type>)</function></literal>
diff --git a/doc/src/sgml/textsearch.sgml b/doc/src/sgml/textsearch.sgml
index d66b4d5..32033fa 100644
--- a/doc/src/sgml/textsearch.sgml
+++ b/doc/src/sgml/textsearch.sgml
@@ -1326,6 +1326,10 @@ FROM (SELECT id, body, q, ts_rank_cd(ti, q) AS rank
</variablelist>
+ <para>
+ Full list of <type>tsvector</>-related functions available in <xref linkend="textsearch-functions-table">.
+ </para>
+
</sect2>
<sect2 id="textsearch-manipulate-tsquery">
diff --git a/src/backend/utils/adt/tsvector_op.c b/src/backend/utils/adt/tsvector_op.c
index a3f1c361..e7ea270 100644
--- a/src/backend/utils/adt/tsvector_op.c
+++ b/src/backend/utils/adt/tsvector_op.c
@@ -14,6 +14,7 @@
#include "postgres.h"
+#include "access/htup_details.h"
#include "catalog/namespace.h"
#include "catalog/pg_type.h"
#include "commands/trigger.h"
@@ -65,6 +66,7 @@ typedef struct
#define STATHDRSIZE (offsetof(TSVectorStat, data))
static Datum tsvector_update_trigger(PG_FUNCTION_ARGS, bool config_column);
+static int tsvector_bsearch(TSVector tsin, char *lexin, int lexin_len);
/*
* Order: haspos, len, word, for all positions (pos, weight)
@@ -251,6 +253,81 @@ tsvector_setweight(PG_FUNCTION_ARGS)
PG_RETURN_POINTER(out);
}
+/*
+ * setweight(tsin tsvector, char_weight "char", lexemes "text"[])
+ *
+ * Assign weight w to elements of tsin that are listed in lexemes.
+ */
+Datum
+tsvector_setweight_by_filter(PG_FUNCTION_ARGS)
+{
+ TSVector tsin = PG_GETARG_TSVECTOR(0);
+ char char_weight = PG_GETARG_CHAR(1);
+ ArrayType *lexemes = PG_GETARG_ARRAYTYPE_P(2);
+
+ TSVector tsout;
+ int i,
+ j,
+ nlexemes,
+ weight;
+ WordEntry *entry;
+ Datum *dlexemes;
+ bool *nulls;
+
+ switch (char_weight)
+ {
+ case 'A': case 'a':
+ weight = 3;
+ break;
+ case 'B': case 'b':
+ weight = 2;
+ break;
+ case 'C': case 'c':
+ weight = 1;
+ break;
+ case 'D': case 'd':
+ weight = 0;
+ break;
+ default:
+ /* internal error */
+ elog(ERROR, "unrecognized weight: %c", char_weight);
+ }
+
+ tsout = (TSVector) palloc(VARSIZE(tsin));
+ memcpy(tsout, tsin, VARSIZE(tsin));
+ entry = ARRPTR(tsout);
+
+ deconstruct_array(lexemes, TEXTOID, -1, false, 'i',
+ &dlexemes, &nulls, &nlexemes);
+
+ /*
+ * Assuming that lexemes array is significantly shorter than tsvector
+ * we can iterate through lexemes performing binary search
+ * of each lexeme from lexemes in tsvector.
+ */
+ for (i = 0; i < nlexemes; i++)
+ {
+ char *lex = VARDATA(dlexemes[i]);
+ int lex_len = VARSIZE_ANY_EXHDR(dlexemes[i]);
+ int lex_pos = tsvector_bsearch(tsout, lex, lex_len);
+
+ if (lex_pos >= 0 && (j = POSDATALEN(tsout, entry + lex_pos)) != 0)
+ {
+ WordEntryPos *p = POSDATAPTR(tsout, entry + lex_pos);
+ while (j--)
+ {
+ WEP_SETWEIGHT(*p, weight);
+ p++;
+ }
+ }
+ }
+
+ PG_FREE_IF_COPY(tsin, 0);
+ PG_FREE_IF_COPY(lexemes, 2);
+
+ PG_RETURN_POINTER(tsout);
+}
+
#define compareEntry(pa, a, pb, b) \
tsCompareString((pa) + (a)->pos, (a)->len, \
(pb) + (b)->pos, (b)->len, \
@@ -291,6 +368,445 @@ add_pos(TSVector src, WordEntry *srcptr,
return *clen - startlen;
}
+/*
+ * Perform binary search of given lexeme in TSVector.
+ * Returns lexeme position in TSVector's entry array or -1 if lexeme wasn't
+ * found.
+ */
+static int
+tsvector_bsearch(const TSVector tsv, char *lexeme, int lexeme_len)
+{
+ WordEntry *arrin = ARRPTR(tsv);
+ int StopLow = 0,
+ StopHigh = tsv->size,
+ StopMiddle,
+ cmp;
+
+ while (StopLow < StopHigh)
+ {
+ StopMiddle = (StopLow + StopHigh)/2;
+
+ cmp = tsCompareString(lexeme, lexeme_len,
+ STRPTR(tsv) + arrin[StopMiddle].pos,
+ arrin[StopMiddle].len,
+ false);
+
+ if (cmp < 0)
+ StopHigh = StopMiddle;
+ else if (cmp > 0)
+ StopLow = StopMiddle + 1;
+ else /* found it */
+ return StopMiddle;
+ }
+
+ return -1;
+}
+
+static int
+compareint(const void *va, const void *vb)
+{
+ int32 a = *((const int32 *) va);
+ int32 b = *((const int32 *) vb);
+
+ if (a == b)
+ return 0;
+ return (a > b) ? 1 : -1;
+}
+
+/*
+ * Internal routine to delete lexemes from TSVector by array of offsets.
+ *
+ * int *indices_to_delete -- array of lexeme offsets to delete
+ * int indices_count -- size of that array
+ *
+ * Returns new TSVector without given lexemes along with their positions
+ * and weights.
+ */
+static TSVector
+tsvector_delete_by_indices(TSVector tsv, int *indices_to_delete, int indices_count)
+{
+ TSVector tsout;
+ WordEntry *arrin = ARRPTR(tsv),
+ *arrout;
+ char *data = STRPTR(tsv),
+ *dataout;
+ int i, j, k,
+ curoff;
+
+ /*
+ * Here we overestimates tsout size, since we don't know exact size
+ * occupied by positions and weights. We will set exact size later
+ * after a pass through TSVector.
+ */
+ tsout = (TSVector) palloc0(VARSIZE(tsv));
+ arrout = ARRPTR(tsout);
+ tsout->size = tsv->size - indices_count;
+
+ /* Sort our filter array to simplify membership check later. */
+ if (indices_count > 1)
+ qsort(indices_to_delete, indices_count, sizeof(int), compareint);
+
+ /*
+ * Copy tsv to tsout skipping lexemes that enlisted in indices_to_delete.
+ */
+ curoff = 0;
+ dataout = STRPTR(tsout);
+ for (i = j = k = 0; i < tsv->size; i++)
+ {
+ /*
+ * Here we should check whether current i is present in indices_to_delete
+ * or not. Since indices_to_delete is already sorted we can advance
+ * it index only when we have match.
+ */
+ if (k < indices_count && i == indices_to_delete[k]){
+ k++;
+ continue;
+ }
+
+ /* Copy lexeme, it's positions and weights */
+ memcpy(dataout + curoff, data + arrin[i].pos, arrin[i].len);
+ arrout[j].haspos = arrin[i].haspos;
+ arrout[j].len = arrin[i].len;
+ arrout[j].pos = curoff;
+ curoff += arrin[i].len;
+ if (arrin[i].haspos)
+ {
+ int len = POSDATALEN(tsv, arrin+i) * sizeof(WordEntryPos) + sizeof(uint16);
+ curoff = SHORTALIGN(curoff);
+ memcpy(dataout + curoff, (STRPTR(tsv) + SHORTALIGN(arrin[i].pos + arrin[i].len)), len);
+ curoff += len;
+ }
+
+ j++;
+ }
+
+ /*
+ * After the pass through TSVector k should equals exactly to indices_count.
+ * If it isn't then the caller provided us with indices outside of
+ * [0, tsv->size) range and estimation of tsout's size is wrong.
+ */
+ Assert(k == indices_count);
+
+ SET_VARSIZE(tsout, CALCDATASIZE(tsout->size, curoff));
+ return tsout;
+}
+
+/*
+ * Delete given lexeme from tsvector.
+ * Implementation of user-level delete(tsvector, text).
+ */
+Datum
+tsvector_delete_str(PG_FUNCTION_ARGS)
+{
+ TSVector tsin = PG_GETARG_TSVECTOR(0),
+ tsout;
+ text *tlexeme = PG_GETARG_TEXT_P(1);
+ char *lexeme = VARDATA(tlexeme);
+ int lexeme_len = VARSIZE_ANY_EXHDR(tlexeme),
+ skip_index;
+
+ if ((skip_index = tsvector_bsearch(tsin, lexeme, lexeme_len)) == -1)
+ PG_RETURN_POINTER(tsin);
+
+ tsout = tsvector_delete_by_indices(tsin, &skip_index, 1);
+
+ PG_FREE_IF_COPY(tsin, 0);
+ PG_FREE_IF_COPY(tlexeme, 1);
+ PG_RETURN_POINTER(tsout);
+}
+
+/*
+ * Delete given array of lexemes from tsvector.
+ * Implementation of user-level delete(tsvector, text[]).
+ */
+Datum
+tsvector_delete_arr(PG_FUNCTION_ARGS)
+{
+ TSVector tsin = PG_GETARG_TSVECTOR(0),
+ tsout;
+ ArrayType *lexemes = PG_GETARG_ARRAYTYPE_P(1);
+ int i, nlex,
+ skip_count,
+ *skip_indices;
+ Datum *dlexemes;
+ bool *nulls;
+
+ deconstruct_array(lexemes, TEXTOID, -1, false, 'i',
+ &dlexemes, &nulls, &nlex);
+
+ /*
+ * In typical use case array of lexemes to delete is relatively small.
+ * So here we optimizing things for that scenario: iterate through lexarr
+ * performing binary search of each lexeme from lexarr in tsvector.
+ */
+ skip_indices = palloc0(nlex * sizeof(int));
+ for (i = skip_count = 0; i < nlex; i++)
+ {
+ char *lex = VARDATA(dlexemes[i]);
+ int lex_len = VARSIZE_ANY_EXHDR(dlexemes[i]);
+ int lex_pos = tsvector_bsearch(tsin, lex, lex_len);
+
+ if (lex_pos >= 0)
+ skip_indices[skip_count++] = lex_pos;
+ }
+
+ tsout = tsvector_delete_by_indices(tsin, skip_indices, skip_count);
+
+ pfree(skip_indices);
+ PG_FREE_IF_COPY(tsin, 0);
+ PG_FREE_IF_COPY(lexemes, 1);
+
+ PG_RETURN_POINTER(tsout);
+}
+
+/*
+ * Expand tsvector as table with following columns:
+ * lexeme: lexeme text
+ * positions: integer array of lexeme positions
+ * weights: char array of weights corresponding to positions
+ */
+Datum
+tsvector_unnest(PG_FUNCTION_ARGS)
+{
+ FuncCallContext *funcctx;
+ TSVector tsin = PG_GETARG_TSVECTOR(0);
+ WordEntry *arrin = ARRPTR(tsin);
+ char *data = STRPTR(tsin);
+
+ if (SRF_IS_FIRSTCALL())
+ {
+ MemoryContext oldcontext;
+ TupleDesc tupdesc;
+
+ funcctx = SRF_FIRSTCALL_INIT();
+ oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
+
+ tupdesc = CreateTemplateTupleDesc(3, false);
+ TupleDescInitEntry(tupdesc, (AttrNumber) 1, "lexeme",
+ TEXTOID, -1, 0);
+ TupleDescInitEntry(tupdesc, (AttrNumber) 2, "positions",
+ INT2ARRAYOID, -1, 0);
+ TupleDescInitEntry(tupdesc, (AttrNumber) 3, "weights",
+ TEXTARRAYOID, -1, 0);
+ funcctx->tuple_desc = BlessTupleDesc(tupdesc);
+
+ MemoryContextSwitchTo(oldcontext);
+ }
+
+ funcctx = SRF_PERCALL_SETUP();
+
+ if (funcctx->call_cntr < tsin->size)
+ {
+ HeapTuple tuple;
+ int j,
+ i = funcctx->call_cntr;
+ bool nulls[] = {false, false, false};
+ Datum values[3];
+
+ values[0] = PointerGetDatum(cstring_to_text_with_len(data + arrin[i].pos, arrin[i].len));
+
+ if (arrin[i].haspos)
+ {
+ WordEntryPosVector *posv;
+ Datum *positions;
+ Datum *weights;
+ char weight;
+
+ /*
+ * Internally tsvector stores position and weight in the same
+ * uint16 (2 bits for weight, 14 for position). Here we extract that
+ * in two separate arrays.
+ */
+ posv = _POSVECPTR(tsin, arrin + i);
+ positions = palloc(posv->npos * sizeof(Datum));
+ weights = palloc(posv->npos * sizeof(Datum));
+ for (j = 0; j < posv->npos; j++)
+ {
+ positions[j] = Int16GetDatum(WEP_GETPOS(posv->pos[j]));
+ weight = (WEP_GETWEIGHT(posv->pos[j]) >> 2) ?
+ 'D' : 'D' - WEP_GETWEIGHT(posv->pos[j]);
+ weights[j] = PointerGetDatum(cstring_to_text_with_len(&weight, 1));
+ }
+
+ values[1] = PointerGetDatum(
+ construct_array(positions, posv->npos, INT2OID, 2, true, 's'));
+ values[2] = PointerGetDatum(
+ construct_array(weights, posv->npos, TEXTOID, -1, false, 'i'));
+ }
+ else
+ {
+ values[1] = PointerGetDatum(construct_array(NULL, 0, INT2OID, 2, true, 's'));
+ values[2] = PointerGetDatum(construct_array(NULL, 0, TEXTOID, -1, false, 'i'));
+ }
+
+ tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls);
+ PG_FREE_IF_COPY(tsin, 0);
+ SRF_RETURN_NEXT(funcctx, HeapTupleGetDatum(tuple));
+ }
+ else
+ SRF_RETURN_DONE(funcctx);
+}
+
+/*
+ * Convert tsvector to array of lexemes.
+ */
+Datum
+tsvector_to_array(PG_FUNCTION_ARGS)
+{
+ TSVector tsin = PG_GETARG_TSVECTOR(0);
+ WordEntry *arrin = ARRPTR(tsin);
+ Datum elements[tsin->size];
+ int i;
+ ArrayType *array;
+
+ for (i = 0; i < tsin->size; i++)
+ {
+ elements[i] = PointerGetDatum(
+ cstring_to_text_with_len(STRPTR(tsin) + arrin[i].pos, arrin[i].len));
+ }
+ array = construct_array(elements, tsin->size, TEXTOID, -1, false, 'i');
+ PG_FREE_IF_COPY(tsin, 0);
+ PG_RETURN_POINTER(array);
+}
+
+/*
+ * Build tsvector from array of lexemes.
+ */
+Datum
+array_to_tsvector(PG_FUNCTION_ARGS)
+{
+ ArrayType *v = PG_GETARG_ARRAYTYPE_P(0);
+ TSVector tsout;
+ Datum *dlexemes;
+ WordEntry *arrout;
+ bool *nulls;
+ int nitems,
+ i,
+ tslen,
+ datalen = 0;
+ char *cur;
+
+ deconstruct_array(v, TEXTOID, -1, false, 'i', &dlexemes, &nulls, &nitems);
+
+ for (i = 0; i < nitems; i++)
+ datalen += VARSIZE_ANY_EXHDR(dlexemes[i]);
+
+ tslen = CALCDATASIZE(nitems, datalen);
+ tsout = (TSVector) palloc0(tslen);
+ SET_VARSIZE(tsout, tslen);
+ tsout->size = nitems;
+ arrout = ARRPTR(tsout);
+ cur = STRPTR(tsout);
+
+ for (i = 0; i < nitems; i++)
+ {
+ char *lex = VARDATA(dlexemes[i]);
+ int lex_len = VARSIZE_ANY_EXHDR(dlexemes[i]);
+
+ memcpy(cur, lex, lex_len);
+ arrout[i].haspos = 0;
+ arrout[i].len = lex_len;
+ arrout[i].pos = cur - STRPTR(tsout);
+ cur += lex_len;
+ }
+
+ PG_FREE_IF_COPY(v, 0);
+ PG_RETURN_POINTER(tsout);
+}
+
+/*
+ * Leave only elements with given weights from tsvector.
+ */
+Datum
+tsvector_filter(PG_FUNCTION_ARGS)
+{
+ TSVector tsin = PG_GETARG_TSVECTOR(0),
+ tsout;
+ ArrayType *weights = PG_GETARG_ARRAYTYPE_P(1);
+ WordEntry *arrin = ARRPTR(tsin),
+ *arrout;
+ char *datain = STRPTR(tsin),
+ *dataout;
+ Datum *dweights;
+ bool *nulls;
+ int nweigths;
+ int i, j;
+ char mask = 0,
+ cur_pos = 0;
+
+ deconstruct_array(weights, CHAROID, 1, true, 'c',
+ &dweights, &nulls, &nweigths);
+
+ for (i = 0; i < nweigths; i++)
+ {
+ char char_weight = DatumGetChar(dweights[i]);
+ switch (char_weight)
+ {
+ case 'A': case 'a':
+ mask = mask | 8;
+ break;
+ case 'B': case 'b':
+ mask = mask | 4;
+ break;
+ case 'C': case 'c':
+ mask = mask | 2;
+ break;
+ case 'D': case 'd':
+ mask = mask | 1;
+ break;
+ default:
+ /* internal error */
+ elog(ERROR, "unrecognized weight: %c", char_weight);
+ }
+ }
+
+ tsout = (TSVector) palloc0(VARSIZE(tsin));
+ tsout->size = tsin->size;
+ arrout = ARRPTR(tsout);
+ dataout = STRPTR(tsout);
+
+ for (i = j = 0; i < tsin->size; i++)
+ {
+ WordEntryPosVector *posvin,
+ *posvout;
+ int npos = 0;
+ int k;
+
+ if (!arrin[i].haspos)
+ continue;
+
+ posvin = _POSVECPTR(tsin, arrin + i);
+ posvout = (WordEntryPosVector *)(dataout + SHORTALIGN(cur_pos + arrin[i].len));
+
+ for (k = 0; k < posvin->npos; k++)
+ {
+ if (mask & (1 << WEP_GETWEIGHT(posvin->pos[k])))
+ posvout->pos[npos++] = posvin->pos[k];
+ }
+
+ if (!npos) /* no satisfactory positions found, so skip that lexeme */
+ continue;
+
+ arrout[j].haspos = true;
+ arrout[j].len = arrin[i].len;
+ arrout[j].pos = cur_pos;
+
+ memcpy(dataout + cur_pos, datain + arrin[i].pos, arrin[i].len);
+ posvout->npos = npos;
+ cur_pos += SHORTALIGN(arrin[i].len);
+ cur_pos += POSDATALEN(tsout, arrout+j) * sizeof(WordEntryPos) + sizeof(uint16);
+ j++;
+ }
+
+ tsout->size = j;
+ if (dataout != STRPTR(tsout))
+ memmove(STRPTR(tsout), dataout, cur_pos);
+
+ SET_VARSIZE(tsout, CALCDATASIZE(tsout->size, cur_pos));
+
+ PG_FREE_IF_COPY(tsin, 0);
+ PG_RETURN_POINTER(tsout);
+}
Datum
tsvector_concat(PG_FUNCTION_ARGS)
diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h
index ba8760b..9f3be57 100644
--- a/src/include/catalog/pg_proc.h
+++ b/src/include/catalog/pg_proc.h
@@ -4476,8 +4476,22 @@ DESCR("number of lexemes");
DATA(insert OID = 3623 ( strip PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 3614 "3614" _null_ _null_ _null_ _null_ _null_ tsvector_strip _null_ _null_ _null_ ));
DESCR("strip position information");
DATA(insert OID = 3624 ( setweight PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 3614 "3614 18" _null_ _null_ _null_ _null_ _null_ tsvector_setweight _null_ _null_ _null_ ));
-DESCR("set weight of lexeme's entries");
-DATA(insert OID = 3625 ( tsvector_concat PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 3614 "3614 3614" _null_ _null_ _null_ _null_ _null_ tsvector_concat _null_ _null_ _null_ ));
+DESCR("set given weight for whole tsvector");
+DATA(insert OID = 3320 ( setweight PGNSP PGUID 12 1 0 0 0 f f f f t f i s 3 0 3614 "3614 18 1009" _null_ _null_ _null_ _null_ _null_ tsvector_setweight_by_filter _null_ _null_ _null_ ));
+DESCR("set given weight for given lexemes");
+DATA(insert OID = 3625 ( tsvector_concat PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 3614 "3614 3614" _null_ _null_ _null_ _null_ _null_ tsvector_concat _null_ _null_ _null_ ));
+DATA(insert OID = 3321 ( delete PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 3614 "3614 25" _null_ _null_ _null_ _null_ _null_ tsvector_delete_str _null_ _null_ _null_ ));
+DESCR("delete lexeme");
+DATA(insert OID = 3323 ( delete PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 3614 "3614 1009" _null_ _null_ _null_ _null_ _null_ tsvector_delete_arr _null_ _null_ _null_ ));
+DESCR("delete given lexemes");
+DATA(insert OID = 3322 ( unnest PGNSP PGUID 12 1 10 0 0 f f f f t t i s 1 0 2249 "3614" "{3614,25,1005,1009}" "{i,o,o,o}" "{tsvector,lexeme,positions,weights}" _null_ _null_ tsvector_unnest _null_ _null_ _null_ ));
+DESCR("expand tsvector to set of rows");
+DATA(insert OID = 3326 ( to_array PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 1009 "3614" _null_ _null_ _null_ _null_ _null_ tsvector_to_array _null_ _null_ _null_ ));
+DESCR("convert to lexeme's array");
+DATA(insert OID = 3318 ( to_tsvector PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 3614 "1009" _null_ _null_ _null_ _null_ _null_ array_to_tsvector _null_ _null_ _null_ ));
+DESCR("build tsvector from lexeme's array");
+DATA(insert OID = 3319 ( filter PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 3614 "3614 1002" _null_ _null_ _null_ _null_ _null_ tsvector_filter _null_ _null_ _null_ ));
+DESCR("returns tsvector that contain only postings with given weights");
DATA(insert OID = 3634 ( ts_match_vq PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 16 "3614 3615" _null_ _null_ _null_ _null_ _null_ ts_match_vq _null_ _null_ _null_ ));
DATA(insert OID = 3635 ( ts_match_qv PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 16 "3615 3614" _null_ _null_ _null_ _null_ _null_ ts_match_qv _null_ _null_ _null_ ));
diff --git a/src/include/tsearch/ts_type.h b/src/include/tsearch/ts_type.h
index dc6067a..e70a303 100644
--- a/src/include/tsearch/ts_type.h
+++ b/src/include/tsearch/ts_type.h
@@ -141,7 +141,16 @@ extern Datum tsvector_cmp(PG_FUNCTION_ARGS);
extern Datum tsvector_length(PG_FUNCTION_ARGS);
extern Datum tsvector_strip(PG_FUNCTION_ARGS);
extern Datum tsvector_setweight(PG_FUNCTION_ARGS);
+extern Datum tsvector_setweight_by_filter(PG_FUNCTION_ARGS);
extern Datum tsvector_concat(PG_FUNCTION_ARGS);
+extern Datum tsvector_delete_str(PG_FUNCTION_ARGS);
+extern Datum tsvector_delete_arr(PG_FUNCTION_ARGS);
+extern Datum tsvector_delete_tsvector(PG_FUNCTION_ARGS);
+extern Datum tsvector_unnest(PG_FUNCTION_ARGS);
+extern Datum tsvector_to_array(PG_FUNCTION_ARGS);
+extern Datum array_to_tsvector(PG_FUNCTION_ARGS);
+extern Datum tsvector_filter(PG_FUNCTION_ARGS);
+extern Datum tsvector_shift(PG_FUNCTION_ARGS);
extern Datum tsvector_update_trigger_byid(PG_FUNCTION_ARGS);
extern Datum tsvector_update_trigger_bycolumn(PG_FUNCTION_ARGS);
diff --git a/src/test/regress/expected/tstypes.out b/src/test/regress/expected/tstypes.out
index 6284fb6..9986ce4 100644
--- a/src/test/regress/expected/tstypes.out
+++ b/src/test/regress/expected/tstypes.out
@@ -83,18 +83,6 @@ SELECT 'a:3A b:2a'::tsvector || 'ba:1234 a:1B';
'a':3A,4B 'b':2A 'ba':1237
(1 row)
-SELECT setweight('w:12B w:13* w:12,5,6 a:1,3* a:3 w asd:1dc asd zxc:81,567,222A'::tsvector, 'c');
- setweight
-----------------------------------------------------------
- 'a':1C,3C 'asd':1C 'w':5C,6C,12C,13C 'zxc':81C,222C,567C
-(1 row)
-
-SELECT strip('w:12B w:13* w:12,5,6 a:1,3* a:3 w asd:1dc asd'::tsvector);
- strip
----------------
- 'a' 'asd' 'w'
-(1 row)
-
--Base tsquery test
SELECT '1'::tsquery;
tsquery
@@ -625,3 +613,204 @@ SELECT ts_rank_cd(' a:1 s:2 d g'::tsvector, 'a & s');
0.1
(1 row)
+-- tsvector editing operations
+SELECT strip('w:12B w:13* w:12,5,6 a:1,3* a:3 w asd:1dc asd'::tsvector);
+ strip
+---------------
+ 'a' 'asd' 'w'
+(1 row)
+
+SELECT strip('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);
+ strip
+----------------------------------------------
+ 'base' 'hidden' 'rebel' 'spaceship' 'strike'
+(1 row)
+
+SELECT strip('base hidden rebel spaceship strike'::tsvector);
+ strip
+----------------------------------------------
+ 'base' 'hidden' 'rebel' 'spaceship' 'strike'
+(1 row)
+
+SELECT delete(to_tsvector('english', 'Rebel spaceships, striking from a hidden base'), 'spaceship');
+ delete
+------------------------------------------
+ 'base':7 'hidden':6 'rebel':1 'strike':3
+(1 row)
+
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, 'base');
+ delete
+--------------------------------------------------------------
+ 'hidden':6 'rebel':1 'spaceship':2,33A,34B,35C,36 'strike':3
+(1 row)
+
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, 'bas');
+ delete
+-----------------------------------------------------------------------
+ 'base':7 'hidden':6 'rebel':1 'spaceship':2,33A,34B,35C,36 'strike':3
+(1 row)
+
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, 'bases');
+ delete
+-----------------------------------------------------------------------
+ 'base':7 'hidden':6 'rebel':1 'spaceship':2,33A,34B,35C,36 'strike':3
+(1 row)
+
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, 'spaceship');
+ delete
+------------------------------------------
+ 'base':7 'hidden':6 'rebel':1 'strike':3
+(1 row)
+
+SELECT delete('base hidden rebel spaceship strike'::tsvector, 'spaceship');
+ delete
+----------------------------------
+ 'base' 'hidden' 'rebel' 'strike'
+(1 row)
+
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, ARRAY['spaceship','rebel']);
+ delete
+--------------------------------
+ 'base':7 'hidden':6 'strike':3
+(1 row)
+
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, ARRAY['spaceships','rebel']);
+ delete
+-------------------------------------------------------------
+ 'base':7 'hidden':6 'spaceship':2,33A,34B,35C,36 'strike':3
+(1 row)
+
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, ARRAY['spaceshi','rebel']);
+ delete
+-------------------------------------------------------------
+ 'base':7 'hidden':6 'spaceship':2,33A,34B,35C,36 'strike':3
+(1 row)
+
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, ARRAY['spaceship','leya','rebel']);
+ delete
+--------------------------------
+ 'base':7 'hidden':6 'strike':3
+(1 row)
+
+SELECT delete('base hidden rebel spaceship strike'::tsvector, ARRAY['spaceship','leya','rebel']);
+ delete
+--------------------------
+ 'base' 'hidden' 'strike'
+(1 row)
+
+SELECT unnest('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);
+ unnest
+---------------------------------------------
+ (base,{7},{D})
+ (hidden,{6},{D})
+ (rebel,{1},{D})
+ (spaceship,"{2,33,34,35,36}","{D,A,B,C,D}")
+ (strike,{3},{D})
+(5 rows)
+
+SELECT unnest('base hidden rebel spaceship strike'::tsvector);
+ unnest
+-------------------
+ (base,{},{})
+ (hidden,{},{})
+ (rebel,{},{})
+ (spaceship,{},{})
+ (strike,{},{})
+(5 rows)
+
+SELECT * FROM unnest('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);
+ lexeme | positions | weights
+-----------+-----------------+-------------
+ base | {7} | {D}
+ hidden | {6} | {D}
+ rebel | {1} | {D}
+ spaceship | {2,33,34,35,36} | {D,A,B,C,D}
+ strike | {3} | {D}
+(5 rows)
+
+SELECT * FROM unnest('base hidden rebel spaceship strike'::tsvector);
+ lexeme | positions | weights
+-----------+-----------+---------
+ base | {} | {}
+ hidden | {} | {}
+ rebel | {} | {}
+ spaceship | {} | {}
+ strike | {} | {}
+(5 rows)
+
+SELECT lexeme, positions[1] from unnest('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);
+ lexeme | positions
+-----------+-----------
+ base | 7
+ hidden | 6
+ rebel | 1
+ spaceship | 2
+ strike | 3
+(5 rows)
+
+SELECT to_array('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);
+ to_array
+--------------------------------------
+ {base,hidden,rebel,spaceship,strike}
+(1 row)
+
+SELECT to_array('base hidden rebel spaceship strike'::tsvector);
+ to_array
+--------------------------------------
+ {base,hidden,rebel,spaceship,strike}
+(1 row)
+
+SELECT to_tsvector(ARRAY['base','hidden','rebel','spaceship','strike']);
+ to_tsvector
+----------------------------------------------
+ 'base' 'hidden' 'rebel' 'spaceship' 'strike'
+(1 row)
+
+SELECT setweight('w:12B w:13* w:12,5,6 a:1,3* a:3 w asd:1dc asd zxc:81,567,222A'::tsvector, 'c');
+ setweight
+----------------------------------------------------------
+ 'a':1C,3C 'asd':1C 'w':5C,6C,12C,13C 'zxc':81C,222C,567C
+(1 row)
+
+SELECT setweight('a:1,3A asd:1C w:5,6,12B,13A zxc:81,222A,567'::tsvector, 'c');
+ setweight
+----------------------------------------------------------
+ 'a':1C,3C 'asd':1C 'w':5C,6C,12C,13C 'zxc':81C,222C,567C
+(1 row)
+
+SELECT setweight('a:1,3A asd:1C w:5,6,12B,13A zxc:81,222A,567'::tsvector, 'c', '{a}');
+ setweight
+------------------------------------------------------
+ 'a':1C,3C 'asd':1C 'w':5,6,12B,13A 'zxc':81,222A,567
+(1 row)
+
+SELECT setweight('a:1,3A asd:1C w:5,6,12B,13A zxc:81,222A,567'::tsvector, 'c', '{a}');
+ setweight
+------------------------------------------------------
+ 'a':1C,3C 'asd':1C 'w':5,6,12B,13A 'zxc':81,222A,567
+(1 row)
+
+SELECT setweight('a:1,3A asd:1C w:5,6,12B,13A zxc:81,222A,567'::tsvector, 'c', '{a,zxc}');
+ setweight
+--------------------------------------------------------
+ 'a':1C,3C 'asd':1C 'w':5,6,12B,13A 'zxc':81C,222C,567C
+(1 row)
+
+SELECT setweight('a asd w:5,6,12B,13A zxc'::tsvector, 'c', '{a,zxc}');
+ setweight
+---------------------------------
+ 'a' 'asd' 'w':5,6,12B,13A 'zxc'
+(1 row)
+
+SELECT filter('base:7A empir:17 evil:15 first:11 galact:16 hidden:6A rebel:1A spaceship:2A strike:3A victori:12 won:9'::tsvector, '{a}');
+ filter
+-------------------------------------------------------------
+ 'base':7A 'hidden':6A 'rebel':1A 'spaceship':2A 'strike':3A
+(1 row)
+
+SELECT filter('base hidden rebel spaceship strike'::tsvector, '{a}');
+ filter
+--------
+
+(1 row)
+
diff --git a/src/test/regress/sql/tstypes.sql b/src/test/regress/sql/tstypes.sql
index fd7c702..cef5f46 100644
--- a/src/test/regress/sql/tstypes.sql
+++ b/src/test/regress/sql/tstypes.sql
@@ -14,8 +14,6 @@ SELECT $$'\\as' ab\c ab\\c AB\\\c ab\\\\c$$::tsvector;
SELECT tsvectorin(tsvectorout($$'\\as' ab\c ab\\c AB\\\c ab\\\\c$$::tsvector));
SELECT '''w'':4A,3B,2C,1D,5 a:8';
SELECT 'a:3A b:2a'::tsvector || 'ba:1234 a:1B';
-SELECT setweight('w:12B w:13* w:12,5,6 a:1,3* a:3 w asd:1dc asd zxc:81,567,222A'::tsvector, 'c');
-SELECT strip('w:12B w:13* w:12,5,6 a:1,3* a:3 w asd:1dc asd'::tsvector);
--Base tsquery test
SELECT '1'::tsquery;
@@ -115,3 +113,44 @@ SELECT ts_rank_cd(' a:1 s:2 d g'::tsvector, 'a | s');
SELECT ts_rank_cd(' a:1 s:2C d g'::tsvector, 'a & s');
SELECT ts_rank_cd(' a:1 s:2B d g'::tsvector, 'a & s');
SELECT ts_rank_cd(' a:1 s:2 d g'::tsvector, 'a & s');
+
+-- tsvector editing operations
+
+SELECT strip('w:12B w:13* w:12,5,6 a:1,3* a:3 w asd:1dc asd'::tsvector);
+SELECT strip('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);
+SELECT strip('base hidden rebel spaceship strike'::tsvector);
+
+SELECT delete(to_tsvector('english', 'Rebel spaceships, striking from a hidden base'), 'spaceship');
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, 'base');
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, 'bas');
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, 'bases');
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, 'spaceship');
+SELECT delete('base hidden rebel spaceship strike'::tsvector, 'spaceship');
+
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, ARRAY['spaceship','rebel']);
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, ARRAY['spaceships','rebel']);
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, ARRAY['spaceshi','rebel']);
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, ARRAY['spaceship','leya','rebel']);
+SELECT delete('base hidden rebel spaceship strike'::tsvector, ARRAY['spaceship','leya','rebel']);
+
+SELECT unnest('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);
+SELECT unnest('base hidden rebel spaceship strike'::tsvector);
+SELECT * FROM unnest('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);
+SELECT * FROM unnest('base hidden rebel spaceship strike'::tsvector);
+SELECT lexeme, positions[1] from unnest('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);
+
+SELECT to_array('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);
+SELECT to_array('base hidden rebel spaceship strike'::tsvector);
+
+SELECT to_tsvector(ARRAY['base','hidden','rebel','spaceship','strike']);
+
+SELECT setweight('w:12B w:13* w:12,5,6 a:1,3* a:3 w asd:1dc asd zxc:81,567,222A'::tsvector, 'c');
+SELECT setweight('a:1,3A asd:1C w:5,6,12B,13A zxc:81,222A,567'::tsvector, 'c');
+SELECT setweight('a:1,3A asd:1C w:5,6,12B,13A zxc:81,222A,567'::tsvector, 'c', '{a}');
+SELECT setweight('a:1,3A asd:1C w:5,6,12B,13A zxc:81,222A,567'::tsvector, 'c', '{a}');
+SELECT setweight('a:1,3A asd:1C w:5,6,12B,13A zxc:81,222A,567'::tsvector, 'c', '{a,zxc}');
+SELECT setweight('a asd w:5,6,12B,13A zxc'::tsvector, 'c', '{a,zxc}');
+
+SELECT filter('base:7A empir:17 evil:15 first:11 galact:16 hidden:6A rebel:1A spaceship:2A strike:3A victori:12 won:9'::tsvector, '{a}');
+SELECT filter('base hidden rebel spaceship strike'::tsvector, '{a}');
+
On 02 Feb 2016, at 20:10, Teodor Sigaev <teodor@sigaev.ru> wrote:
Some notices:
1 tsin in documentation doesn't look like a good name. Changed to vector similar to other places.
2 I did some editorization about freeing memory/forgotten names etc
Thanks.
3 It seems to me that tsvector_unnest() could be seriously optimized for
large tsvectors: with current coding it detoasts/decompresses tsvector value on each call. Much better to do it once in
multi_call_memory_ctx context at first call init
Done, moved detoasting to first SRF call.
4 It seems debatable returning empty array for position/weight if they are absent:
=# select * from unnest('a:1 b'::tsvector);
lexeme | positions | weights
--------+-----------+---------
a | {1} | {D}
b | {} | {}
I think, it's better to return NULL in this case
Okay, done.
5 array_to_tsvector/tsvector_setweight_by_filter/tsvector_delete_arr/tsvector_filter doesn't check or pay attention to NULL elements in input arrays
Thanks! Fixed and added tests.
Show quoted text
--
Teodor Sigaev E-mail: teodor@sigaev.ru
WWW: http://www.sigaev.ru/--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
Attachments:
tsvector_ops-v4.diffapplication/octet-stream; name=tsvector_ops-v4.diffDownload
diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index f9eea76..c53a551 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -9211,16 +9211,29 @@ CREATE TYPE rainbow AS ENUM ('red', 'orange', 'yellow', 'green', 'blue', 'purple
<indexterm>
<primary>setweight</primary>
</indexterm>
- <literal><function>setweight(<type>tsvector</>, <type>"char"</>)</function></literal>
+ <literal><function>setweight(<replaceable class="PARAMETER">vector</replaceable> <type>tsvector</>, <replaceable class="PARAMETER">weight</replaceable> <type>"char"</>)</function></literal>
</entry>
<entry><type>tsvector</type></entry>
- <entry>assign weight to each element of <type>tsvector</></entry>
+ <entry>assign <replaceable class="PARAMETER">weight</replaceable> to each element of <replaceable class="PARAMETER">vector</replaceable></entry>
<entry><literal>setweight('fat:2,4 cat:3 rat:5B'::tsvector, 'A')</literal></entry>
<entry><literal>'cat':3A 'fat':2A,4A 'rat':5A</literal></entry>
</row>
<row>
<entry>
<indexterm>
+ <primary>setweight</primary>
+ <secondary>setweight by filter</secondary>
+ </indexterm>
+ <literal><function>setweight(<replaceable class="PARAMETER">vector</replaceable> <type>tsvector</>, <replaceable class="PARAMETER">weight</replaceable> <type>"char"</>, <replaceable class="PARAMETER">lexemes</replaceable> <type>"text"[]</>)</function></literal>
+ </entry>
+ <entry><type>tsvector</type></entry>
+ <entry>assign <replaceable class="PARAMETER">weight</replaceable> to elements of <replaceable class="PARAMETER">vector</replaceable> that are listed in <replaceable class="PARAMETER">lexemes</replaceable> array</entry>
+ <entry><literal>setweight('fat:2,4 cat:3 rat:5B'::tsvector, 'A', '{cat,rat}')</literal></entry>
+ <entry><literal>'cat':3A 'fat':2,4 'rat':5A</literal></entry>
+ </row>
+ <row>
+ <entry>
+ <indexterm>
<primary>strip</primary>
</indexterm>
<literal><function>strip(<type>tsvector</>)</function></literal>
@@ -9233,6 +9246,84 @@ CREATE TYPE rainbow AS ENUM ('red', 'orange', 'yellow', 'green', 'blue', 'purple
<row>
<entry>
<indexterm>
+ <primary>delete</primary>
+ <secondary>delete lemexeme</secondary>
+ </indexterm>
+ <literal><function>delete(<replaceable class="PARAMETER">vector</replaceable> <type>tsvector</>, <replaceable class="PARAMETER">lexeme</replaceable> <type>text</>)</function></literal>
+ </entry>
+ <entry><type>tsvector</type></entry>
+ <entry>remove given <replaceable class="PARAMETER">lexeme</replaceable> from <replaceable class="PARAMETER">vector</replaceable></entry>
+ <entry><literal>delete('fat:2,4 cat:3 rat:5A'::tsvector, 'fat')</literal></entry>
+ <entry><literal>'cat':3 'rat':5A</literal></entry>
+ </row>
+ <row>
+ <entry>
+ <indexterm>
+ <primary>delete</primary>
+ <secondary>delete lemexemes array</secondary>
+ </indexterm>
+ <literal><function>delete(<replaceable class="PARAMETER">vector</replaceable> <type>tsvector</>, <replaceable class="PARAMETER">lexemes</replaceable> <type>text[]</>)</function></literal>
+ </entry>
+ <entry><type>tsvector</type></entry>
+ <entry>remove any occurrence of lexemes in <replaceable class="PARAMETER">lexemes</replaceable> array from <replaceable class="PARAMETER">vector</replaceable></entry>
+ <entry><literal>delete('fat:2,4 cat:3 rat:5A'::tsvector, ARRAY['fat','rat'])</literal></entry>
+ <entry><literal>'cat':3</literal></entry>
+ </row>
+ <row>
+ <entry>
+ <indexterm>
+ <primary>unnest</primary>
+ </indexterm>
+ <literal><function>unnest(<type>tsvector</>)</function></literal>
+ </entry>
+ <entry><type>setof anyelement</type></entry>
+ <entry>expand a tsvector to a set of rows. Each row has following columns: lexeme, postings, weights.</entry>
+ <entry><literal>unnest('fat:2,4 cat:3 rat:5A'::tsvector)</literal></entry>
+ <entry><literallayout class="monospaced">cat {3} {A}
+fat {2,4} {D,D}
+rat {5} {A}
+(3 rows)</literallayout></entry>
+ </row>
+ <row>
+ <entry>
+ <indexterm>
+ <primary>to_array</primary>
+ </indexterm>
+ <literal><function>to_array(<type>tsvector</>)</function></literal>
+ </entry>
+ <entry><type>text[]</type></entry>
+ <entry>convert <type>tsvector</> to array of lexemes</entry>
+ <entry><literal>to_array('fat:2,4 cat:3 rat:5A'::tsvector)</literal></entry>
+ <entry><literal>{cat,fat,rat}</literal></entry>
+ </row>
+ <row>
+ <entry>
+ <indexterm>
+ <primary>to_tsvector</primary>
+ <secondary>array to tsvector</secondary>
+ </indexterm>
+ <literal><function>to_tsvector(<type>text[]</>)</function></literal>
+ </entry>
+ <entry><type>tsvector</type></entry>
+ <entry>convert array of lexemes to <type>tsvector</type></entry>
+ <entry><literal>to_tsvector('{fat,cat,rat}'::text[])</literal></entry>
+ <entry><literal>'fat' 'cat' 'rat'</literal></entry>
+ </row>
+ <row>
+ <entry>
+ <indexterm>
+ <primary>filter</primary>
+ </indexterm>
+ <literal><function>filter(<replaceable class="PARAMETER">vector</replaceable> <type>tsvector</>, <replaceable class="PARAMETER">weights</replaceable> <type>"char"[]</>)</function></literal>
+ </entry>
+ <entry><type>tsvector</type></entry>
+ <entry>Select only elements with given <replaceable class="PARAMETER">weights</replaceable> from <replaceable class="PARAMETER">vector</replaceable></entry>
+ <entry><literal>filter('fat:2,4 cat:3b rat:5A'::tsvector, '{a,b}')</literal></entry>
+ <entry><literal>'cat':3B 'rat':5A</literal></entry>
+ </row>
+ <row>
+ <entry>
+ <indexterm>
<primary>to_tsquery</primary>
</indexterm>
<literal><function>to_tsquery(<optional> <replaceable class="PARAMETER">config</> <type>regconfig</> , </optional> <replaceable class="PARAMETER">query</> <type>text</type>)</function></literal>
diff --git a/doc/src/sgml/textsearch.sgml b/doc/src/sgml/textsearch.sgml
index d66b4d5..32033fa 100644
--- a/doc/src/sgml/textsearch.sgml
+++ b/doc/src/sgml/textsearch.sgml
@@ -1326,6 +1326,10 @@ FROM (SELECT id, body, q, ts_rank_cd(ti, q) AS rank
</variablelist>
+ <para>
+ Full list of <type>tsvector</>-related functions available in <xref linkend="textsearch-functions-table">.
+ </para>
+
</sect2>
<sect2 id="textsearch-manipulate-tsquery">
diff --git a/src/backend/utils/adt/tsvector_op.c b/src/backend/utils/adt/tsvector_op.c
index a3f1c361..5c9f788 100644
--- a/src/backend/utils/adt/tsvector_op.c
+++ b/src/backend/utils/adt/tsvector_op.c
@@ -14,6 +14,7 @@
#include "postgres.h"
+#include "access/htup_details.h"
#include "catalog/namespace.h"
#include "catalog/pg_type.h"
#include "commands/trigger.h"
@@ -65,6 +66,7 @@ typedef struct
#define STATHDRSIZE (offsetof(TSVectorStat, data))
static Datum tsvector_update_trigger(PG_FUNCTION_ARGS, bool config_column);
+static int tsvector_bsearch(TSVector tsin, char *lexin, int lexin_len);
/*
* Order: haspos, len, word, for all positions (pos, weight)
@@ -251,6 +253,90 @@ tsvector_setweight(PG_FUNCTION_ARGS)
PG_RETURN_POINTER(out);
}
+/*
+ * setweight(tsin tsvector, char_weight "char", lexemes "text"[])
+ *
+ * Assign weight w to elements of tsin that are listed in lexemes.
+ */
+Datum
+tsvector_setweight_by_filter(PG_FUNCTION_ARGS)
+{
+ TSVector tsin = PG_GETARG_TSVECTOR(0);
+ char char_weight = PG_GETARG_CHAR(1);
+ ArrayType *lexemes = PG_GETARG_ARRAYTYPE_P(2);
+
+ TSVector tsout;
+ int i,
+ j,
+ nlexemes,
+ weight;
+ WordEntry *entry;
+ Datum *dlexemes;
+ bool *nulls;
+
+ switch (char_weight)
+ {
+ case 'A': case 'a':
+ weight = 3;
+ break;
+ case 'B': case 'b':
+ weight = 2;
+ break;
+ case 'C': case 'c':
+ weight = 1;
+ break;
+ case 'D': case 'd':
+ weight = 0;
+ break;
+ default:
+ /* internal error */
+ elog(ERROR, "unrecognized weight: %c", char_weight);
+ }
+
+ tsout = (TSVector) palloc(VARSIZE(tsin));
+ memcpy(tsout, tsin, VARSIZE(tsin));
+ entry = ARRPTR(tsout);
+
+ deconstruct_array(lexemes, TEXTOID, -1, false, 'i',
+ &dlexemes, &nulls, &nlexemes);
+
+ /*
+ * Assuming that lexemes array is significantly shorter than tsvector
+ * we can iterate through lexemes performing binary search
+ * of each lexeme from lexemes in tsvector.
+ */
+ for (i = 0; i < nlexemes; i++)
+ {
+ char *lex;
+ int lex_len,
+ lex_pos;
+
+ if (nulls[i])
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("name or argument lists may not contain nulls")));
+
+ lex = VARDATA(dlexemes[i]);
+ lex_len = VARSIZE_ANY_EXHDR(dlexemes[i]);
+ lex_pos = tsvector_bsearch(tsout, lex, lex_len);
+
+ if (lex_pos >= 0 && (j = POSDATALEN(tsout, entry + lex_pos)) != 0)
+ {
+ WordEntryPos *p = POSDATAPTR(tsout, entry + lex_pos);
+ while (j--)
+ {
+ WEP_SETWEIGHT(*p, weight);
+ p++;
+ }
+ }
+ }
+
+ PG_FREE_IF_COPY(tsin, 0);
+ PG_FREE_IF_COPY(lexemes, 2);
+
+ PG_RETURN_POINTER(tsout);
+}
+
#define compareEntry(pa, a, pb, b) \
tsCompareString((pa) + (a)->pos, (a)->len, \
(pb) + (b)->pos, (b)->len, \
@@ -291,6 +377,470 @@ add_pos(TSVector src, WordEntry *srcptr,
return *clen - startlen;
}
+/*
+ * Perform binary search of given lexeme in TSVector.
+ * Returns lexeme position in TSVector's entry array or -1 if lexeme wasn't
+ * found.
+ */
+static int
+tsvector_bsearch(const TSVector tsv, char *lexeme, int lexeme_len)
+{
+ WordEntry *arrin = ARRPTR(tsv);
+ int StopLow = 0,
+ StopHigh = tsv->size,
+ StopMiddle,
+ cmp;
+
+ while (StopLow < StopHigh)
+ {
+ StopMiddle = (StopLow + StopHigh)/2;
+
+ cmp = tsCompareString(lexeme, lexeme_len,
+ STRPTR(tsv) + arrin[StopMiddle].pos,
+ arrin[StopMiddle].len,
+ false);
+
+ if (cmp < 0)
+ StopHigh = StopMiddle;
+ else if (cmp > 0)
+ StopLow = StopMiddle + 1;
+ else /* found it */
+ return StopMiddle;
+ }
+
+ return -1;
+}
+
+static int
+compareint(const void *va, const void *vb)
+{
+ int32 a = *((const int32 *) va);
+ int32 b = *((const int32 *) vb);
+
+ if (a == b)
+ return 0;
+ return (a > b) ? 1 : -1;
+}
+
+/*
+ * Internal routine to delete lexemes from TSVector by array of offsets.
+ *
+ * int *indices_to_delete -- array of lexeme offsets to delete
+ * int indices_count -- size of that array
+ *
+ * Returns new TSVector without given lexemes along with their positions
+ * and weights.
+ */
+static TSVector
+tsvector_delete_by_indices(TSVector tsv, int *indices_to_delete, int indices_count)
+{
+ TSVector tsout;
+ WordEntry *arrin = ARRPTR(tsv),
+ *arrout;
+ char *data = STRPTR(tsv),
+ *dataout;
+ int i, j, k,
+ curoff;
+
+ /*
+ * Here we overestimates tsout size, since we don't know exact size
+ * occupied by positions and weights. We will set exact size later
+ * after a pass through TSVector.
+ */
+ tsout = (TSVector) palloc0(VARSIZE(tsv));
+ arrout = ARRPTR(tsout);
+ tsout->size = tsv->size - indices_count;
+
+ /* Sort our filter array to simplify membership check later. */
+ if (indices_count > 1)
+ qsort(indices_to_delete, indices_count, sizeof(int), compareint);
+
+ /*
+ * Copy tsv to tsout skipping lexemes that enlisted in indices_to_delete.
+ */
+ curoff = 0;
+ dataout = STRPTR(tsout);
+ for (i = j = k = 0; i < tsv->size; i++)
+ {
+ /*
+ * Here we should check whether current i is present in indices_to_delete
+ * or not. Since indices_to_delete is already sorted we can advance
+ * it index only when we have match.
+ */
+ if (k < indices_count && i == indices_to_delete[k]){
+ k++;
+ continue;
+ }
+
+ /* Copy lexeme, it's positions and weights */
+ memcpy(dataout + curoff, data + arrin[i].pos, arrin[i].len);
+ arrout[j].haspos = arrin[i].haspos;
+ arrout[j].len = arrin[i].len;
+ arrout[j].pos = curoff;
+ curoff += arrin[i].len;
+ if (arrin[i].haspos)
+ {
+ int len = POSDATALEN(tsv, arrin+i) * sizeof(WordEntryPos) + sizeof(uint16);
+ curoff = SHORTALIGN(curoff);
+ memcpy(dataout + curoff, (STRPTR(tsv) + SHORTALIGN(arrin[i].pos + arrin[i].len)), len);
+ curoff += len;
+ }
+
+ j++;
+ }
+
+ /*
+ * After the pass through TSVector k should equals exactly to indices_count.
+ * If it isn't then the caller provided us with indices outside of
+ * [0, tsv->size) range and estimation of tsout's size is wrong.
+ */
+ Assert(k == indices_count);
+
+ SET_VARSIZE(tsout, CALCDATASIZE(tsout->size, curoff));
+ return tsout;
+}
+
+/*
+ * Delete given lexeme from tsvector.
+ * Implementation of user-level delete(tsvector, text).
+ */
+Datum
+tsvector_delete_str(PG_FUNCTION_ARGS)
+{
+ TSVector tsin = PG_GETARG_TSVECTOR(0),
+ tsout;
+ text *tlexeme = PG_GETARG_TEXT_P(1);
+ char *lexeme = VARDATA(tlexeme);
+ int lexeme_len = VARSIZE_ANY_EXHDR(tlexeme),
+ skip_index;
+
+ if ((skip_index = tsvector_bsearch(tsin, lexeme, lexeme_len)) == -1)
+ PG_RETURN_POINTER(tsin);
+
+ tsout = tsvector_delete_by_indices(tsin, &skip_index, 1);
+
+ PG_FREE_IF_COPY(tsin, 0);
+ PG_FREE_IF_COPY(tlexeme, 1);
+ PG_RETURN_POINTER(tsout);
+}
+
+/*
+ * Delete given array of lexemes from tsvector.
+ * Implementation of user-level delete(tsvector, text[]).
+ */
+Datum
+tsvector_delete_arr(PG_FUNCTION_ARGS)
+{
+ TSVector tsin = PG_GETARG_TSVECTOR(0),
+ tsout;
+ ArrayType *lexemes = PG_GETARG_ARRAYTYPE_P(1);
+ int i, nlex,
+ skip_count,
+ *skip_indices;
+ Datum *dlexemes;
+ bool *nulls;
+
+ deconstruct_array(lexemes, TEXTOID, -1, false, 'i',
+ &dlexemes, &nulls, &nlex);
+
+ /*
+ * In typical use case array of lexemes to delete is relatively small.
+ * So here we optimizing things for that scenario: iterate through lexarr
+ * performing binary search of each lexeme from lexarr in tsvector.
+ */
+ skip_indices = palloc0(nlex * sizeof(int));
+ for (i = skip_count = 0; i < nlex; i++)
+ {
+ char *lex;
+ int lex_len,
+ lex_pos;
+
+ if (nulls[i])
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("name or argument lists may not contain nulls")));
+
+ lex = VARDATA(dlexemes[i]);
+ lex_len = VARSIZE_ANY_EXHDR(dlexemes[i]);
+ lex_pos = tsvector_bsearch(tsin, lex, lex_len);
+
+ if (lex_pos >= 0)
+ skip_indices[skip_count++] = lex_pos;
+ }
+
+ tsout = tsvector_delete_by_indices(tsin, skip_indices, skip_count);
+
+ pfree(skip_indices);
+ PG_FREE_IF_COPY(tsin, 0);
+ PG_FREE_IF_COPY(lexemes, 1);
+
+ PG_RETURN_POINTER(tsout);
+}
+
+/*
+ * Expand tsvector as table with following columns:
+ * lexeme: lexeme text
+ * positions: integer array of lexeme positions
+ * weights: char array of weights corresponding to positions
+ */
+Datum
+tsvector_unnest(PG_FUNCTION_ARGS)
+{
+ FuncCallContext *funcctx;
+ TSVector tsin;
+
+ if (SRF_IS_FIRSTCALL())
+ {
+ MemoryContext oldcontext;
+ TupleDesc tupdesc;
+
+ funcctx = SRF_FIRSTCALL_INIT();
+ oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
+
+ tupdesc = CreateTemplateTupleDesc(3, false);
+ TupleDescInitEntry(tupdesc, (AttrNumber) 1, "lexeme",
+ TEXTOID, -1, 0);
+ TupleDescInitEntry(tupdesc, (AttrNumber) 2, "positions",
+ INT2ARRAYOID, -1, 0);
+ TupleDescInitEntry(tupdesc, (AttrNumber) 3, "weights",
+ TEXTARRAYOID, -1, 0);
+ funcctx->tuple_desc = BlessTupleDesc(tupdesc);
+
+ funcctx->user_fctx = PG_GETARG_TSVECTOR(0);
+
+ MemoryContextSwitchTo(oldcontext);
+ }
+
+ funcctx = SRF_PERCALL_SETUP();
+ tsin = (TSVector) funcctx->user_fctx;
+
+ if (funcctx->call_cntr < tsin->size)
+ {
+ WordEntry *arrin = ARRPTR(tsin);
+ char *data = STRPTR(tsin);
+ HeapTuple tuple;
+ int j,
+ i = funcctx->call_cntr;
+ bool nulls[] = {false, false, false};
+ Datum values[3];
+
+ values[0] = PointerGetDatum(cstring_to_text_with_len(data + arrin[i].pos, arrin[i].len));
+
+ if (arrin[i].haspos)
+ {
+ WordEntryPosVector *posv;
+ Datum *positions;
+ Datum *weights;
+ char weight;
+
+ /*
+ * Internally tsvector stores position and weight in the same
+ * uint16 (2 bits for weight, 14 for position). Here we extract that
+ * in two separate arrays.
+ */
+ posv = _POSVECPTR(tsin, arrin + i);
+ positions = palloc(posv->npos * sizeof(Datum));
+ weights = palloc(posv->npos * sizeof(Datum));
+ for (j = 0; j < posv->npos; j++)
+ {
+ positions[j] = Int16GetDatum(WEP_GETPOS(posv->pos[j]));
+ weight = (WEP_GETWEIGHT(posv->pos[j]) >> 2) ?
+ 'D' : 'D' - WEP_GETWEIGHT(posv->pos[j]);
+ weights[j] = PointerGetDatum(cstring_to_text_with_len(&weight, 1));
+ }
+
+ values[1] = PointerGetDatum(
+ construct_array(positions, posv->npos, INT2OID, 2, true, 's'));
+ values[2] = PointerGetDatum(
+ construct_array(weights, posv->npos, TEXTOID, -1, false, 'i'));
+ }
+ else
+ {
+ nulls[1] = nulls[2] = true;
+ }
+
+ tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls);
+ PG_FREE_IF_COPY(tsin, 0);
+ SRF_RETURN_NEXT(funcctx, HeapTupleGetDatum(tuple));
+ }
+ else
+ SRF_RETURN_DONE(funcctx);
+}
+
+/*
+ * Convert tsvector to array of lexemes.
+ */
+Datum
+tsvector_to_array(PG_FUNCTION_ARGS)
+{
+ TSVector tsin = PG_GETARG_TSVECTOR(0);
+ WordEntry *arrin = ARRPTR(tsin);
+ Datum elements[tsin->size];
+ int i;
+ ArrayType *array;
+
+ for (i = 0; i < tsin->size; i++)
+ {
+ elements[i] = PointerGetDatum(
+ cstring_to_text_with_len(STRPTR(tsin) + arrin[i].pos, arrin[i].len));
+ }
+ array = construct_array(elements, tsin->size, TEXTOID, -1, false, 'i');
+ PG_FREE_IF_COPY(tsin, 0);
+ PG_RETURN_POINTER(array);
+}
+
+/*
+ * Build tsvector from array of lexemes.
+ */
+Datum
+array_to_tsvector(PG_FUNCTION_ARGS)
+{
+ ArrayType *v = PG_GETARG_ARRAYTYPE_P(0);
+ TSVector tsout;
+ Datum *dlexemes;
+ WordEntry *arrout;
+ bool *nulls;
+ int nitems,
+ i,
+ tslen,
+ datalen = 0;
+ char *cur;
+
+ deconstruct_array(v, TEXTOID, -1, false, 'i', &dlexemes, &nulls, &nitems);
+
+ for (i = 0; i < nitems; i++)
+ {
+ if (nulls[i])
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("name or argument lists may not contain nulls")));
+
+ datalen += VARSIZE_ANY_EXHDR(dlexemes[i]);
+ }
+
+ tslen = CALCDATASIZE(nitems, datalen);
+ tsout = (TSVector) palloc0(tslen);
+ SET_VARSIZE(tsout, tslen);
+ tsout->size = nitems;
+ arrout = ARRPTR(tsout);
+ cur = STRPTR(tsout);
+
+ for (i = 0; i < nitems; i++)
+ {
+ char *lex = VARDATA(dlexemes[i]);
+ int lex_len = VARSIZE_ANY_EXHDR(dlexemes[i]);
+
+ memcpy(cur, lex, lex_len);
+ arrout[i].haspos = 0;
+ arrout[i].len = lex_len;
+ arrout[i].pos = cur - STRPTR(tsout);
+ cur += lex_len;
+ }
+
+ PG_FREE_IF_COPY(v, 0);
+ PG_RETURN_POINTER(tsout);
+}
+
+/*
+ * Leave only elements with given weights from tsvector.
+ */
+Datum
+tsvector_filter(PG_FUNCTION_ARGS)
+{
+ TSVector tsin = PG_GETARG_TSVECTOR(0),
+ tsout;
+ ArrayType *weights = PG_GETARG_ARRAYTYPE_P(1);
+ WordEntry *arrin = ARRPTR(tsin),
+ *arrout;
+ char *datain = STRPTR(tsin),
+ *dataout;
+ Datum *dweights;
+ bool *nulls;
+ int nweigths;
+ int i, j;
+ char mask = 0,
+ cur_pos = 0;
+
+ deconstruct_array(weights, CHAROID, 1, true, 'c',
+ &dweights, &nulls, &nweigths);
+
+ for (i = 0; i < nweigths; i++)
+ {
+ char char_weight;
+
+ if (nulls[i])
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("name or argument lists may not contain nulls")));
+
+ char_weight = DatumGetChar(dweights[i]);
+ switch (char_weight)
+ {
+ case 'A': case 'a':
+ mask = mask | 8;
+ break;
+ case 'B': case 'b':
+ mask = mask | 4;
+ break;
+ case 'C': case 'c':
+ mask = mask | 2;
+ break;
+ case 'D': case 'd':
+ mask = mask | 1;
+ break;
+ default:
+ /* internal error */
+ elog(ERROR, "unrecognized weight: %c", char_weight);
+ }
+ }
+
+ tsout = (TSVector) palloc0(VARSIZE(tsin));
+ tsout->size = tsin->size;
+ arrout = ARRPTR(tsout);
+ dataout = STRPTR(tsout);
+
+ for (i = j = 0; i < tsin->size; i++)
+ {
+ WordEntryPosVector *posvin,
+ *posvout;
+ int npos = 0;
+ int k;
+
+ if (!arrin[i].haspos)
+ continue;
+
+ posvin = _POSVECPTR(tsin, arrin + i);
+ posvout = (WordEntryPosVector *)(dataout + SHORTALIGN(cur_pos + arrin[i].len));
+
+ for (k = 0; k < posvin->npos; k++)
+ {
+ if (mask & (1 << WEP_GETWEIGHT(posvin->pos[k])))
+ posvout->pos[npos++] = posvin->pos[k];
+ }
+
+ if (!npos) /* no satisfactory positions found, so skip that lexeme */
+ continue;
+
+ arrout[j].haspos = true;
+ arrout[j].len = arrin[i].len;
+ arrout[j].pos = cur_pos;
+
+ memcpy(dataout + cur_pos, datain + arrin[i].pos, arrin[i].len);
+ posvout->npos = npos;
+ cur_pos += SHORTALIGN(arrin[i].len);
+ cur_pos += POSDATALEN(tsout, arrout+j) * sizeof(WordEntryPos) + sizeof(uint16);
+ j++;
+ }
+
+ tsout->size = j;
+ if (dataout != STRPTR(tsout))
+ memmove(STRPTR(tsout), dataout, cur_pos);
+
+ SET_VARSIZE(tsout, CALCDATASIZE(tsout->size, cur_pos));
+
+ PG_FREE_IF_COPY(tsin, 0);
+ PG_RETURN_POINTER(tsout);
+}
Datum
tsvector_concat(PG_FUNCTION_ARGS)
diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h
index 1c0ef9a..1df8e49 100644
--- a/src/include/catalog/pg_proc.h
+++ b/src/include/catalog/pg_proc.h
@@ -4490,8 +4490,22 @@ DESCR("number of lexemes");
DATA(insert OID = 3623 ( strip PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 3614 "3614" _null_ _null_ _null_ _null_ _null_ tsvector_strip _null_ _null_ _null_ ));
DESCR("strip position information");
DATA(insert OID = 3624 ( setweight PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 3614 "3614 18" _null_ _null_ _null_ _null_ _null_ tsvector_setweight _null_ _null_ _null_ ));
-DESCR("set weight of lexeme's entries");
-DATA(insert OID = 3625 ( tsvector_concat PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 3614 "3614 3614" _null_ _null_ _null_ _null_ _null_ tsvector_concat _null_ _null_ _null_ ));
+DESCR("set given weight for whole tsvector");
+DATA(insert OID = 3320 ( setweight PGNSP PGUID 12 1 0 0 0 f f f f t f i s 3 0 3614 "3614 18 1009" _null_ _null_ _null_ _null_ _null_ tsvector_setweight_by_filter _null_ _null_ _null_ ));
+DESCR("set given weight for given lexemes");
+DATA(insert OID = 3625 ( tsvector_concat PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 3614 "3614 3614" _null_ _null_ _null_ _null_ _null_ tsvector_concat _null_ _null_ _null_ ));
+DATA(insert OID = 3321 ( delete PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 3614 "3614 25" _null_ _null_ _null_ _null_ _null_ tsvector_delete_str _null_ _null_ _null_ ));
+DESCR("delete lexeme");
+DATA(insert OID = 3323 ( delete PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 3614 "3614 1009" _null_ _null_ _null_ _null_ _null_ tsvector_delete_arr _null_ _null_ _null_ ));
+DESCR("delete given lexemes");
+DATA(insert OID = 3322 ( unnest PGNSP PGUID 12 1 10 0 0 f f f f t t i s 1 0 2249 "3614" "{3614,25,1005,1009}" "{i,o,o,o}" "{tsvector,lexeme,positions,weights}" _null_ _null_ tsvector_unnest _null_ _null_ _null_ ));
+DESCR("expand tsvector to set of rows");
+DATA(insert OID = 3326 ( to_array PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 1009 "3614" _null_ _null_ _null_ _null_ _null_ tsvector_to_array _null_ _null_ _null_ ));
+DESCR("convert to lexeme's array");
+DATA(insert OID = 3318 ( to_tsvector PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 3614 "1009" _null_ _null_ _null_ _null_ _null_ array_to_tsvector _null_ _null_ _null_ ));
+DESCR("build tsvector from lexeme's array");
+DATA(insert OID = 3319 ( filter PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 3614 "3614 1002" _null_ _null_ _null_ _null_ _null_ tsvector_filter _null_ _null_ _null_ ));
+DESCR("returns tsvector that contain only postings with given weights");
DATA(insert OID = 3634 ( ts_match_vq PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 16 "3614 3615" _null_ _null_ _null_ _null_ _null_ ts_match_vq _null_ _null_ _null_ ));
DATA(insert OID = 3635 ( ts_match_qv PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 16 "3615 3614" _null_ _null_ _null_ _null_ _null_ ts_match_qv _null_ _null_ _null_ ));
diff --git a/src/include/tsearch/ts_type.h b/src/include/tsearch/ts_type.h
index dc6067a..e70a303 100644
--- a/src/include/tsearch/ts_type.h
+++ b/src/include/tsearch/ts_type.h
@@ -141,7 +141,16 @@ extern Datum tsvector_cmp(PG_FUNCTION_ARGS);
extern Datum tsvector_length(PG_FUNCTION_ARGS);
extern Datum tsvector_strip(PG_FUNCTION_ARGS);
extern Datum tsvector_setweight(PG_FUNCTION_ARGS);
+extern Datum tsvector_setweight_by_filter(PG_FUNCTION_ARGS);
extern Datum tsvector_concat(PG_FUNCTION_ARGS);
+extern Datum tsvector_delete_str(PG_FUNCTION_ARGS);
+extern Datum tsvector_delete_arr(PG_FUNCTION_ARGS);
+extern Datum tsvector_delete_tsvector(PG_FUNCTION_ARGS);
+extern Datum tsvector_unnest(PG_FUNCTION_ARGS);
+extern Datum tsvector_to_array(PG_FUNCTION_ARGS);
+extern Datum array_to_tsvector(PG_FUNCTION_ARGS);
+extern Datum tsvector_filter(PG_FUNCTION_ARGS);
+extern Datum tsvector_shift(PG_FUNCTION_ARGS);
extern Datum tsvector_update_trigger_byid(PG_FUNCTION_ARGS);
extern Datum tsvector_update_trigger_bycolumn(PG_FUNCTION_ARGS);
diff --git a/src/test/regress/expected/tstypes.out b/src/test/regress/expected/tstypes.out
index 6284fb6..e409648 100644
--- a/src/test/regress/expected/tstypes.out
+++ b/src/test/regress/expected/tstypes.out
@@ -83,18 +83,6 @@ SELECT 'a:3A b:2a'::tsvector || 'ba:1234 a:1B';
'a':3A,4B 'b':2A 'ba':1237
(1 row)
-SELECT setweight('w:12B w:13* w:12,5,6 a:1,3* a:3 w asd:1dc asd zxc:81,567,222A'::tsvector, 'c');
- setweight
-----------------------------------------------------------
- 'a':1C,3C 'asd':1C 'w':5C,6C,12C,13C 'zxc':81C,222C,567C
-(1 row)
-
-SELECT strip('w:12B w:13* w:12,5,6 a:1,3* a:3 w asd:1dc asd'::tsvector);
- strip
----------------
- 'a' 'asd' 'w'
-(1 row)
-
--Base tsquery test
SELECT '1'::tsquery;
tsquery
@@ -625,3 +613,212 @@ SELECT ts_rank_cd(' a:1 s:2 d g'::tsvector, 'a & s');
0.1
(1 row)
+-- tsvector editing operations
+SELECT strip('w:12B w:13* w:12,5,6 a:1,3* a:3 w asd:1dc asd'::tsvector);
+ strip
+---------------
+ 'a' 'asd' 'w'
+(1 row)
+
+SELECT strip('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);
+ strip
+----------------------------------------------
+ 'base' 'hidden' 'rebel' 'spaceship' 'strike'
+(1 row)
+
+SELECT strip('base hidden rebel spaceship strike'::tsvector);
+ strip
+----------------------------------------------
+ 'base' 'hidden' 'rebel' 'spaceship' 'strike'
+(1 row)
+
+SELECT delete(to_tsvector('english', 'Rebel spaceships, striking from a hidden base'), 'spaceship');
+ delete
+------------------------------------------
+ 'base':7 'hidden':6 'rebel':1 'strike':3
+(1 row)
+
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, 'base');
+ delete
+--------------------------------------------------------------
+ 'hidden':6 'rebel':1 'spaceship':2,33A,34B,35C,36 'strike':3
+(1 row)
+
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, 'bas');
+ delete
+-----------------------------------------------------------------------
+ 'base':7 'hidden':6 'rebel':1 'spaceship':2,33A,34B,35C,36 'strike':3
+(1 row)
+
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, 'bases');
+ delete
+-----------------------------------------------------------------------
+ 'base':7 'hidden':6 'rebel':1 'spaceship':2,33A,34B,35C,36 'strike':3
+(1 row)
+
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, 'spaceship');
+ delete
+------------------------------------------
+ 'base':7 'hidden':6 'rebel':1 'strike':3
+(1 row)
+
+SELECT delete('base hidden rebel spaceship strike'::tsvector, 'spaceship');
+ delete
+----------------------------------
+ 'base' 'hidden' 'rebel' 'strike'
+(1 row)
+
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, ARRAY['spaceship','rebel']);
+ delete
+--------------------------------
+ 'base':7 'hidden':6 'strike':3
+(1 row)
+
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, ARRAY['spaceships','rebel']);
+ delete
+-------------------------------------------------------------
+ 'base':7 'hidden':6 'spaceship':2,33A,34B,35C,36 'strike':3
+(1 row)
+
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, ARRAY['spaceshi','rebel']);
+ delete
+-------------------------------------------------------------
+ 'base':7 'hidden':6 'spaceship':2,33A,34B,35C,36 'strike':3
+(1 row)
+
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, ARRAY['spaceship','leya','rebel']);
+ delete
+--------------------------------
+ 'base':7 'hidden':6 'strike':3
+(1 row)
+
+SELECT delete('base hidden rebel spaceship strike'::tsvector, ARRAY['spaceship','leya','rebel']);
+ delete
+--------------------------
+ 'base' 'hidden' 'strike'
+(1 row)
+
+SELECT delete('base hidden rebel spaceship strike'::tsvector, ARRAY['spaceship','leya','rebel', NULL]);
+ERROR: name or argument lists may not contain nulls
+SELECT unnest('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);
+ unnest
+---------------------------------------------
+ (base,{7},{D})
+ (hidden,{6},{D})
+ (rebel,{1},{D})
+ (spaceship,"{2,33,34,35,36}","{D,A,B,C,D}")
+ (strike,{3},{D})
+(5 rows)
+
+SELECT unnest('base hidden rebel spaceship strike'::tsvector);
+ unnest
+---------------
+ (base,,)
+ (hidden,,)
+ (rebel,,)
+ (spaceship,,)
+ (strike,,)
+(5 rows)
+
+SELECT * FROM unnest('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);
+ lexeme | positions | weights
+-----------+-----------------+-------------
+ base | {7} | {D}
+ hidden | {6} | {D}
+ rebel | {1} | {D}
+ spaceship | {2,33,34,35,36} | {D,A,B,C,D}
+ strike | {3} | {D}
+(5 rows)
+
+SELECT * FROM unnest('base hidden rebel spaceship strike'::tsvector);
+ lexeme | positions | weights
+-----------+-----------+---------
+ base | |
+ hidden | |
+ rebel | |
+ spaceship | |
+ strike | |
+(5 rows)
+
+SELECT lexeme, positions[1] from unnest('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);
+ lexeme | positions
+-----------+-----------
+ base | 7
+ hidden | 6
+ rebel | 1
+ spaceship | 2
+ strike | 3
+(5 rows)
+
+SELECT to_array('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);
+ to_array
+--------------------------------------
+ {base,hidden,rebel,spaceship,strike}
+(1 row)
+
+SELECT to_array('base hidden rebel spaceship strike'::tsvector);
+ to_array
+--------------------------------------
+ {base,hidden,rebel,spaceship,strike}
+(1 row)
+
+SELECT to_tsvector(ARRAY['base','hidden','rebel','spaceship','strike']);
+ to_tsvector
+----------------------------------------------
+ 'base' 'hidden' 'rebel' 'spaceship' 'strike'
+(1 row)
+
+SELECT to_tsvector(ARRAY['base','hidden','rebel','spaceship', NULL]);
+ERROR: name or argument lists may not contain nulls
+SELECT setweight('w:12B w:13* w:12,5,6 a:1,3* a:3 w asd:1dc asd zxc:81,567,222A'::tsvector, 'c');
+ setweight
+----------------------------------------------------------
+ 'a':1C,3C 'asd':1C 'w':5C,6C,12C,13C 'zxc':81C,222C,567C
+(1 row)
+
+SELECT setweight('a:1,3A asd:1C w:5,6,12B,13A zxc:81,222A,567'::tsvector, 'c');
+ setweight
+----------------------------------------------------------
+ 'a':1C,3C 'asd':1C 'w':5C,6C,12C,13C 'zxc':81C,222C,567C
+(1 row)
+
+SELECT setweight('a:1,3A asd:1C w:5,6,12B,13A zxc:81,222A,567'::tsvector, 'c', '{a}');
+ setweight
+------------------------------------------------------
+ 'a':1C,3C 'asd':1C 'w':5,6,12B,13A 'zxc':81,222A,567
+(1 row)
+
+SELECT setweight('a:1,3A asd:1C w:5,6,12B,13A zxc:81,222A,567'::tsvector, 'c', '{a}');
+ setweight
+------------------------------------------------------
+ 'a':1C,3C 'asd':1C 'w':5,6,12B,13A 'zxc':81,222A,567
+(1 row)
+
+SELECT setweight('a:1,3A asd:1C w:5,6,12B,13A zxc:81,222A,567'::tsvector, 'c', '{a,zxc}');
+ setweight
+--------------------------------------------------------
+ 'a':1C,3C 'asd':1C 'w':5,6,12B,13A 'zxc':81C,222C,567C
+(1 row)
+
+SELECT setweight('a asd w:5,6,12B,13A zxc'::tsvector, 'c', '{a,zxc}');
+ setweight
+---------------------------------
+ 'a' 'asd' 'w':5,6,12B,13A 'zxc'
+(1 row)
+
+SELECT setweight('a asd w:5,6,12B,13A zxc'::tsvector, 'c', ARRAY['a', 'zxc', NULL]);
+ERROR: name or argument lists may not contain nulls
+SELECT filter('base:7A empir:17 evil:15 first:11 galact:16 hidden:6A rebel:1A spaceship:2A strike:3A victori:12 won:9'::tsvector, '{a}');
+ filter
+-------------------------------------------------------------
+ 'base':7A 'hidden':6A 'rebel':1A 'spaceship':2A 'strike':3A
+(1 row)
+
+SELECT filter('base hidden rebel spaceship strike'::tsvector, '{a}');
+ filter
+--------
+
+(1 row)
+
+SELECT filter('base hidden rebel spaceship strike'::tsvector, '{a,b,NULL}');
+ERROR: name or argument lists may not contain nulls
diff --git a/src/test/regress/sql/tstypes.sql b/src/test/regress/sql/tstypes.sql
index fd7c702..f21efbb 100644
--- a/src/test/regress/sql/tstypes.sql
+++ b/src/test/regress/sql/tstypes.sql
@@ -14,8 +14,6 @@ SELECT $$'\\as' ab\c ab\\c AB\\\c ab\\\\c$$::tsvector;
SELECT tsvectorin(tsvectorout($$'\\as' ab\c ab\\c AB\\\c ab\\\\c$$::tsvector));
SELECT '''w'':4A,3B,2C,1D,5 a:8';
SELECT 'a:3A b:2a'::tsvector || 'ba:1234 a:1B';
-SELECT setweight('w:12B w:13* w:12,5,6 a:1,3* a:3 w asd:1dc asd zxc:81,567,222A'::tsvector, 'c');
-SELECT strip('w:12B w:13* w:12,5,6 a:1,3* a:3 w asd:1dc asd'::tsvector);
--Base tsquery test
SELECT '1'::tsquery;
@@ -115,3 +113,48 @@ SELECT ts_rank_cd(' a:1 s:2 d g'::tsvector, 'a | s');
SELECT ts_rank_cd(' a:1 s:2C d g'::tsvector, 'a & s');
SELECT ts_rank_cd(' a:1 s:2B d g'::tsvector, 'a & s');
SELECT ts_rank_cd(' a:1 s:2 d g'::tsvector, 'a & s');
+
+-- tsvector editing operations
+
+SELECT strip('w:12B w:13* w:12,5,6 a:1,3* a:3 w asd:1dc asd'::tsvector);
+SELECT strip('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);
+SELECT strip('base hidden rebel spaceship strike'::tsvector);
+
+SELECT delete(to_tsvector('english', 'Rebel spaceships, striking from a hidden base'), 'spaceship');
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, 'base');
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, 'bas');
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, 'bases');
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, 'spaceship');
+SELECT delete('base hidden rebel spaceship strike'::tsvector, 'spaceship');
+
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, ARRAY['spaceship','rebel']);
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, ARRAY['spaceships','rebel']);
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, ARRAY['spaceshi','rebel']);
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, ARRAY['spaceship','leya','rebel']);
+SELECT delete('base hidden rebel spaceship strike'::tsvector, ARRAY['spaceship','leya','rebel']);
+SELECT delete('base hidden rebel spaceship strike'::tsvector, ARRAY['spaceship','leya','rebel', NULL]);
+
+SELECT unnest('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);
+SELECT unnest('base hidden rebel spaceship strike'::tsvector);
+SELECT * FROM unnest('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);
+SELECT * FROM unnest('base hidden rebel spaceship strike'::tsvector);
+SELECT lexeme, positions[1] from unnest('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);
+
+SELECT to_array('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);
+SELECT to_array('base hidden rebel spaceship strike'::tsvector);
+
+SELECT to_tsvector(ARRAY['base','hidden','rebel','spaceship','strike']);
+SELECT to_tsvector(ARRAY['base','hidden','rebel','spaceship', NULL]);
+
+SELECT setweight('w:12B w:13* w:12,5,6 a:1,3* a:3 w asd:1dc asd zxc:81,567,222A'::tsvector, 'c');
+SELECT setweight('a:1,3A asd:1C w:5,6,12B,13A zxc:81,222A,567'::tsvector, 'c');
+SELECT setweight('a:1,3A asd:1C w:5,6,12B,13A zxc:81,222A,567'::tsvector, 'c', '{a}');
+SELECT setweight('a:1,3A asd:1C w:5,6,12B,13A zxc:81,222A,567'::tsvector, 'c', '{a}');
+SELECT setweight('a:1,3A asd:1C w:5,6,12B,13A zxc:81,222A,567'::tsvector, 'c', '{a,zxc}');
+SELECT setweight('a asd w:5,6,12B,13A zxc'::tsvector, 'c', '{a,zxc}');
+SELECT setweight('a asd w:5,6,12B,13A zxc'::tsvector, 'c', ARRAY['a', 'zxc', NULL]);
+
+SELECT filter('base:7A empir:17 evil:15 first:11 galact:16 hidden:6A rebel:1A spaceship:2A strike:3A victori:12 won:9'::tsvector, '{a}');
+SELECT filter('base hidden rebel spaceship strike'::tsvector, '{a}');
+SELECT filter('base hidden rebel spaceship strike'::tsvector, '{a,b,NULL}');
+
Thanks! Fixed and added tests.
Thank you!
I did some patch cleanup/fix, but I have some doubt with function's names:
1 to_tsvector:
# \df to_tsvector
List of functions
Schema | Name | Result data type | Argument data types | Type
------------+-------------+------------------+---------------------+--------
pg_catalog | to_tsvector | tsvector | regconfig, text | normal
pg_catalog | to_tsvector | tsvector | text | normal
pg_catalog | to_tsvector | tsvector | text[] | normal
First two variants of to_tsvector make a morphological processing, last one doesn't.
2 to_array
# \df *to_array
List of functions
Schema | Name | Result data type | Argument data types |
Type
------------+-----------------------+------------------+---------------------+--------
pg_catalog | regexp_split_to_array | text[] | text, text |
normal
pg_catalog | regexp_split_to_array | text[] | text, text, text |
normal
pg_catalog | string_to_array | text[] | text, text |
normal
pg_catalog | string_to_array | text[] | text, text, text |
normal
pg_catalog | to_array | text[] | tsvector |
normal
Seems, to_array is not a right name compared to other *to_array.
I would like to suggest rename both functions to array_to_tsvector and
tsvector_to_array to have consistent name. Later we could add
to_tsvector([regconfig, ], text[]) with morphological processing.
Thoughts?
--
Teodor Sigaev E-mail: teodor@sigaev.ru
WWW: http://www.sigaev.ru/
Attachments:
tsvector_ops-v5.difftext/x-patch; name=tsvector_ops-v5.diffDownload
diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index 4b5ee81..ed0b6be 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -9211,16 +9211,29 @@ CREATE TYPE rainbow AS ENUM ('red', 'orange', 'yellow', 'green', 'blue', 'purple
<indexterm>
<primary>setweight</primary>
</indexterm>
- <literal><function>setweight(<type>tsvector</>, <type>"char"</>)</function></literal>
+ <literal><function>setweight(<replaceable class="PARAMETER">vector</replaceable> <type>tsvector</>, <replaceable class="PARAMETER">weight</replaceable> <type>"char"</>)</function></literal>
</entry>
<entry><type>tsvector</type></entry>
- <entry>assign weight to each element of <type>tsvector</></entry>
+ <entry>assign <replaceable class="PARAMETER">weight</replaceable> to each element of <replaceable class="PARAMETER">vector</replaceable></entry>
<entry><literal>setweight('fat:2,4 cat:3 rat:5B'::tsvector, 'A')</literal></entry>
<entry><literal>'cat':3A 'fat':2A,4A 'rat':5A</literal></entry>
</row>
<row>
<entry>
<indexterm>
+ <primary>setweight</primary>
+ <secondary>setweight by filter</secondary>
+ </indexterm>
+ <literal><function>setweight(<replaceable class="PARAMETER">vector</replaceable> <type>tsvector</>, <replaceable class="PARAMETER">weight</replaceable> <type>"char"</>, <replaceable class="PARAMETER">lexemes</replaceable> <type>"text"[]</>)</function></literal>
+ </entry>
+ <entry><type>tsvector</type></entry>
+ <entry>assign <replaceable class="PARAMETER">weight</replaceable> to elements of <replaceable class="PARAMETER">vector</replaceable> that are listed in <replaceable class="PARAMETER">lexemes</replaceable> array</entry>
+ <entry><literal>setweight('fat:2,4 cat:3 rat:5B'::tsvector, 'A', '{cat,rat}')</literal></entry>
+ <entry><literal>'cat':3A 'fat':2,4 'rat':5A</literal></entry>
+ </row>
+ <row>
+ <entry>
+ <indexterm>
<primary>strip</primary>
</indexterm>
<literal><function>strip(<type>tsvector</>)</function></literal>
@@ -9233,6 +9246,81 @@ CREATE TYPE rainbow AS ENUM ('red', 'orange', 'yellow', 'green', 'blue', 'purple
<row>
<entry>
<indexterm>
+ <primary>delete</primary>
+ <secondary>delete lemexeme</secondary>
+ </indexterm>
+ <literal><function>delete(<replaceable class="PARAMETER">vector</replaceable> <type>tsvector</>, <replaceable class="PARAMETER">lexeme</replaceable> <type>text</>)</function></literal>
+ </entry>
+ <entry><type>tsvector</type></entry>
+ <entry>remove given <replaceable class="PARAMETER">lexeme</replaceable> from <replaceable class="PARAMETER">vector</replaceable></entry>
+ <entry><literal>delete('fat:2,4 cat:3 rat:5A'::tsvector, 'fat')</literal></entry>
+ <entry><literal>'cat':3 'rat':5A</literal></entry>
+ </row>
+ <row>
+ <entry>
+ <indexterm>
+ <primary>delete</primary>
+ <secondary>delete lemexemes array</secondary>
+ </indexterm>
+ <literal><function>delete(<replaceable class="PARAMETER">vector</replaceable> <type>tsvector</>, <replaceable class="PARAMETER">lexemes</replaceable> <type>text[]</>)</function></literal>
+ </entry>
+ <entry><type>tsvector</type></entry>
+ <entry>remove any occurrence of lexemes in <replaceable class="PARAMETER">lexemes</replaceable> array from <replaceable class="PARAMETER">vector</replaceable></entry>
+ <entry><literal>delete('fat:2,4 cat:3 rat:5A'::tsvector, ARRAY['fat','rat'])</literal></entry>
+ <entry><literal>'cat':3</literal></entry>
+ </row>
+ <row>
+ <entry>
+ <indexterm>
+ <primary>unnest</primary>
+ </indexterm>
+ <literal><function>unnest(<type>tsvector</>, OUT <replaceable class="PARAMETER">lexeme</> <type>text</>, OUT <replaceable class="PARAMETER">positions</> <type>smallint[]</>, OUT <replaceable class="PARAMETER">weights</> <type>text</>)</function></literal>
+ </entry>
+ <entry><type>setof record</type></entry>
+ <entry>expand a tsvector to a set of rows</entry>
+ <entry><literal>unnest('fat:2,4 cat:3 rat:5A'::tsvector)</literal></entry>
+ <entry><literal>(cat,{3},{D}) ...</literal></entry>
+ </row>
+ <row>
+ <entry>
+ <indexterm>
+ <primary>to_array</primary>
+ </indexterm>
+ <literal><function>to_array(<type>tsvector</>)</function></literal>
+ </entry>
+ <entry><type>text[]</type></entry>
+ <entry>convert <type>tsvector</> to array of lexemes</entry>
+ <entry><literal>to_array('fat:2,4 cat:3 rat:5A'::tsvector)</literal></entry>
+ <entry><literal>{cat,fat,rat}</literal></entry>
+ </row>
+ <row>
+ <entry>
+ <indexterm>
+ <primary>to_tsvector</primary>
+ <secondary>array to tsvector</secondary>
+ </indexterm>
+ <literal><function>to_tsvector(<type>text[]</>)</function></literal>
+ </entry>
+ <entry><type>tsvector</type></entry>
+ <entry>convert array of lexemes to <type>tsvector</type></entry>
+ <entry><literal>to_tsvector('{fat,cat,rat}'::text[])</literal></entry>
+ <entry><literal>'fat' 'cat' 'rat'</literal></entry>
+ </row>
+ <row>
+ <entry>
+ <indexterm>
+ <primary>filter</primary>
+ </indexterm>
+ <literal><function>filter(<replaceable class="PARAMETER">vector</replaceable> <type>tsvector</>, <replaceable class="PARAMETER">weights</replaceable> <type>"char"[]</>)</function></literal>
+ </entry>
+ <entry><type>tsvector</type></entry>
+ <entry>Select only elements with given <replaceable class="PARAMETER">weights</replaceable> from <replaceable class="PARAMETER">vector</replaceable></entry>
+ <entry><literal>filter('fat:2,4 cat:3b rat:5A'::tsvector, '{a,b}')</literal></entry>
+ <entry><literal>'cat':3B 'rat':5A</literal></entry>
+ </row>
+ <row>
+ <entry>
+ <indexterm>
<primary>to_tsquery</primary>
</indexterm>
<literal><function>to_tsquery(<optional> <replaceable class="PARAMETER">config</> <type>regconfig</> , </optional> <replaceable class="PARAMETER">query</> <type>text</type>)</function></literal>
diff --git a/doc/src/sgml/textsearch.sgml b/doc/src/sgml/textsearch.sgml
index ff99976..ea3abc9 100644
--- a/doc/src/sgml/textsearch.sgml
+++ b/doc/src/sgml/textsearch.sgml
@@ -1326,6 +1326,10 @@ FROM (SELECT id, body, q, ts_rank_cd(ti, q) AS rank
</variablelist>
+ <para>
+ Full list of <type>tsvector</>-related functions available in <xref linkend="textsearch-functions-table">.
+ </para>
+
</sect2>
<sect2 id="textsearch-manipulate-tsquery">
diff --git a/src/backend/utils/adt/tsvector_op.c b/src/backend/utils/adt/tsvector_op.c
index a3f1c361..77668cb 100644
--- a/src/backend/utils/adt/tsvector_op.c
+++ b/src/backend/utils/adt/tsvector_op.c
@@ -14,6 +14,7 @@
#include "postgres.h"
+#include "access/htup_details.h"
#include "catalog/namespace.h"
#include "catalog/pg_type.h"
#include "commands/trigger.h"
@@ -65,6 +66,7 @@ typedef struct
#define STATHDRSIZE (offsetof(TSVectorStat, data))
static Datum tsvector_update_trigger(PG_FUNCTION_ARGS, bool config_column);
+static int tsvector_bsearch(TSVector tsin, char *lexin, int lexin_len);
/*
* Order: haspos, len, word, for all positions (pos, weight)
@@ -251,6 +253,90 @@ tsvector_setweight(PG_FUNCTION_ARGS)
PG_RETURN_POINTER(out);
}
+/*
+ * setweight(tsin tsvector, char_weight "char", lexemes "text"[])
+ *
+ * Assign weight w to elements of tsin that are listed in lexemes.
+ */
+Datum
+tsvector_setweight_by_filter(PG_FUNCTION_ARGS)
+{
+ TSVector tsin = PG_GETARG_TSVECTOR(0);
+ char char_weight = PG_GETARG_CHAR(1);
+ ArrayType *lexemes = PG_GETARG_ARRAYTYPE_P(2);
+
+ TSVector tsout;
+ int i,
+ j,
+ nlexemes,
+ weight;
+ WordEntry *entry;
+ Datum *dlexemes;
+ bool *nulls;
+
+ switch (char_weight)
+ {
+ case 'A': case 'a':
+ weight = 3;
+ break;
+ case 'B': case 'b':
+ weight = 2;
+ break;
+ case 'C': case 'c':
+ weight = 1;
+ break;
+ case 'D': case 'd':
+ weight = 0;
+ break;
+ default:
+ /* internal error */
+ elog(ERROR, "unrecognized weight: %c", char_weight);
+ }
+
+ tsout = (TSVector) palloc(VARSIZE(tsin));
+ memcpy(tsout, tsin, VARSIZE(tsin));
+ entry = ARRPTR(tsout);
+
+ deconstruct_array(lexemes, TEXTOID, -1, false, 'i',
+ &dlexemes, &nulls, &nlexemes);
+
+ /*
+ * Assuming that lexemes array is significantly shorter than tsvector
+ * we can iterate through lexemes performing binary search
+ * of each lexeme from lexemes in tsvector.
+ */
+ for (i = 0; i < nlexemes; i++)
+ {
+ char *lex;
+ int lex_len,
+ lex_pos;
+
+ if (nulls[i])
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("name or argument lists may not contain nulls")));
+
+ lex = VARDATA(dlexemes[i]);
+ lex_len = VARSIZE_ANY_EXHDR(dlexemes[i]);
+ lex_pos = tsvector_bsearch(tsout, lex, lex_len);
+
+ if (lex_pos >= 0 && (j = POSDATALEN(tsout, entry + lex_pos)) != 0)
+ {
+ WordEntryPos *p = POSDATAPTR(tsout, entry + lex_pos);
+ while (j--)
+ {
+ WEP_SETWEIGHT(*p, weight);
+ p++;
+ }
+ }
+ }
+
+ PG_FREE_IF_COPY(tsin, 0);
+ PG_FREE_IF_COPY(lexemes, 2);
+
+ PG_RETURN_POINTER(tsout);
+}
+
#define compareEntry(pa, a, pb, b) \
tsCompareString((pa) + (a)->pos, (a)->len, \
(pb) + (b)->pos, (b)->len, \
@@ -291,6 +377,483 @@ add_pos(TSVector src, WordEntry *srcptr,
return *clen - startlen;
}
+/*
+ * Perform binary search of given lexeme in TSVector.
+ * Returns lexeme position in TSVector's entry array or -1 if lexeme wasn't
+ * found.
+ */
+static int
+tsvector_bsearch(const TSVector tsv, char *lexeme, int lexeme_len)
+{
+ WordEntry *arrin = ARRPTR(tsv);
+ int StopLow = 0,
+ StopHigh = tsv->size,
+ StopMiddle,
+ cmp;
+
+ while (StopLow < StopHigh)
+ {
+ StopMiddle = (StopLow + StopHigh)/2;
+
+ cmp = tsCompareString(lexeme, lexeme_len,
+ STRPTR(tsv) + arrin[StopMiddle].pos,
+ arrin[StopMiddle].len,
+ false);
+
+ if (cmp < 0)
+ StopHigh = StopMiddle;
+ else if (cmp > 0)
+ StopLow = StopMiddle + 1;
+ else /* found it */
+ return StopMiddle;
+ }
+
+ return -1;
+}
+
+static int
+compareint(const void *va, const void *vb)
+{
+ int32 a = *((const int32 *) va);
+ int32 b = *((const int32 *) vb);
+
+ if (a == b)
+ return 0;
+ return (a > b) ? 1 : -1;
+}
+
+/*
+ * Internal routine to delete lexemes from TSVector by array of offsets.
+ *
+ * int *indices_to_delete -- array of lexeme offsets to delete
+ * int indices_count -- size of that array
+ *
+ * Returns new TSVector without given lexemes along with their positions
+ * and weights.
+ */
+static TSVector
+tsvector_delete_by_indices(TSVector tsv, int *indices_to_delete,
+ int indices_count)
+{
+ TSVector tsout;
+ WordEntry *arrin = ARRPTR(tsv),
+ *arrout;
+ char *data = STRPTR(tsv),
+ *dataout;
+ int i, j, k,
+ curoff;
+
+ /*
+ * Here we overestimates tsout size, since we don't know exact size
+ * occupied by positions and weights. We will set exact size later
+ * after a pass through TSVector.
+ */
+ tsout = (TSVector) palloc0(VARSIZE(tsv));
+ arrout = ARRPTR(tsout);
+ tsout->size = tsv->size - indices_count;
+
+ /* Sort our filter array to simplify membership check later. */
+ if (indices_count > 1)
+ qsort(indices_to_delete, indices_count, sizeof(int), compareint);
+
+ /*
+ * Copy tsv to tsout skipping lexemes that enlisted in indices_to_delete.
+ */
+ curoff = 0;
+ dataout = STRPTR(tsout);
+ for (i = j = k = 0; i < tsv->size; i++)
+ {
+ /*
+ * Here we should check whether current i is present in
+ * indices_to_delete or not. Since indices_to_delete is already
+ * sorted we can advance it index only when we have match.
+ */
+ if (k < indices_count && i == indices_to_delete[k]){
+ k++;
+ continue;
+ }
+
+ /* Copy lexeme, it's positions and weights */
+ memcpy(dataout + curoff, data + arrin[i].pos, arrin[i].len);
+ arrout[j].haspos = arrin[i].haspos;
+ arrout[j].len = arrin[i].len;
+ arrout[j].pos = curoff;
+ curoff += arrin[i].len;
+ if (arrin[i].haspos)
+ {
+ int len = POSDATALEN(tsv, arrin+i) * sizeof(WordEntryPos) +
+ sizeof(uint16);
+ curoff = SHORTALIGN(curoff);
+ memcpy(dataout + curoff,
+ STRPTR(tsv) + SHORTALIGN(arrin[i].pos + arrin[i].len),
+ len);
+ curoff += len;
+ }
+
+ j++;
+ }
+
+ /*
+ * After the pass through TSVector k should equals exactly to indices_count.
+ * If it isn't then the caller provided us with indices outside of
+ * [0, tsv->size) range and estimation of tsout's size is wrong.
+ */
+ Assert(k == indices_count);
+
+ SET_VARSIZE(tsout, CALCDATASIZE(tsout->size, curoff));
+ return tsout;
+}
+
+/*
+ * Delete given lexeme from tsvector.
+ * Implementation of user-level delete(tsvector, text).
+ */
+Datum
+tsvector_delete_str(PG_FUNCTION_ARGS)
+{
+ TSVector tsin = PG_GETARG_TSVECTOR(0),
+ tsout;
+ text *tlexeme = PG_GETARG_TEXT_P(1);
+ char *lexeme = VARDATA(tlexeme);
+ int lexeme_len = VARSIZE_ANY_EXHDR(tlexeme),
+ skip_index;
+
+ if ((skip_index = tsvector_bsearch(tsin, lexeme, lexeme_len)) == -1)
+ PG_RETURN_POINTER(tsin);
+
+ tsout = tsvector_delete_by_indices(tsin, &skip_index, 1);
+
+ PG_FREE_IF_COPY(tsin, 0);
+ PG_FREE_IF_COPY(tlexeme, 1);
+ PG_RETURN_POINTER(tsout);
+}
+
+/*
+ * Delete given array of lexemes from tsvector.
+ * Implementation of user-level delete(tsvector, text[]).
+ */
+Datum
+tsvector_delete_arr(PG_FUNCTION_ARGS)
+{
+ TSVector tsin = PG_GETARG_TSVECTOR(0),
+ tsout;
+ ArrayType *lexemes = PG_GETARG_ARRAYTYPE_P(1);
+ int i, nlex,
+ skip_count,
+ *skip_indices;
+ Datum *dlexemes;
+ bool *nulls;
+
+ deconstruct_array(lexemes, TEXTOID, -1, false, 'i',
+ &dlexemes, &nulls, &nlex);
+
+ /*
+ * In typical use case array of lexemes to delete is relatively small.
+ * So here we optimizing things for that scenario: iterate through lexarr
+ * performing binary search of each lexeme from lexarr in tsvector.
+ */
+ skip_indices = palloc0(nlex * sizeof(int));
+ for (i = skip_count = 0; i < nlex; i++)
+ {
+ char *lex;
+ int lex_len,
+ lex_pos;
+
+ if (nulls[i])
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("name or argument lists may not contain nulls")));
+
+ lex = VARDATA(dlexemes[i]);
+ lex_len = VARSIZE_ANY_EXHDR(dlexemes[i]);
+ lex_pos = tsvector_bsearch(tsin, lex, lex_len);
+
+ if (lex_pos >= 0)
+ skip_indices[skip_count++] = lex_pos;
+ }
+
+ tsout = tsvector_delete_by_indices(tsin, skip_indices, skip_count);
+
+ pfree(skip_indices);
+ PG_FREE_IF_COPY(tsin, 0);
+ PG_FREE_IF_COPY(lexemes, 1);
+
+ PG_RETURN_POINTER(tsout);
+}
+
+/*
+ * Expand tsvector as table with following columns:
+ * lexeme: lexeme text
+ * positions: integer array of lexeme positions
+ * weights: char array of weights corresponding to positions
+ */
+Datum
+tsvector_unnest(PG_FUNCTION_ARGS)
+{
+ FuncCallContext *funcctx;
+ TSVector tsin;
+
+ if (SRF_IS_FIRSTCALL())
+ {
+ MemoryContext oldcontext;
+ TupleDesc tupdesc;
+
+ funcctx = SRF_FIRSTCALL_INIT();
+ oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
+
+ tupdesc = CreateTemplateTupleDesc(3, false);
+ TupleDescInitEntry(tupdesc, (AttrNumber) 1, "lexeme",
+ TEXTOID, -1, 0);
+ TupleDescInitEntry(tupdesc, (AttrNumber) 2, "positions",
+ INT2ARRAYOID, -1, 0);
+ TupleDescInitEntry(tupdesc, (AttrNumber) 3, "weights",
+ TEXTARRAYOID, -1, 0);
+ funcctx->tuple_desc = BlessTupleDesc(tupdesc);
+
+ funcctx->user_fctx = PG_GETARG_TSVECTOR_COPY(0);
+
+ MemoryContextSwitchTo(oldcontext);
+ }
+
+ funcctx = SRF_PERCALL_SETUP();
+ tsin = (TSVector) funcctx->user_fctx;
+
+ if (funcctx->call_cntr < tsin->size)
+ {
+ WordEntry *arrin = ARRPTR(tsin);
+ char *data = STRPTR(tsin);
+ HeapTuple tuple;
+ int j,
+ i = funcctx->call_cntr;
+ bool nulls[] = {false, false, false};
+ Datum values[3];
+
+ values[0] = PointerGetDatum(
+ cstring_to_text_with_len(data + arrin[i].pos, arrin[i].len)
+ );
+
+ if (arrin[i].haspos)
+ {
+ WordEntryPosVector *posv;
+ Datum *positions;
+ Datum *weights;
+ char weight;
+
+ /*
+ * Internally tsvector stores position and weight in the same
+ * uint16 (2 bits for weight, 14 for position). Here we extract that
+ * in two separate arrays.
+ */
+ posv = _POSVECPTR(tsin, arrin + i);
+ positions = palloc(posv->npos * sizeof(Datum));
+ weights = palloc(posv->npos * sizeof(Datum));
+ for (j = 0; j < posv->npos; j++)
+ {
+ positions[j] = Int16GetDatum(WEP_GETPOS(posv->pos[j]));
+ weight = 'D' - WEP_GETWEIGHT(posv->pos[j]);
+ weights[j] = PointerGetDatum(
+ cstring_to_text_with_len(&weight, 1)
+ );
+ }
+
+ values[1] = PointerGetDatum(
+ construct_array(positions, posv->npos, INT2OID, 2, true, 's'));
+ values[2] = PointerGetDatum(
+ construct_array(weights, posv->npos, TEXTOID, -1, false, 'i'));
+ }
+ else
+ {
+ nulls[1] = nulls[2] = true;
+ }
+
+ tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls);
+ SRF_RETURN_NEXT(funcctx, HeapTupleGetDatum(tuple));
+ }
+ else
+ {
+ pfree(tsin);
+ SRF_RETURN_DONE(funcctx);
+ }
+}
+
+/*
+ * Convert tsvector to array of lexemes.
+ */
+Datum
+tsvector_to_array(PG_FUNCTION_ARGS)
+{
+ TSVector tsin = PG_GETARG_TSVECTOR(0);
+ WordEntry *arrin = ARRPTR(tsin);
+ Datum elements[tsin->size];
+ int i;
+ ArrayType *array;
+
+ for (i = 0; i < tsin->size; i++)
+ {
+ elements[i] = PointerGetDatum(
+ cstring_to_text_with_len(STRPTR(tsin) + arrin[i].pos, arrin[i].len)
+ );
+ }
+
+ array = construct_array(elements, tsin->size, TEXTOID, -1, false, 'i');
+ PG_FREE_IF_COPY(tsin, 0);
+ PG_RETURN_POINTER(array);
+}
+
+/*
+ * Build tsvector from array of lexemes.
+ */
+Datum
+array_to_tsvector(PG_FUNCTION_ARGS)
+{
+ ArrayType *v = PG_GETARG_ARRAYTYPE_P(0);
+ TSVector tsout;
+ Datum *dlexemes;
+ WordEntry *arrout;
+ bool *nulls;
+ int nitems,
+ i,
+ tslen,
+ datalen = 0;
+ char *cur;
+
+ deconstruct_array(v, TEXTOID, -1, false, 'i', &dlexemes, &nulls, &nitems);
+
+ for (i = 0; i < nitems; i++)
+ {
+ if (nulls[i])
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("name or argument lists may not contain nulls")));
+
+ datalen += VARSIZE_ANY_EXHDR(dlexemes[i]);
+ }
+
+ tslen = CALCDATASIZE(nitems, datalen);
+ tsout = (TSVector) palloc0(tslen);
+ SET_VARSIZE(tsout, tslen);
+ tsout->size = nitems;
+ arrout = ARRPTR(tsout);
+ cur = STRPTR(tsout);
+
+ for (i = 0; i < nitems; i++)
+ {
+ char *lex = VARDATA(dlexemes[i]);
+ int lex_len = VARSIZE_ANY_EXHDR(dlexemes[i]);
+
+ memcpy(cur, lex, lex_len);
+ arrout[i].haspos = 0;
+ arrout[i].len = lex_len;
+ arrout[i].pos = cur - STRPTR(tsout);
+ cur += lex_len;
+ }
+
+ PG_FREE_IF_COPY(v, 0);
+ PG_RETURN_POINTER(tsout);
+}
+
+/*
+ * Leave only elements with given weights from tsvector.
+ */
+Datum
+tsvector_filter(PG_FUNCTION_ARGS)
+{
+ TSVector tsin = PG_GETARG_TSVECTOR(0),
+ tsout;
+ ArrayType *weights = PG_GETARG_ARRAYTYPE_P(1);
+ WordEntry *arrin = ARRPTR(tsin),
+ *arrout;
+ char *datain = STRPTR(tsin),
+ *dataout;
+ Datum *dweights;
+ bool *nulls;
+ int nweigths;
+ int i, j;
+ char mask = 0,
+ cur_pos = 0;
+
+ deconstruct_array(weights, CHAROID, 1, true, 'c',
+ &dweights, &nulls, &nweigths);
+
+ for (i = 0; i < nweigths; i++)
+ {
+ char char_weight;
+
+ if (nulls[i])
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("name or argument lists may not contain nulls")));
+
+ char_weight = DatumGetChar(dweights[i]);
+ switch (char_weight)
+ {
+ case 'A': case 'a':
+ mask = mask | 8;
+ break;
+ case 'B': case 'b':
+ mask = mask | 4;
+ break;
+ case 'C': case 'c':
+ mask = mask | 2;
+ break;
+ case 'D': case 'd':
+ mask = mask | 1;
+ break;
+ default:
+ /* internal error */
+ elog(ERROR, "unrecognized weight: %c", char_weight);
+ }
+ }
+
+ tsout = (TSVector) palloc0(VARSIZE(tsin));
+ tsout->size = tsin->size;
+ arrout = ARRPTR(tsout);
+ dataout = STRPTR(tsout);
+
+ for (i = j = 0; i < tsin->size; i++)
+ {
+ WordEntryPosVector *posvin,
+ *posvout;
+ int npos = 0;
+ int k;
+
+ if (!arrin[i].haspos)
+ continue;
+
+ posvin = _POSVECPTR(tsin, arrin + i);
+ posvout = (WordEntryPosVector *)
+ (dataout + SHORTALIGN(cur_pos + arrin[i].len));
+
+ for (k = 0; k < posvin->npos; k++)
+ {
+ if (mask & (1 << WEP_GETWEIGHT(posvin->pos[k])))
+ posvout->pos[npos++] = posvin->pos[k];
+ }
+
+ if (!npos) /* no satisfactory positions found, so skip that lexeme */
+ continue;
+
+ arrout[j].haspos = true;
+ arrout[j].len = arrin[i].len;
+ arrout[j].pos = cur_pos;
+
+ memcpy(dataout + cur_pos, datain + arrin[i].pos, arrin[i].len);
+ posvout->npos = npos;
+ cur_pos += SHORTALIGN(arrin[i].len);
+ cur_pos += POSDATALEN(tsout, arrout+j) * sizeof(WordEntryPos) +
+ sizeof(uint16);
+ j++;
+ }
+
+ tsout->size = j;
+ if (dataout != STRPTR(tsout))
+ memmove(STRPTR(tsout), dataout, cur_pos);
+
+ SET_VARSIZE(tsout, CALCDATASIZE(tsout->size, cur_pos));
+
+ PG_FREE_IF_COPY(tsin, 0);
+ PG_RETURN_POINTER(tsout);
+}
Datum
tsvector_concat(PG_FUNCTION_ARGS)
diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h
index a0f821a..5a5f60a 100644
--- a/src/include/catalog/pg_proc.h
+++ b/src/include/catalog/pg_proc.h
@@ -4496,8 +4496,22 @@ DESCR("number of lexemes");
DATA(insert OID = 3623 ( strip PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 3614 "3614" _null_ _null_ _null_ _null_ _null_ tsvector_strip _null_ _null_ _null_ ));
DESCR("strip position information");
DATA(insert OID = 3624 ( setweight PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 3614 "3614 18" _null_ _null_ _null_ _null_ _null_ tsvector_setweight _null_ _null_ _null_ ));
-DESCR("set weight of lexeme's entries");
-DATA(insert OID = 3625 ( tsvector_concat PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 3614 "3614 3614" _null_ _null_ _null_ _null_ _null_ tsvector_concat _null_ _null_ _null_ ));
+DESCR("set given weight for whole tsvector");
+DATA(insert OID = 3320 ( setweight PGNSP PGUID 12 1 0 0 0 f f f f t f i s 3 0 3614 "3614 18 1009" _null_ _null_ _null_ _null_ _null_ tsvector_setweight_by_filter _null_ _null_ _null_ ));
+DESCR("set given weight for given lexemes");
+DATA(insert OID = 3625 ( tsvector_concat PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 3614 "3614 3614" _null_ _null_ _null_ _null_ _null_ tsvector_concat _null_ _null_ _null_ ));
+DATA(insert OID = 3321 ( delete PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 3614 "3614 25" _null_ _null_ _null_ _null_ _null_ tsvector_delete_str _null_ _null_ _null_ ));
+DESCR("delete lexeme");
+DATA(insert OID = 3323 ( delete PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 3614 "3614 1009" _null_ _null_ _null_ _null_ _null_ tsvector_delete_arr _null_ _null_ _null_ ));
+DESCR("delete given lexemes");
+DATA(insert OID = 3322 ( unnest PGNSP PGUID 12 1 10 0 0 f f f f t t i s 1 0 2249 "3614" "{3614,25,1005,1009}" "{i,o,o,o}" "{tsvector,lexeme,positions,weights}" _null_ _null_ tsvector_unnest _null_ _null_ _null_ ));
+DESCR("expand tsvector to set of rows");
+DATA(insert OID = 3326 ( to_array PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 1009 "3614" _null_ _null_ _null_ _null_ _null_ tsvector_to_array _null_ _null_ _null_ ));
+DESCR("convert to lexeme's array");
+DATA(insert OID = 3327 ( to_tsvector PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 3614 "1009" _null_ _null_ _null_ _null_ _null_ array_to_tsvector _null_ _null_ _null_ ));
+DESCR("build tsvector from lexeme's array");
+DATA(insert OID = 3319 ( filter PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 3614 "3614 1002" _null_ _null_ _null_ _null_ _null_ tsvector_filter _null_ _null_ _null_ ));
+DESCR("returns tsvector that contain only postings with given weights");
DATA(insert OID = 3634 ( ts_match_vq PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 16 "3614 3615" _null_ _null_ _null_ _null_ _null_ ts_match_vq _null_ _null_ _null_ ));
DATA(insert OID = 3635 ( ts_match_qv PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 16 "3615 3614" _null_ _null_ _null_ _null_ _null_ ts_match_qv _null_ _null_ _null_ ));
diff --git a/src/include/tsearch/ts_type.h b/src/include/tsearch/ts_type.h
index dc6067a..e70a303 100644
--- a/src/include/tsearch/ts_type.h
+++ b/src/include/tsearch/ts_type.h
@@ -141,7 +141,16 @@ extern Datum tsvector_cmp(PG_FUNCTION_ARGS);
extern Datum tsvector_length(PG_FUNCTION_ARGS);
extern Datum tsvector_strip(PG_FUNCTION_ARGS);
extern Datum tsvector_setweight(PG_FUNCTION_ARGS);
+extern Datum tsvector_setweight_by_filter(PG_FUNCTION_ARGS);
extern Datum tsvector_concat(PG_FUNCTION_ARGS);
+extern Datum tsvector_delete_str(PG_FUNCTION_ARGS);
+extern Datum tsvector_delete_arr(PG_FUNCTION_ARGS);
+extern Datum tsvector_delete_tsvector(PG_FUNCTION_ARGS);
+extern Datum tsvector_unnest(PG_FUNCTION_ARGS);
+extern Datum tsvector_to_array(PG_FUNCTION_ARGS);
+extern Datum array_to_tsvector(PG_FUNCTION_ARGS);
+extern Datum tsvector_filter(PG_FUNCTION_ARGS);
+extern Datum tsvector_shift(PG_FUNCTION_ARGS);
extern Datum tsvector_update_trigger_byid(PG_FUNCTION_ARGS);
extern Datum tsvector_update_trigger_bycolumn(PG_FUNCTION_ARGS);
diff --git a/src/test/regress/expected/tstypes.out b/src/test/regress/expected/tstypes.out
index 6284fb6..e409648 100644
--- a/src/test/regress/expected/tstypes.out
+++ b/src/test/regress/expected/tstypes.out
@@ -83,18 +83,6 @@ SELECT 'a:3A b:2a'::tsvector || 'ba:1234 a:1B';
'a':3A,4B 'b':2A 'ba':1237
(1 row)
-SELECT setweight('w:12B w:13* w:12,5,6 a:1,3* a:3 w asd:1dc asd zxc:81,567,222A'::tsvector, 'c');
- setweight
-----------------------------------------------------------
- 'a':1C,3C 'asd':1C 'w':5C,6C,12C,13C 'zxc':81C,222C,567C
-(1 row)
-
-SELECT strip('w:12B w:13* w:12,5,6 a:1,3* a:3 w asd:1dc asd'::tsvector);
- strip
----------------
- 'a' 'asd' 'w'
-(1 row)
-
--Base tsquery test
SELECT '1'::tsquery;
tsquery
@@ -625,3 +613,212 @@ SELECT ts_rank_cd(' a:1 s:2 d g'::tsvector, 'a & s');
0.1
(1 row)
+-- tsvector editing operations
+SELECT strip('w:12B w:13* w:12,5,6 a:1,3* a:3 w asd:1dc asd'::tsvector);
+ strip
+---------------
+ 'a' 'asd' 'w'
+(1 row)
+
+SELECT strip('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);
+ strip
+----------------------------------------------
+ 'base' 'hidden' 'rebel' 'spaceship' 'strike'
+(1 row)
+
+SELECT strip('base hidden rebel spaceship strike'::tsvector);
+ strip
+----------------------------------------------
+ 'base' 'hidden' 'rebel' 'spaceship' 'strike'
+(1 row)
+
+SELECT delete(to_tsvector('english', 'Rebel spaceships, striking from a hidden base'), 'spaceship');
+ delete
+------------------------------------------
+ 'base':7 'hidden':6 'rebel':1 'strike':3
+(1 row)
+
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, 'base');
+ delete
+--------------------------------------------------------------
+ 'hidden':6 'rebel':1 'spaceship':2,33A,34B,35C,36 'strike':3
+(1 row)
+
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, 'bas');
+ delete
+-----------------------------------------------------------------------
+ 'base':7 'hidden':6 'rebel':1 'spaceship':2,33A,34B,35C,36 'strike':3
+(1 row)
+
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, 'bases');
+ delete
+-----------------------------------------------------------------------
+ 'base':7 'hidden':6 'rebel':1 'spaceship':2,33A,34B,35C,36 'strike':3
+(1 row)
+
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, 'spaceship');
+ delete
+------------------------------------------
+ 'base':7 'hidden':6 'rebel':1 'strike':3
+(1 row)
+
+SELECT delete('base hidden rebel spaceship strike'::tsvector, 'spaceship');
+ delete
+----------------------------------
+ 'base' 'hidden' 'rebel' 'strike'
+(1 row)
+
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, ARRAY['spaceship','rebel']);
+ delete
+--------------------------------
+ 'base':7 'hidden':6 'strike':3
+(1 row)
+
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, ARRAY['spaceships','rebel']);
+ delete
+-------------------------------------------------------------
+ 'base':7 'hidden':6 'spaceship':2,33A,34B,35C,36 'strike':3
+(1 row)
+
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, ARRAY['spaceshi','rebel']);
+ delete
+-------------------------------------------------------------
+ 'base':7 'hidden':6 'spaceship':2,33A,34B,35C,36 'strike':3
+(1 row)
+
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, ARRAY['spaceship','leya','rebel']);
+ delete
+--------------------------------
+ 'base':7 'hidden':6 'strike':3
+(1 row)
+
+SELECT delete('base hidden rebel spaceship strike'::tsvector, ARRAY['spaceship','leya','rebel']);
+ delete
+--------------------------
+ 'base' 'hidden' 'strike'
+(1 row)
+
+SELECT delete('base hidden rebel spaceship strike'::tsvector, ARRAY['spaceship','leya','rebel', NULL]);
+ERROR: name or argument lists may not contain nulls
+SELECT unnest('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);
+ unnest
+---------------------------------------------
+ (base,{7},{D})
+ (hidden,{6},{D})
+ (rebel,{1},{D})
+ (spaceship,"{2,33,34,35,36}","{D,A,B,C,D}")
+ (strike,{3},{D})
+(5 rows)
+
+SELECT unnest('base hidden rebel spaceship strike'::tsvector);
+ unnest
+---------------
+ (base,,)
+ (hidden,,)
+ (rebel,,)
+ (spaceship,,)
+ (strike,,)
+(5 rows)
+
+SELECT * FROM unnest('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);
+ lexeme | positions | weights
+-----------+-----------------+-------------
+ base | {7} | {D}
+ hidden | {6} | {D}
+ rebel | {1} | {D}
+ spaceship | {2,33,34,35,36} | {D,A,B,C,D}
+ strike | {3} | {D}
+(5 rows)
+
+SELECT * FROM unnest('base hidden rebel spaceship strike'::tsvector);
+ lexeme | positions | weights
+-----------+-----------+---------
+ base | |
+ hidden | |
+ rebel | |
+ spaceship | |
+ strike | |
+(5 rows)
+
+SELECT lexeme, positions[1] from unnest('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);
+ lexeme | positions
+-----------+-----------
+ base | 7
+ hidden | 6
+ rebel | 1
+ spaceship | 2
+ strike | 3
+(5 rows)
+
+SELECT to_array('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);
+ to_array
+--------------------------------------
+ {base,hidden,rebel,spaceship,strike}
+(1 row)
+
+SELECT to_array('base hidden rebel spaceship strike'::tsvector);
+ to_array
+--------------------------------------
+ {base,hidden,rebel,spaceship,strike}
+(1 row)
+
+SELECT to_tsvector(ARRAY['base','hidden','rebel','spaceship','strike']);
+ to_tsvector
+----------------------------------------------
+ 'base' 'hidden' 'rebel' 'spaceship' 'strike'
+(1 row)
+
+SELECT to_tsvector(ARRAY['base','hidden','rebel','spaceship', NULL]);
+ERROR: name or argument lists may not contain nulls
+SELECT setweight('w:12B w:13* w:12,5,6 a:1,3* a:3 w asd:1dc asd zxc:81,567,222A'::tsvector, 'c');
+ setweight
+----------------------------------------------------------
+ 'a':1C,3C 'asd':1C 'w':5C,6C,12C,13C 'zxc':81C,222C,567C
+(1 row)
+
+SELECT setweight('a:1,3A asd:1C w:5,6,12B,13A zxc:81,222A,567'::tsvector, 'c');
+ setweight
+----------------------------------------------------------
+ 'a':1C,3C 'asd':1C 'w':5C,6C,12C,13C 'zxc':81C,222C,567C
+(1 row)
+
+SELECT setweight('a:1,3A asd:1C w:5,6,12B,13A zxc:81,222A,567'::tsvector, 'c', '{a}');
+ setweight
+------------------------------------------------------
+ 'a':1C,3C 'asd':1C 'w':5,6,12B,13A 'zxc':81,222A,567
+(1 row)
+
+SELECT setweight('a:1,3A asd:1C w:5,6,12B,13A zxc:81,222A,567'::tsvector, 'c', '{a}');
+ setweight
+------------------------------------------------------
+ 'a':1C,3C 'asd':1C 'w':5,6,12B,13A 'zxc':81,222A,567
+(1 row)
+
+SELECT setweight('a:1,3A asd:1C w:5,6,12B,13A zxc:81,222A,567'::tsvector, 'c', '{a,zxc}');
+ setweight
+--------------------------------------------------------
+ 'a':1C,3C 'asd':1C 'w':5,6,12B,13A 'zxc':81C,222C,567C
+(1 row)
+
+SELECT setweight('a asd w:5,6,12B,13A zxc'::tsvector, 'c', '{a,zxc}');
+ setweight
+---------------------------------
+ 'a' 'asd' 'w':5,6,12B,13A 'zxc'
+(1 row)
+
+SELECT setweight('a asd w:5,6,12B,13A zxc'::tsvector, 'c', ARRAY['a', 'zxc', NULL]);
+ERROR: name or argument lists may not contain nulls
+SELECT filter('base:7A empir:17 evil:15 first:11 galact:16 hidden:6A rebel:1A spaceship:2A strike:3A victori:12 won:9'::tsvector, '{a}');
+ filter
+-------------------------------------------------------------
+ 'base':7A 'hidden':6A 'rebel':1A 'spaceship':2A 'strike':3A
+(1 row)
+
+SELECT filter('base hidden rebel spaceship strike'::tsvector, '{a}');
+ filter
+--------
+
+(1 row)
+
+SELECT filter('base hidden rebel spaceship strike'::tsvector, '{a,b,NULL}');
+ERROR: name or argument lists may not contain nulls
diff --git a/src/test/regress/sql/tstypes.sql b/src/test/regress/sql/tstypes.sql
index fd7c702..f21efbb 100644
--- a/src/test/regress/sql/tstypes.sql
+++ b/src/test/regress/sql/tstypes.sql
@@ -14,8 +14,6 @@ SELECT $$'\\as' ab\c ab\\c AB\\\c ab\\\\c$$::tsvector;
SELECT tsvectorin(tsvectorout($$'\\as' ab\c ab\\c AB\\\c ab\\\\c$$::tsvector));
SELECT '''w'':4A,3B,2C,1D,5 a:8';
SELECT 'a:3A b:2a'::tsvector || 'ba:1234 a:1B';
-SELECT setweight('w:12B w:13* w:12,5,6 a:1,3* a:3 w asd:1dc asd zxc:81,567,222A'::tsvector, 'c');
-SELECT strip('w:12B w:13* w:12,5,6 a:1,3* a:3 w asd:1dc asd'::tsvector);
--Base tsquery test
SELECT '1'::tsquery;
@@ -115,3 +113,48 @@ SELECT ts_rank_cd(' a:1 s:2 d g'::tsvector, 'a | s');
SELECT ts_rank_cd(' a:1 s:2C d g'::tsvector, 'a & s');
SELECT ts_rank_cd(' a:1 s:2B d g'::tsvector, 'a & s');
SELECT ts_rank_cd(' a:1 s:2 d g'::tsvector, 'a & s');
+
+-- tsvector editing operations
+
+SELECT strip('w:12B w:13* w:12,5,6 a:1,3* a:3 w asd:1dc asd'::tsvector);
+SELECT strip('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);
+SELECT strip('base hidden rebel spaceship strike'::tsvector);
+
+SELECT delete(to_tsvector('english', 'Rebel spaceships, striking from a hidden base'), 'spaceship');
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, 'base');
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, 'bas');
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, 'bases');
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, 'spaceship');
+SELECT delete('base hidden rebel spaceship strike'::tsvector, 'spaceship');
+
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, ARRAY['spaceship','rebel']);
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, ARRAY['spaceships','rebel']);
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, ARRAY['spaceshi','rebel']);
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, ARRAY['spaceship','leya','rebel']);
+SELECT delete('base hidden rebel spaceship strike'::tsvector, ARRAY['spaceship','leya','rebel']);
+SELECT delete('base hidden rebel spaceship strike'::tsvector, ARRAY['spaceship','leya','rebel', NULL]);
+
+SELECT unnest('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);
+SELECT unnest('base hidden rebel spaceship strike'::tsvector);
+SELECT * FROM unnest('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);
+SELECT * FROM unnest('base hidden rebel spaceship strike'::tsvector);
+SELECT lexeme, positions[1] from unnest('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);
+
+SELECT to_array('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);
+SELECT to_array('base hidden rebel spaceship strike'::tsvector);
+
+SELECT to_tsvector(ARRAY['base','hidden','rebel','spaceship','strike']);
+SELECT to_tsvector(ARRAY['base','hidden','rebel','spaceship', NULL]);
+
+SELECT setweight('w:12B w:13* w:12,5,6 a:1,3* a:3 w asd:1dc asd zxc:81,567,222A'::tsvector, 'c');
+SELECT setweight('a:1,3A asd:1C w:5,6,12B,13A zxc:81,222A,567'::tsvector, 'c');
+SELECT setweight('a:1,3A asd:1C w:5,6,12B,13A zxc:81,222A,567'::tsvector, 'c', '{a}');
+SELECT setweight('a:1,3A asd:1C w:5,6,12B,13A zxc:81,222A,567'::tsvector, 'c', '{a}');
+SELECT setweight('a:1,3A asd:1C w:5,6,12B,13A zxc:81,222A,567'::tsvector, 'c', '{a,zxc}');
+SELECT setweight('a asd w:5,6,12B,13A zxc'::tsvector, 'c', '{a,zxc}');
+SELECT setweight('a asd w:5,6,12B,13A zxc'::tsvector, 'c', ARRAY['a', 'zxc', NULL]);
+
+SELECT filter('base:7A empir:17 evil:15 first:11 galact:16 hidden:6A rebel:1A spaceship:2A strike:3A victori:12 won:9'::tsvector, '{a}');
+SELECT filter('base hidden rebel spaceship strike'::tsvector, '{a}');
+SELECT filter('base hidden rebel spaceship strike'::tsvector, '{a,b,NULL}');
+
On 10 Mar 2016, at 20:29, Teodor Sigaev <teodor@sigaev.ru> wrote:
I would like to suggest rename both functions to array_to_tsvector and tsvector_to_array to have consistent name. Later we could add to_tsvector([regconfig, ], text[]) with morphological processing.
Thoughts?
Seems reasonable, done.
Attachments:
tsvector_ops-v6.diffapplication/octet-stream; name=tsvector_ops-v6.diffDownload
diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index 4b5ee81..000489d 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -9211,16 +9211,29 @@ CREATE TYPE rainbow AS ENUM ('red', 'orange', 'yellow', 'green', 'blue', 'purple
<indexterm>
<primary>setweight</primary>
</indexterm>
- <literal><function>setweight(<type>tsvector</>, <type>"char"</>)</function></literal>
+ <literal><function>setweight(<replaceable class="PARAMETER">vector</replaceable> <type>tsvector</>, <replaceable class="PARAMETER">weight</replaceable> <type>"char"</>)</function></literal>
</entry>
<entry><type>tsvector</type></entry>
- <entry>assign weight to each element of <type>tsvector</></entry>
+ <entry>assign <replaceable class="PARAMETER">weight</replaceable> to each element of <replaceable class="PARAMETER">vector</replaceable></entry>
<entry><literal>setweight('fat:2,4 cat:3 rat:5B'::tsvector, 'A')</literal></entry>
<entry><literal>'cat':3A 'fat':2A,4A 'rat':5A</literal></entry>
</row>
<row>
<entry>
<indexterm>
+ <primary>setweight</primary>
+ <secondary>setweight by filter</secondary>
+ </indexterm>
+ <literal><function>setweight(<replaceable class="PARAMETER">vector</replaceable> <type>tsvector</>, <replaceable class="PARAMETER">weight</replaceable> <type>"char"</>, <replaceable class="PARAMETER">lexemes</replaceable> <type>"text"[]</>)</function></literal>
+ </entry>
+ <entry><type>tsvector</type></entry>
+ <entry>assign <replaceable class="PARAMETER">weight</replaceable> to elements of <replaceable class="PARAMETER">vector</replaceable> that are listed in <replaceable class="PARAMETER">lexemes</replaceable> array</entry>
+ <entry><literal>setweight('fat:2,4 cat:3 rat:5B'::tsvector, 'A', '{cat,rat}')</literal></entry>
+ <entry><literal>'cat':3A 'fat':2,4 'rat':5A</literal></entry>
+ </row>
+ <row>
+ <entry>
+ <indexterm>
<primary>strip</primary>
</indexterm>
<literal><function>strip(<type>tsvector</>)</function></literal>
@@ -9233,6 +9246,80 @@ CREATE TYPE rainbow AS ENUM ('red', 'orange', 'yellow', 'green', 'blue', 'purple
<row>
<entry>
<indexterm>
+ <primary>delete</primary>
+ <secondary>delete lemexeme</secondary>
+ </indexterm>
+ <literal><function>delete(<replaceable class="PARAMETER">vector</replaceable> <type>tsvector</>, <replaceable class="PARAMETER">lexeme</replaceable> <type>text</>)</function></literal>
+ </entry>
+ <entry><type>tsvector</type></entry>
+ <entry>remove given <replaceable class="PARAMETER">lexeme</replaceable> from <replaceable class="PARAMETER">vector</replaceable></entry>
+ <entry><literal>delete('fat:2,4 cat:3 rat:5A'::tsvector, 'fat')</literal></entry>
+ <entry><literal>'cat':3 'rat':5A</literal></entry>
+ </row>
+ <row>
+ <entry>
+ <indexterm>
+ <primary>delete</primary>
+ <secondary>delete lemexemes array</secondary>
+ </indexterm>
+ <literal><function>delete(<replaceable class="PARAMETER">vector</replaceable> <type>tsvector</>, <replaceable class="PARAMETER">lexemes</replaceable> <type>text[]</>)</function></literal>
+ </entry>
+ <entry><type>tsvector</type></entry>
+ <entry>remove any occurrence of lexemes in <replaceable class="PARAMETER">lexemes</replaceable> array from <replaceable class="PARAMETER">vector</replaceable></entry>
+ <entry><literal>delete('fat:2,4 cat:3 rat:5A'::tsvector, ARRAY['fat','rat'])</literal></entry>
+ <entry><literal>'cat':3</literal></entry>
+ </row>
+ <row>
+ <entry>
+ <indexterm>
+ <primary>unnest</primary>
+ </indexterm>
+ <literal><function>unnest(<type>tsvector</>, OUT <replaceable class="PARAMETER">lexeme</> <type>text</>, OUT <replaceable class="PARAMETER">positions</> <type>smallint[]</>, OUT <replaceable class="PARAMETER">weights</> <type>text</>)</function></literal>
+ </entry>
+ <entry><type>setof record</type></entry>
+ <entry>expand a tsvector to a set of rows</entry>
+ <entry><literal>unnest('fat:2,4 cat:3 rat:5A'::tsvector)</literal></entry>
+ <entry><literal>(cat,{3},{D}) ...</literal></entry>
+ </row>
+ <row>
+ <entry>
+ <indexterm>
+ <primary>tsvector_to_array</primary>
+ </indexterm>
+ <literal><function>tsvector_to_array(<type>tsvector</>)</function></literal>
+ </entry>
+ <entry><type>text[]</type></entry>
+ <entry>convert <type>tsvector</> to array of lexemes</entry>
+ <entry><literal>tsvector_to_array('fat:2,4 cat:3 rat:5A'::tsvector)</literal></entry>
+ <entry><literal>{cat,fat,rat}</literal></entry>
+ </row>
+ <row>
+ <entry>
+ <indexterm>
+ <primary>array_to_tsvector</primary>
+ </indexterm>
+ <literal><function>array_to_tsvector(<type>text[]</>)</function></literal>
+ </entry>
+ <entry><type>tsvector</type></entry>
+ <entry>convert array of lexemes to <type>tsvector</type></entry>
+ <entry><literal>array_to_tsvector('{fat,cat,rat}'::text[])</literal></entry>
+ <entry><literal>'fat' 'cat' 'rat'</literal></entry>
+ </row>
+ <row>
+ <entry>
+ <indexterm>
+ <primary>filter</primary>
+ </indexterm>
+ <literal><function>filter(<replaceable class="PARAMETER">vector</replaceable> <type>tsvector</>, <replaceable class="PARAMETER">weights</replaceable> <type>"char"[]</>)</function></literal>
+ </entry>
+ <entry><type>tsvector</type></entry>
+ <entry>Select only elements with given <replaceable class="PARAMETER">weights</replaceable> from <replaceable class="PARAMETER">vector</replaceable></entry>
+ <entry><literal>filter('fat:2,4 cat:3b rat:5A'::tsvector, '{a,b}')</literal></entry>
+ <entry><literal>'cat':3B 'rat':5A</literal></entry>
+ </row>
+ <row>
+ <entry>
+ <indexterm>
<primary>to_tsquery</primary>
</indexterm>
<literal><function>to_tsquery(<optional> <replaceable class="PARAMETER">config</> <type>regconfig</> , </optional> <replaceable class="PARAMETER">query</> <type>text</type>)</function></literal>
diff --git a/doc/src/sgml/textsearch.sgml b/doc/src/sgml/textsearch.sgml
index ff99976..ea3abc9 100644
--- a/doc/src/sgml/textsearch.sgml
+++ b/doc/src/sgml/textsearch.sgml
@@ -1326,6 +1326,10 @@ FROM (SELECT id, body, q, ts_rank_cd(ti, q) AS rank
</variablelist>
+ <para>
+ Full list of <type>tsvector</>-related functions available in <xref linkend="textsearch-functions-table">.
+ </para>
+
</sect2>
<sect2 id="textsearch-manipulate-tsquery">
diff --git a/src/backend/utils/adt/tsvector_op.c b/src/backend/utils/adt/tsvector_op.c
index a3f1c361..77668cb 100644
--- a/src/backend/utils/adt/tsvector_op.c
+++ b/src/backend/utils/adt/tsvector_op.c
@@ -14,6 +14,7 @@
#include "postgres.h"
+#include "access/htup_details.h"
#include "catalog/namespace.h"
#include "catalog/pg_type.h"
#include "commands/trigger.h"
@@ -65,6 +66,7 @@ typedef struct
#define STATHDRSIZE (offsetof(TSVectorStat, data))
static Datum tsvector_update_trigger(PG_FUNCTION_ARGS, bool config_column);
+static int tsvector_bsearch(TSVector tsin, char *lexin, int lexin_len);
/*
* Order: haspos, len, word, for all positions (pos, weight)
@@ -251,6 +253,90 @@ tsvector_setweight(PG_FUNCTION_ARGS)
PG_RETURN_POINTER(out);
}
+/*
+ * setweight(tsin tsvector, char_weight "char", lexemes "text"[])
+ *
+ * Assign weight w to elements of tsin that are listed in lexemes.
+ */
+Datum
+tsvector_setweight_by_filter(PG_FUNCTION_ARGS)
+{
+ TSVector tsin = PG_GETARG_TSVECTOR(0);
+ char char_weight = PG_GETARG_CHAR(1);
+ ArrayType *lexemes = PG_GETARG_ARRAYTYPE_P(2);
+
+ TSVector tsout;
+ int i,
+ j,
+ nlexemes,
+ weight;
+ WordEntry *entry;
+ Datum *dlexemes;
+ bool *nulls;
+
+ switch (char_weight)
+ {
+ case 'A': case 'a':
+ weight = 3;
+ break;
+ case 'B': case 'b':
+ weight = 2;
+ break;
+ case 'C': case 'c':
+ weight = 1;
+ break;
+ case 'D': case 'd':
+ weight = 0;
+ break;
+ default:
+ /* internal error */
+ elog(ERROR, "unrecognized weight: %c", char_weight);
+ }
+
+ tsout = (TSVector) palloc(VARSIZE(tsin));
+ memcpy(tsout, tsin, VARSIZE(tsin));
+ entry = ARRPTR(tsout);
+
+ deconstruct_array(lexemes, TEXTOID, -1, false, 'i',
+ &dlexemes, &nulls, &nlexemes);
+
+ /*
+ * Assuming that lexemes array is significantly shorter than tsvector
+ * we can iterate through lexemes performing binary search
+ * of each lexeme from lexemes in tsvector.
+ */
+ for (i = 0; i < nlexemes; i++)
+ {
+ char *lex;
+ int lex_len,
+ lex_pos;
+
+ if (nulls[i])
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("name or argument lists may not contain nulls")));
+
+ lex = VARDATA(dlexemes[i]);
+ lex_len = VARSIZE_ANY_EXHDR(dlexemes[i]);
+ lex_pos = tsvector_bsearch(tsout, lex, lex_len);
+
+ if (lex_pos >= 0 && (j = POSDATALEN(tsout, entry + lex_pos)) != 0)
+ {
+ WordEntryPos *p = POSDATAPTR(tsout, entry + lex_pos);
+ while (j--)
+ {
+ WEP_SETWEIGHT(*p, weight);
+ p++;
+ }
+ }
+ }
+
+ PG_FREE_IF_COPY(tsin, 0);
+ PG_FREE_IF_COPY(lexemes, 2);
+
+ PG_RETURN_POINTER(tsout);
+}
+
#define compareEntry(pa, a, pb, b) \
tsCompareString((pa) + (a)->pos, (a)->len, \
(pb) + (b)->pos, (b)->len, \
@@ -291,6 +377,483 @@ add_pos(TSVector src, WordEntry *srcptr,
return *clen - startlen;
}
+/*
+ * Perform binary search of given lexeme in TSVector.
+ * Returns lexeme position in TSVector's entry array or -1 if lexeme wasn't
+ * found.
+ */
+static int
+tsvector_bsearch(const TSVector tsv, char *lexeme, int lexeme_len)
+{
+ WordEntry *arrin = ARRPTR(tsv);
+ int StopLow = 0,
+ StopHigh = tsv->size,
+ StopMiddle,
+ cmp;
+
+ while (StopLow < StopHigh)
+ {
+ StopMiddle = (StopLow + StopHigh)/2;
+
+ cmp = tsCompareString(lexeme, lexeme_len,
+ STRPTR(tsv) + arrin[StopMiddle].pos,
+ arrin[StopMiddle].len,
+ false);
+
+ if (cmp < 0)
+ StopHigh = StopMiddle;
+ else if (cmp > 0)
+ StopLow = StopMiddle + 1;
+ else /* found it */
+ return StopMiddle;
+ }
+
+ return -1;
+}
+
+static int
+compareint(const void *va, const void *vb)
+{
+ int32 a = *((const int32 *) va);
+ int32 b = *((const int32 *) vb);
+
+ if (a == b)
+ return 0;
+ return (a > b) ? 1 : -1;
+}
+
+/*
+ * Internal routine to delete lexemes from TSVector by array of offsets.
+ *
+ * int *indices_to_delete -- array of lexeme offsets to delete
+ * int indices_count -- size of that array
+ *
+ * Returns new TSVector without given lexemes along with their positions
+ * and weights.
+ */
+static TSVector
+tsvector_delete_by_indices(TSVector tsv, int *indices_to_delete,
+ int indices_count)
+{
+ TSVector tsout;
+ WordEntry *arrin = ARRPTR(tsv),
+ *arrout;
+ char *data = STRPTR(tsv),
+ *dataout;
+ int i, j, k,
+ curoff;
+
+ /*
+ * Here we overestimates tsout size, since we don't know exact size
+ * occupied by positions and weights. We will set exact size later
+ * after a pass through TSVector.
+ */
+ tsout = (TSVector) palloc0(VARSIZE(tsv));
+ arrout = ARRPTR(tsout);
+ tsout->size = tsv->size - indices_count;
+
+ /* Sort our filter array to simplify membership check later. */
+ if (indices_count > 1)
+ qsort(indices_to_delete, indices_count, sizeof(int), compareint);
+
+ /*
+ * Copy tsv to tsout skipping lexemes that enlisted in indices_to_delete.
+ */
+ curoff = 0;
+ dataout = STRPTR(tsout);
+ for (i = j = k = 0; i < tsv->size; i++)
+ {
+ /*
+ * Here we should check whether current i is present in
+ * indices_to_delete or not. Since indices_to_delete is already
+ * sorted we can advance it index only when we have match.
+ */
+ if (k < indices_count && i == indices_to_delete[k]){
+ k++;
+ continue;
+ }
+
+ /* Copy lexeme, it's positions and weights */
+ memcpy(dataout + curoff, data + arrin[i].pos, arrin[i].len);
+ arrout[j].haspos = arrin[i].haspos;
+ arrout[j].len = arrin[i].len;
+ arrout[j].pos = curoff;
+ curoff += arrin[i].len;
+ if (arrin[i].haspos)
+ {
+ int len = POSDATALEN(tsv, arrin+i) * sizeof(WordEntryPos) +
+ sizeof(uint16);
+ curoff = SHORTALIGN(curoff);
+ memcpy(dataout + curoff,
+ STRPTR(tsv) + SHORTALIGN(arrin[i].pos + arrin[i].len),
+ len);
+ curoff += len;
+ }
+
+ j++;
+ }
+
+ /*
+ * After the pass through TSVector k should equals exactly to indices_count.
+ * If it isn't then the caller provided us with indices outside of
+ * [0, tsv->size) range and estimation of tsout's size is wrong.
+ */
+ Assert(k == indices_count);
+
+ SET_VARSIZE(tsout, CALCDATASIZE(tsout->size, curoff));
+ return tsout;
+}
+
+/*
+ * Delete given lexeme from tsvector.
+ * Implementation of user-level delete(tsvector, text).
+ */
+Datum
+tsvector_delete_str(PG_FUNCTION_ARGS)
+{
+ TSVector tsin = PG_GETARG_TSVECTOR(0),
+ tsout;
+ text *tlexeme = PG_GETARG_TEXT_P(1);
+ char *lexeme = VARDATA(tlexeme);
+ int lexeme_len = VARSIZE_ANY_EXHDR(tlexeme),
+ skip_index;
+
+ if ((skip_index = tsvector_bsearch(tsin, lexeme, lexeme_len)) == -1)
+ PG_RETURN_POINTER(tsin);
+
+ tsout = tsvector_delete_by_indices(tsin, &skip_index, 1);
+
+ PG_FREE_IF_COPY(tsin, 0);
+ PG_FREE_IF_COPY(tlexeme, 1);
+ PG_RETURN_POINTER(tsout);
+}
+
+/*
+ * Delete given array of lexemes from tsvector.
+ * Implementation of user-level delete(tsvector, text[]).
+ */
+Datum
+tsvector_delete_arr(PG_FUNCTION_ARGS)
+{
+ TSVector tsin = PG_GETARG_TSVECTOR(0),
+ tsout;
+ ArrayType *lexemes = PG_GETARG_ARRAYTYPE_P(1);
+ int i, nlex,
+ skip_count,
+ *skip_indices;
+ Datum *dlexemes;
+ bool *nulls;
+
+ deconstruct_array(lexemes, TEXTOID, -1, false, 'i',
+ &dlexemes, &nulls, &nlex);
+
+ /*
+ * In typical use case array of lexemes to delete is relatively small.
+ * So here we optimizing things for that scenario: iterate through lexarr
+ * performing binary search of each lexeme from lexarr in tsvector.
+ */
+ skip_indices = palloc0(nlex * sizeof(int));
+ for (i = skip_count = 0; i < nlex; i++)
+ {
+ char *lex;
+ int lex_len,
+ lex_pos;
+
+ if (nulls[i])
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("name or argument lists may not contain nulls")));
+
+ lex = VARDATA(dlexemes[i]);
+ lex_len = VARSIZE_ANY_EXHDR(dlexemes[i]);
+ lex_pos = tsvector_bsearch(tsin, lex, lex_len);
+
+ if (lex_pos >= 0)
+ skip_indices[skip_count++] = lex_pos;
+ }
+
+ tsout = tsvector_delete_by_indices(tsin, skip_indices, skip_count);
+
+ pfree(skip_indices);
+ PG_FREE_IF_COPY(tsin, 0);
+ PG_FREE_IF_COPY(lexemes, 1);
+
+ PG_RETURN_POINTER(tsout);
+}
+
+/*
+ * Expand tsvector as table with following columns:
+ * lexeme: lexeme text
+ * positions: integer array of lexeme positions
+ * weights: char array of weights corresponding to positions
+ */
+Datum
+tsvector_unnest(PG_FUNCTION_ARGS)
+{
+ FuncCallContext *funcctx;
+ TSVector tsin;
+
+ if (SRF_IS_FIRSTCALL())
+ {
+ MemoryContext oldcontext;
+ TupleDesc tupdesc;
+
+ funcctx = SRF_FIRSTCALL_INIT();
+ oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
+
+ tupdesc = CreateTemplateTupleDesc(3, false);
+ TupleDescInitEntry(tupdesc, (AttrNumber) 1, "lexeme",
+ TEXTOID, -1, 0);
+ TupleDescInitEntry(tupdesc, (AttrNumber) 2, "positions",
+ INT2ARRAYOID, -1, 0);
+ TupleDescInitEntry(tupdesc, (AttrNumber) 3, "weights",
+ TEXTARRAYOID, -1, 0);
+ funcctx->tuple_desc = BlessTupleDesc(tupdesc);
+
+ funcctx->user_fctx = PG_GETARG_TSVECTOR_COPY(0);
+
+ MemoryContextSwitchTo(oldcontext);
+ }
+
+ funcctx = SRF_PERCALL_SETUP();
+ tsin = (TSVector) funcctx->user_fctx;
+
+ if (funcctx->call_cntr < tsin->size)
+ {
+ WordEntry *arrin = ARRPTR(tsin);
+ char *data = STRPTR(tsin);
+ HeapTuple tuple;
+ int j,
+ i = funcctx->call_cntr;
+ bool nulls[] = {false, false, false};
+ Datum values[3];
+
+ values[0] = PointerGetDatum(
+ cstring_to_text_with_len(data + arrin[i].pos, arrin[i].len)
+ );
+
+ if (arrin[i].haspos)
+ {
+ WordEntryPosVector *posv;
+ Datum *positions;
+ Datum *weights;
+ char weight;
+
+ /*
+ * Internally tsvector stores position and weight in the same
+ * uint16 (2 bits for weight, 14 for position). Here we extract that
+ * in two separate arrays.
+ */
+ posv = _POSVECPTR(tsin, arrin + i);
+ positions = palloc(posv->npos * sizeof(Datum));
+ weights = palloc(posv->npos * sizeof(Datum));
+ for (j = 0; j < posv->npos; j++)
+ {
+ positions[j] = Int16GetDatum(WEP_GETPOS(posv->pos[j]));
+ weight = 'D' - WEP_GETWEIGHT(posv->pos[j]);
+ weights[j] = PointerGetDatum(
+ cstring_to_text_with_len(&weight, 1)
+ );
+ }
+
+ values[1] = PointerGetDatum(
+ construct_array(positions, posv->npos, INT2OID, 2, true, 's'));
+ values[2] = PointerGetDatum(
+ construct_array(weights, posv->npos, TEXTOID, -1, false, 'i'));
+ }
+ else
+ {
+ nulls[1] = nulls[2] = true;
+ }
+
+ tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls);
+ SRF_RETURN_NEXT(funcctx, HeapTupleGetDatum(tuple));
+ }
+ else
+ {
+ pfree(tsin);
+ SRF_RETURN_DONE(funcctx);
+ }
+}
+
+/*
+ * Convert tsvector to array of lexemes.
+ */
+Datum
+tsvector_to_array(PG_FUNCTION_ARGS)
+{
+ TSVector tsin = PG_GETARG_TSVECTOR(0);
+ WordEntry *arrin = ARRPTR(tsin);
+ Datum elements[tsin->size];
+ int i;
+ ArrayType *array;
+
+ for (i = 0; i < tsin->size; i++)
+ {
+ elements[i] = PointerGetDatum(
+ cstring_to_text_with_len(STRPTR(tsin) + arrin[i].pos, arrin[i].len)
+ );
+ }
+
+ array = construct_array(elements, tsin->size, TEXTOID, -1, false, 'i');
+ PG_FREE_IF_COPY(tsin, 0);
+ PG_RETURN_POINTER(array);
+}
+
+/*
+ * Build tsvector from array of lexemes.
+ */
+Datum
+array_to_tsvector(PG_FUNCTION_ARGS)
+{
+ ArrayType *v = PG_GETARG_ARRAYTYPE_P(0);
+ TSVector tsout;
+ Datum *dlexemes;
+ WordEntry *arrout;
+ bool *nulls;
+ int nitems,
+ i,
+ tslen,
+ datalen = 0;
+ char *cur;
+
+ deconstruct_array(v, TEXTOID, -1, false, 'i', &dlexemes, &nulls, &nitems);
+
+ for (i = 0; i < nitems; i++)
+ {
+ if (nulls[i])
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("name or argument lists may not contain nulls")));
+
+ datalen += VARSIZE_ANY_EXHDR(dlexemes[i]);
+ }
+
+ tslen = CALCDATASIZE(nitems, datalen);
+ tsout = (TSVector) palloc0(tslen);
+ SET_VARSIZE(tsout, tslen);
+ tsout->size = nitems;
+ arrout = ARRPTR(tsout);
+ cur = STRPTR(tsout);
+
+ for (i = 0; i < nitems; i++)
+ {
+ char *lex = VARDATA(dlexemes[i]);
+ int lex_len = VARSIZE_ANY_EXHDR(dlexemes[i]);
+
+ memcpy(cur, lex, lex_len);
+ arrout[i].haspos = 0;
+ arrout[i].len = lex_len;
+ arrout[i].pos = cur - STRPTR(tsout);
+ cur += lex_len;
+ }
+
+ PG_FREE_IF_COPY(v, 0);
+ PG_RETURN_POINTER(tsout);
+}
+
+/*
+ * Leave only elements with given weights from tsvector.
+ */
+Datum
+tsvector_filter(PG_FUNCTION_ARGS)
+{
+ TSVector tsin = PG_GETARG_TSVECTOR(0),
+ tsout;
+ ArrayType *weights = PG_GETARG_ARRAYTYPE_P(1);
+ WordEntry *arrin = ARRPTR(tsin),
+ *arrout;
+ char *datain = STRPTR(tsin),
+ *dataout;
+ Datum *dweights;
+ bool *nulls;
+ int nweigths;
+ int i, j;
+ char mask = 0,
+ cur_pos = 0;
+
+ deconstruct_array(weights, CHAROID, 1, true, 'c',
+ &dweights, &nulls, &nweigths);
+
+ for (i = 0; i < nweigths; i++)
+ {
+ char char_weight;
+
+ if (nulls[i])
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("name or argument lists may not contain nulls")));
+
+ char_weight = DatumGetChar(dweights[i]);
+ switch (char_weight)
+ {
+ case 'A': case 'a':
+ mask = mask | 8;
+ break;
+ case 'B': case 'b':
+ mask = mask | 4;
+ break;
+ case 'C': case 'c':
+ mask = mask | 2;
+ break;
+ case 'D': case 'd':
+ mask = mask | 1;
+ break;
+ default:
+ /* internal error */
+ elog(ERROR, "unrecognized weight: %c", char_weight);
+ }
+ }
+
+ tsout = (TSVector) palloc0(VARSIZE(tsin));
+ tsout->size = tsin->size;
+ arrout = ARRPTR(tsout);
+ dataout = STRPTR(tsout);
+
+ for (i = j = 0; i < tsin->size; i++)
+ {
+ WordEntryPosVector *posvin,
+ *posvout;
+ int npos = 0;
+ int k;
+
+ if (!arrin[i].haspos)
+ continue;
+
+ posvin = _POSVECPTR(tsin, arrin + i);
+ posvout = (WordEntryPosVector *)
+ (dataout + SHORTALIGN(cur_pos + arrin[i].len));
+
+ for (k = 0; k < posvin->npos; k++)
+ {
+ if (mask & (1 << WEP_GETWEIGHT(posvin->pos[k])))
+ posvout->pos[npos++] = posvin->pos[k];
+ }
+
+ if (!npos) /* no satisfactory positions found, so skip that lexeme */
+ continue;
+
+ arrout[j].haspos = true;
+ arrout[j].len = arrin[i].len;
+ arrout[j].pos = cur_pos;
+
+ memcpy(dataout + cur_pos, datain + arrin[i].pos, arrin[i].len);
+ posvout->npos = npos;
+ cur_pos += SHORTALIGN(arrin[i].len);
+ cur_pos += POSDATALEN(tsout, arrout+j) * sizeof(WordEntryPos) +
+ sizeof(uint16);
+ j++;
+ }
+
+ tsout->size = j;
+ if (dataout != STRPTR(tsout))
+ memmove(STRPTR(tsout), dataout, cur_pos);
+
+ SET_VARSIZE(tsout, CALCDATASIZE(tsout->size, cur_pos));
+
+ PG_FREE_IF_COPY(tsin, 0);
+ PG_RETURN_POINTER(tsout);
+}
Datum
tsvector_concat(PG_FUNCTION_ARGS)
diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h
index 451bad7..5c71bce 100644
--- a/src/include/catalog/pg_proc.h
+++ b/src/include/catalog/pg_proc.h
@@ -4498,8 +4498,22 @@ DESCR("number of lexemes");
DATA(insert OID = 3623 ( strip PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 3614 "3614" _null_ _null_ _null_ _null_ _null_ tsvector_strip _null_ _null_ _null_ ));
DESCR("strip position information");
DATA(insert OID = 3624 ( setweight PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 3614 "3614 18" _null_ _null_ _null_ _null_ _null_ tsvector_setweight _null_ _null_ _null_ ));
-DESCR("set weight of lexeme's entries");
-DATA(insert OID = 3625 ( tsvector_concat PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 3614 "3614 3614" _null_ _null_ _null_ _null_ _null_ tsvector_concat _null_ _null_ _null_ ));
+DESCR("set given weight for whole tsvector");
+DATA(insert OID = 3320 ( setweight PGNSP PGUID 12 1 0 0 0 f f f f t f i s 3 0 3614 "3614 18 1009" _null_ _null_ _null_ _null_ _null_ tsvector_setweight_by_filter _null_ _null_ _null_ ));
+DESCR("set given weight for given lexemes");
+DATA(insert OID = 3625 ( tsvector_concat PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 3614 "3614 3614" _null_ _null_ _null_ _null_ _null_ tsvector_concat _null_ _null_ _null_ ));
+DATA(insert OID = 3321 ( delete PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 3614 "3614 25" _null_ _null_ _null_ _null_ _null_ tsvector_delete_str _null_ _null_ _null_ ));
+DESCR("delete lexeme");
+DATA(insert OID = 3323 ( delete PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 3614 "3614 1009" _null_ _null_ _null_ _null_ _null_ tsvector_delete_arr _null_ _null_ _null_ ));
+DESCR("delete given lexemes");
+DATA(insert OID = 3322 ( unnest PGNSP PGUID 12 1 10 0 0 f f f f t t i s 1 0 2249 "3614" "{3614,25,1005,1009}" "{i,o,o,o}" "{tsvector,lexeme,positions,weights}" _null_ _null_ tsvector_unnest _null_ _null_ _null_ ));
+DESCR("expand tsvector to set of rows");
+DATA(insert OID = 3326 ( tsvector_to_array PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 1009 "3614" _null_ _null_ _null_ _null_ _null_ tsvector_to_array _null_ _null_ _null_ ));
+DESCR("convert to lexeme's array");
+DATA(insert OID = 3327 ( array_to_tsvector PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 3614 "1009" _null_ _null_ _null_ _null_ _null_ array_to_tsvector _null_ _null_ _null_ ));
+DESCR("build tsvector from lexeme's array");
+DATA(insert OID = 3319 ( filter PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 3614 "3614 1002" _null_ _null_ _null_ _null_ _null_ tsvector_filter _null_ _null_ _null_ ));
+DESCR("returns tsvector that contain only postings with given weights");
DATA(insert OID = 3634 ( ts_match_vq PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 16 "3614 3615" _null_ _null_ _null_ _null_ _null_ ts_match_vq _null_ _null_ _null_ ));
DATA(insert OID = 3635 ( ts_match_qv PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 16 "3615 3614" _null_ _null_ _null_ _null_ _null_ ts_match_qv _null_ _null_ _null_ ));
diff --git a/src/include/tsearch/ts_type.h b/src/include/tsearch/ts_type.h
index dc6067a..e70a303 100644
--- a/src/include/tsearch/ts_type.h
+++ b/src/include/tsearch/ts_type.h
@@ -141,7 +141,16 @@ extern Datum tsvector_cmp(PG_FUNCTION_ARGS);
extern Datum tsvector_length(PG_FUNCTION_ARGS);
extern Datum tsvector_strip(PG_FUNCTION_ARGS);
extern Datum tsvector_setweight(PG_FUNCTION_ARGS);
+extern Datum tsvector_setweight_by_filter(PG_FUNCTION_ARGS);
extern Datum tsvector_concat(PG_FUNCTION_ARGS);
+extern Datum tsvector_delete_str(PG_FUNCTION_ARGS);
+extern Datum tsvector_delete_arr(PG_FUNCTION_ARGS);
+extern Datum tsvector_delete_tsvector(PG_FUNCTION_ARGS);
+extern Datum tsvector_unnest(PG_FUNCTION_ARGS);
+extern Datum tsvector_to_array(PG_FUNCTION_ARGS);
+extern Datum array_to_tsvector(PG_FUNCTION_ARGS);
+extern Datum tsvector_filter(PG_FUNCTION_ARGS);
+extern Datum tsvector_shift(PG_FUNCTION_ARGS);
extern Datum tsvector_update_trigger_byid(PG_FUNCTION_ARGS);
extern Datum tsvector_update_trigger_bycolumn(PG_FUNCTION_ARGS);
diff --git a/src/test/regress/expected/tstypes.out b/src/test/regress/expected/tstypes.out
index 6284fb6..dfd4404 100644
--- a/src/test/regress/expected/tstypes.out
+++ b/src/test/regress/expected/tstypes.out
@@ -83,18 +83,6 @@ SELECT 'a:3A b:2a'::tsvector || 'ba:1234 a:1B';
'a':3A,4B 'b':2A 'ba':1237
(1 row)
-SELECT setweight('w:12B w:13* w:12,5,6 a:1,3* a:3 w asd:1dc asd zxc:81,567,222A'::tsvector, 'c');
- setweight
-----------------------------------------------------------
- 'a':1C,3C 'asd':1C 'w':5C,6C,12C,13C 'zxc':81C,222C,567C
-(1 row)
-
-SELECT strip('w:12B w:13* w:12,5,6 a:1,3* a:3 w asd:1dc asd'::tsvector);
- strip
----------------
- 'a' 'asd' 'w'
-(1 row)
-
--Base tsquery test
SELECT '1'::tsquery;
tsquery
@@ -625,3 +613,212 @@ SELECT ts_rank_cd(' a:1 s:2 d g'::tsvector, 'a & s');
0.1
(1 row)
+-- tsvector editing operations
+SELECT strip('w:12B w:13* w:12,5,6 a:1,3* a:3 w asd:1dc asd'::tsvector);
+ strip
+---------------
+ 'a' 'asd' 'w'
+(1 row)
+
+SELECT strip('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);
+ strip
+----------------------------------------------
+ 'base' 'hidden' 'rebel' 'spaceship' 'strike'
+(1 row)
+
+SELECT strip('base hidden rebel spaceship strike'::tsvector);
+ strip
+----------------------------------------------
+ 'base' 'hidden' 'rebel' 'spaceship' 'strike'
+(1 row)
+
+SELECT delete(to_tsvector('english', 'Rebel spaceships, striking from a hidden base'), 'spaceship');
+ delete
+------------------------------------------
+ 'base':7 'hidden':6 'rebel':1 'strike':3
+(1 row)
+
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, 'base');
+ delete
+--------------------------------------------------------------
+ 'hidden':6 'rebel':1 'spaceship':2,33A,34B,35C,36 'strike':3
+(1 row)
+
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, 'bas');
+ delete
+-----------------------------------------------------------------------
+ 'base':7 'hidden':6 'rebel':1 'spaceship':2,33A,34B,35C,36 'strike':3
+(1 row)
+
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, 'bases');
+ delete
+-----------------------------------------------------------------------
+ 'base':7 'hidden':6 'rebel':1 'spaceship':2,33A,34B,35C,36 'strike':3
+(1 row)
+
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, 'spaceship');
+ delete
+------------------------------------------
+ 'base':7 'hidden':6 'rebel':1 'strike':3
+(1 row)
+
+SELECT delete('base hidden rebel spaceship strike'::tsvector, 'spaceship');
+ delete
+----------------------------------
+ 'base' 'hidden' 'rebel' 'strike'
+(1 row)
+
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, ARRAY['spaceship','rebel']);
+ delete
+--------------------------------
+ 'base':7 'hidden':6 'strike':3
+(1 row)
+
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, ARRAY['spaceships','rebel']);
+ delete
+-------------------------------------------------------------
+ 'base':7 'hidden':6 'spaceship':2,33A,34B,35C,36 'strike':3
+(1 row)
+
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, ARRAY['spaceshi','rebel']);
+ delete
+-------------------------------------------------------------
+ 'base':7 'hidden':6 'spaceship':2,33A,34B,35C,36 'strike':3
+(1 row)
+
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, ARRAY['spaceship','leya','rebel']);
+ delete
+--------------------------------
+ 'base':7 'hidden':6 'strike':3
+(1 row)
+
+SELECT delete('base hidden rebel spaceship strike'::tsvector, ARRAY['spaceship','leya','rebel']);
+ delete
+--------------------------
+ 'base' 'hidden' 'strike'
+(1 row)
+
+SELECT delete('base hidden rebel spaceship strike'::tsvector, ARRAY['spaceship','leya','rebel', NULL]);
+ERROR: name or argument lists may not contain nulls
+SELECT unnest('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);
+ unnest
+---------------------------------------------
+ (base,{7},{D})
+ (hidden,{6},{D})
+ (rebel,{1},{D})
+ (spaceship,"{2,33,34,35,36}","{D,A,B,C,D}")
+ (strike,{3},{D})
+(5 rows)
+
+SELECT unnest('base hidden rebel spaceship strike'::tsvector);
+ unnest
+---------------
+ (base,,)
+ (hidden,,)
+ (rebel,,)
+ (spaceship,,)
+ (strike,,)
+(5 rows)
+
+SELECT * FROM unnest('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);
+ lexeme | positions | weights
+-----------+-----------------+-------------
+ base | {7} | {D}
+ hidden | {6} | {D}
+ rebel | {1} | {D}
+ spaceship | {2,33,34,35,36} | {D,A,B,C,D}
+ strike | {3} | {D}
+(5 rows)
+
+SELECT * FROM unnest('base hidden rebel spaceship strike'::tsvector);
+ lexeme | positions | weights
+-----------+-----------+---------
+ base | |
+ hidden | |
+ rebel | |
+ spaceship | |
+ strike | |
+(5 rows)
+
+SELECT lexeme, positions[1] from unnest('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);
+ lexeme | positions
+-----------+-----------
+ base | 7
+ hidden | 6
+ rebel | 1
+ spaceship | 2
+ strike | 3
+(5 rows)
+
+SELECT tsvector_to_array('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);
+ tsvector_to_array
+--------------------------------------
+ {base,hidden,rebel,spaceship,strike}
+(1 row)
+
+SELECT tsvector_to_array('base hidden rebel spaceship strike'::tsvector);
+ tsvector_to_array
+--------------------------------------
+ {base,hidden,rebel,spaceship,strike}
+(1 row)
+
+SELECT array_to_tsvector(ARRAY['base','hidden','rebel','spaceship','strike']);
+ array_to_tsvector
+----------------------------------------------
+ 'base' 'hidden' 'rebel' 'spaceship' 'strike'
+(1 row)
+
+SELECT array_to_tsvector(ARRAY['base','hidden','rebel','spaceship', NULL]);
+ERROR: name or argument lists may not contain nulls
+SELECT setweight('w:12B w:13* w:12,5,6 a:1,3* a:3 w asd:1dc asd zxc:81,567,222A'::tsvector, 'c');
+ setweight
+----------------------------------------------------------
+ 'a':1C,3C 'asd':1C 'w':5C,6C,12C,13C 'zxc':81C,222C,567C
+(1 row)
+
+SELECT setweight('a:1,3A asd:1C w:5,6,12B,13A zxc:81,222A,567'::tsvector, 'c');
+ setweight
+----------------------------------------------------------
+ 'a':1C,3C 'asd':1C 'w':5C,6C,12C,13C 'zxc':81C,222C,567C
+(1 row)
+
+SELECT setweight('a:1,3A asd:1C w:5,6,12B,13A zxc:81,222A,567'::tsvector, 'c', '{a}');
+ setweight
+------------------------------------------------------
+ 'a':1C,3C 'asd':1C 'w':5,6,12B,13A 'zxc':81,222A,567
+(1 row)
+
+SELECT setweight('a:1,3A asd:1C w:5,6,12B,13A zxc:81,222A,567'::tsvector, 'c', '{a}');
+ setweight
+------------------------------------------------------
+ 'a':1C,3C 'asd':1C 'w':5,6,12B,13A 'zxc':81,222A,567
+(1 row)
+
+SELECT setweight('a:1,3A asd:1C w:5,6,12B,13A zxc:81,222A,567'::tsvector, 'c', '{a,zxc}');
+ setweight
+--------------------------------------------------------
+ 'a':1C,3C 'asd':1C 'w':5,6,12B,13A 'zxc':81C,222C,567C
+(1 row)
+
+SELECT setweight('a asd w:5,6,12B,13A zxc'::tsvector, 'c', '{a,zxc}');
+ setweight
+---------------------------------
+ 'a' 'asd' 'w':5,6,12B,13A 'zxc'
+(1 row)
+
+SELECT setweight('a asd w:5,6,12B,13A zxc'::tsvector, 'c', ARRAY['a', 'zxc', NULL]);
+ERROR: name or argument lists may not contain nulls
+SELECT filter('base:7A empir:17 evil:15 first:11 galact:16 hidden:6A rebel:1A spaceship:2A strike:3A victori:12 won:9'::tsvector, '{a}');
+ filter
+-------------------------------------------------------------
+ 'base':7A 'hidden':6A 'rebel':1A 'spaceship':2A 'strike':3A
+(1 row)
+
+SELECT filter('base hidden rebel spaceship strike'::tsvector, '{a}');
+ filter
+--------
+
+(1 row)
+
+SELECT filter('base hidden rebel spaceship strike'::tsvector, '{a,b,NULL}');
+ERROR: name or argument lists may not contain nulls
diff --git a/src/test/regress/sql/tstypes.sql b/src/test/regress/sql/tstypes.sql
index fd7c702..db62c54 100644
--- a/src/test/regress/sql/tstypes.sql
+++ b/src/test/regress/sql/tstypes.sql
@@ -14,8 +14,6 @@ SELECT $$'\\as' ab\c ab\\c AB\\\c ab\\\\c$$::tsvector;
SELECT tsvectorin(tsvectorout($$'\\as' ab\c ab\\c AB\\\c ab\\\\c$$::tsvector));
SELECT '''w'':4A,3B,2C,1D,5 a:8';
SELECT 'a:3A b:2a'::tsvector || 'ba:1234 a:1B';
-SELECT setweight('w:12B w:13* w:12,5,6 a:1,3* a:3 w asd:1dc asd zxc:81,567,222A'::tsvector, 'c');
-SELECT strip('w:12B w:13* w:12,5,6 a:1,3* a:3 w asd:1dc asd'::tsvector);
--Base tsquery test
SELECT '1'::tsquery;
@@ -115,3 +113,48 @@ SELECT ts_rank_cd(' a:1 s:2 d g'::tsvector, 'a | s');
SELECT ts_rank_cd(' a:1 s:2C d g'::tsvector, 'a & s');
SELECT ts_rank_cd(' a:1 s:2B d g'::tsvector, 'a & s');
SELECT ts_rank_cd(' a:1 s:2 d g'::tsvector, 'a & s');
+
+-- tsvector editing operations
+
+SELECT strip('w:12B w:13* w:12,5,6 a:1,3* a:3 w asd:1dc asd'::tsvector);
+SELECT strip('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);
+SELECT strip('base hidden rebel spaceship strike'::tsvector);
+
+SELECT delete(to_tsvector('english', 'Rebel spaceships, striking from a hidden base'), 'spaceship');
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, 'base');
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, 'bas');
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, 'bases');
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, 'spaceship');
+SELECT delete('base hidden rebel spaceship strike'::tsvector, 'spaceship');
+
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, ARRAY['spaceship','rebel']);
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, ARRAY['spaceships','rebel']);
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, ARRAY['spaceshi','rebel']);
+SELECT delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, ARRAY['spaceship','leya','rebel']);
+SELECT delete('base hidden rebel spaceship strike'::tsvector, ARRAY['spaceship','leya','rebel']);
+SELECT delete('base hidden rebel spaceship strike'::tsvector, ARRAY['spaceship','leya','rebel', NULL]);
+
+SELECT unnest('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);
+SELECT unnest('base hidden rebel spaceship strike'::tsvector);
+SELECT * FROM unnest('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);
+SELECT * FROM unnest('base hidden rebel spaceship strike'::tsvector);
+SELECT lexeme, positions[1] from unnest('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);
+
+SELECT tsvector_to_array('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);
+SELECT tsvector_to_array('base hidden rebel spaceship strike'::tsvector);
+
+SELECT array_to_tsvector(ARRAY['base','hidden','rebel','spaceship','strike']);
+SELECT array_to_tsvector(ARRAY['base','hidden','rebel','spaceship', NULL]);
+
+SELECT setweight('w:12B w:13* w:12,5,6 a:1,3* a:3 w asd:1dc asd zxc:81,567,222A'::tsvector, 'c');
+SELECT setweight('a:1,3A asd:1C w:5,6,12B,13A zxc:81,222A,567'::tsvector, 'c');
+SELECT setweight('a:1,3A asd:1C w:5,6,12B,13A zxc:81,222A,567'::tsvector, 'c', '{a}');
+SELECT setweight('a:1,3A asd:1C w:5,6,12B,13A zxc:81,222A,567'::tsvector, 'c', '{a}');
+SELECT setweight('a:1,3A asd:1C w:5,6,12B,13A zxc:81,222A,567'::tsvector, 'c', '{a,zxc}');
+SELECT setweight('a asd w:5,6,12B,13A zxc'::tsvector, 'c', '{a,zxc}');
+SELECT setweight('a asd w:5,6,12B,13A zxc'::tsvector, 'c', ARRAY['a', 'zxc', NULL]);
+
+SELECT filter('base:7A empir:17 evil:15 first:11 galact:16 hidden:6A rebel:1A spaceship:2A strike:3A victori:12 won:9'::tsvector, '{a}');
+SELECT filter('base hidden rebel spaceship strike'::tsvector, '{a}');
+SELECT filter('base hidden rebel spaceship strike'::tsvector, '{a,b,NULL}');
+
On 11 Mar 2016, at 16:13, Stas Kelvich <s.kelvich@postgrespro.ru> wrote:
On 10 Mar 2016, at 20:29, Teodor Sigaev <teodor@sigaev.ru> wrote:
I would like to suggest rename both functions to array_to_tsvector and tsvector_to_array to have consistent name. Later we could add to_tsvector([regconfig, ], text[]) with morphological processing.
Thoughts?
Hi, thanks for commit.
I saw errors on windows, here is the fix:
Attachments:
tsvector.fix.patchapplication/octet-stream; name=tsvector.fix.patchDownload
diff --git a/src/backend/utils/adt/tsvector_op.c b/src/backend/utils/adt/tsvector_op.c
index 6a01276..97df9cc 100644
--- a/src/backend/utils/adt/tsvector_op.c
+++ b/src/backend/utils/adt/tsvector_op.c
@@ -66,7 +66,7 @@ typedef struct
#define STATHDRSIZE (offsetof(TSVectorStat, data))
static Datum tsvector_update_trigger(PG_FUNCTION_ARGS, bool config_column);
-static int tsvector_bsearch(TSVector tsin, char *lexin, int lexin_len);
+static int tsvector_bsearch(const TSVector tsv, char *lexeme, int lexeme_len);
/*
* Order: haspos, len, word, for all positions (pos, weight)
@@ -684,10 +684,12 @@ tsvector_to_array(PG_FUNCTION_ARGS)
{
TSVector tsin = PG_GETARG_TSVECTOR(0);
WordEntry *arrin = ARRPTR(tsin);
- Datum elements[tsin->size];
+ Datum *elements;
int i;
ArrayType *array;
+ elements = palloc(tsin->size * sizeof(Datum));
+
for (i = 0; i < tsin->size; i++)
{
elements[i] = PointerGetDatum(
@@ -696,6 +698,7 @@ tsvector_to_array(PG_FUNCTION_ARGS)
}
array = construct_array(elements, tsin->size, TEXTOID, -1, false, 'i');
+ pfree(elements);
PG_FREE_IF_COPY(tsin, 0);
PG_RETURN_POINTER(array);
}
I saw errors on windows, here is the fix:
Thank you, pushed
--
Teodor Sigaev E-mail: teodor@sigaev.ru
WWW: http://www.sigaev.ru/
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers