phrase search
I have attached a patch for phrase search with respect to the cvs head.
Basically it takes a a phrase (text) and a TSVector. It checks if the
relative positions of lexeme in the phrase are same as in their
positions in TSVector.
If the configuration for text search is "simple", then this will produce
exact phrase search. Otherwise the stopwords in a phrase will be ignored
and the words in a phrase will only be matched with the stemmed lexeme.
For my application I am using this as a separate shared object. I do not
know how to expose this function from the core. Can someone explain how
to do this?
I saw this discussion on phrase search and I am not sure what other
functionality is required.
http://archives.postgresql.org/pgsql-general/2008-02/msg01170.php
-Sushant.
Attachments:
phrase_search.patchtext/x-patch; charset=UTF-8; name=phrase_search.patchDownload
Index: src/backend/utils/adt/Makefile
===================================================================
RCS file: /home/sushant/devel/pgsql-cvs/pgsql/src/backend/utils/adt/Makefile,v
retrieving revision 1.69
diff -u -r1.69 Makefile
--- src/backend/utils/adt/Makefile 19 Feb 2008 10:30:08 -0000 1.69
+++ src/backend/utils/adt/Makefile 31 May 2008 19:57:34 -0000
@@ -29,7 +29,7 @@
tsginidx.o tsgistidx.o tsquery.o tsquery_cleanup.o tsquery_gist.o \
tsquery_op.o tsquery_rewrite.o tsquery_util.o tsrank.o \
tsvector.o tsvector_op.o tsvector_parser.o \
- txid.o uuid.o xml.o
+ txid.o uuid.o xml.o phrase_search.o
like.o: like.c like_match.c
Index: src/backend/utils/adt/phrase_search.c
===================================================================
RCS file: src/backend/utils/adt/phrase_search.c
diff -N src/backend/utils/adt/phrase_search.c
--- /dev/null 1 Jan 1970 00:00:00 -0000
+++ src/backend/utils/adt/phrase_search.c 31 May 2008 19:56:59 -0000
@@ -0,0 +1,167 @@
+#include "postgres.h"
+
+#include "tsearch/ts_type.h"
+#include "tsearch/ts_utils.h"
+
+#include "fmgr.h"
+
+#ifdef PG_MODULE_MAGIC
+PG_MODULE_MAGIC;
+#endif
+
+PG_FUNCTION_INFO_V1(is_phrase_present);
+Datum is_phrase_present(PG_FUNCTION_ARGS);
+
+typedef struct {
+ WordEntryPosVector *posVector;
+ int4 posInPhrase;
+ int4 curpos;
+} PhraseInfo;
+
+static int
+WordCompareVectorEntry(char *eval, WordEntry *ptr, ParsedWord *prsdword)
+{
+ if (ptr->len == prsdword->len)
+ return strncmp(
+ eval + ptr->pos,
+ prsdword->word,
+ prsdword->len);
+
+ return (ptr->len > prsdword->len) ? 1 : -1;
+}
+
+/*
+ * Returns a pointer to a WordEntry from tsvector t corresponding to prsdword.
+ * Returns NULL if not found.
+ */
+static WordEntry *
+find_wordentry_prsdword(TSVector t, ParsedWord *prsdword)
+{
+ WordEntry *StopLow = ARRPTR(t);
+ WordEntry *StopHigh = (WordEntry *) STRPTR(t);
+ WordEntry *StopMiddle;
+ int difference;
+
+ /* Loop invariant: StopLow <= item < StopHigh */
+
+ while (StopLow < StopHigh)
+ {
+ StopMiddle = StopLow + (StopHigh - StopLow) / 2;
+ difference = WordCompareVectorEntry(STRPTR(t), StopMiddle, prsdword);
+ if (difference == 0)
+ return StopMiddle;
+ else if (difference < 0)
+ StopLow = StopMiddle + 1;
+ else
+ StopHigh = StopMiddle;
+ }
+
+ return NULL;
+}
+
+
+static int4
+check_and_advance(int4 i, PhraseInfo *phraseInfo)
+{
+ WordEntryPosVector *posvector1, *posvector2;
+ int4 diff;
+
+ posvector1 = phraseInfo[i].posVector;
+ posvector2 = phraseInfo[i+1].posVector;
+
+ diff = phraseInfo[i+1].posInPhrase - phraseInfo[i].posInPhrase;
+ while (posvector2->pos[phraseInfo[i+1].curpos] - posvector1->pos[phraseInfo[i].curpos] < diff)
+ if (phraseInfo[i+1].curpos >= posvector2->npos - 1)
+ return 2;
+ else
+ phraseInfo[i+1].curpos += 1;
+
+ if (posvector2->pos[phraseInfo[i+1].curpos] - posvector1->pos[phraseInfo[i].curpos] == diff)
+ return 1;
+ else
+ return 0;
+}
+
+int4
+initialize_phraseinfo(ParsedText *prs, TSVector t, PhraseInfo *phraseInfo)
+{
+ WordEntry *entry;
+ int4 i;
+
+ for (i = 0; i < prs->curwords; i++)
+ {
+ phraseInfo[i].posInPhrase = prs->words[i].pos.pos;
+ entry = find_wordentry_prsdword(t, &(prs->words[i]));
+ if (entry == NULL)
+ return 0;
+ else
+ phraseInfo[i].posVector = _POSVECPTR(t, entry);
+ }
+ return 1;
+}
+Datum
+is_phrase_present(PG_FUNCTION_ARGS)
+{
+ ParsedText prs;
+ int4 numwords, i, retval, found = 0;
+ PhraseInfo *phraseInfo;
+ text *phrase = PG_GETARG_TEXT_P(0);
+ TSVector t = PG_GETARG_TSVECTOR(1);
+ Oid cfgId = getTSCurrentConfig(true);
+
+ prs.lenwords = (VARSIZE(phrase) - VARHDRSZ) / 6; /* just estimation of * word's number */
+ if (prs.lenwords == 0)
+ prs.lenwords = 2;
+ prs.curwords = 0;
+ prs.pos = 0;
+ prs.words = (ParsedWord *) palloc0(sizeof(ParsedWord) * prs.lenwords);
+
+ parsetext(cfgId, &prs, VARDATA(phrase), VARSIZE(phrase) - VARHDRSZ);
+
+ // allocate & initialize
+ numwords = prs.curwords;
+ phraseInfo = palloc0(numwords * sizeof(PhraseInfo));
+
+
+ if (numwords > 0 && initialize_phraseinfo(&prs, t, phraseInfo))
+ {
+ // if there is only one word and we are able to initialize
+ // the phraseInfo then it is a success
+ if (numwords == 1)
+ found = 1;
+ while (!found)
+ {
+ retval = check_and_advance(0, phraseInfo);
+ if (retval == 2)
+ break;
+ else if (retval == 1)
+ {
+ for (i = 1; i < numwords - 1; i++)
+ {
+ retval = check_and_advance(i, phraseInfo);
+ if (retval == 2 || retval == 0)
+ break;
+ }
+ if (i >= numwords - 1)
+ // found a phrase
+ found = 1;
+ else if (retval == 2)
+ // no chance of finding a phrase
+ break;
+ }
+ // now move the pos of 0th phrase
+ if (phraseInfo[0].curpos >= phraseInfo[0].posVector->npos- 1)
+ // no chance of finding a phrase
+ break;
+ else
+ phraseInfo[0].curpos += 1;
+ }
+ }
+ pfree(phraseInfo);
+ pfree(prs.words);
+ // prepare to leave
+ PG_FREE_IF_COPY(phrase, 0);
+ PG_FREE_IF_COPY(t, 1);
+ // return
+ PG_RETURN_INT32(found);
+}
I have attached a patch for phrase search with respect to the cvs head.
Basically it takes a a phrase (text) and a TSVector. It checks if the
relative positions of lexeme in the phrase are same as in their
positions in TSVector.
Ideally, phrase search should be implemented as new operator in tsquery, say #
with optional distance. So, tsquery 'foo #2 bar' means: find all texts where
'bar' is place no far than two word from 'foo'. The complexity is about complex
boolean expressions ( 'foo #1 ( bar1 & bar2 )' ) and about several languages as
norwegian or german. German language has combining words, like a footboolbar -
and they have several variants of splitting, so result of to_tsquery('foo #
footboolbar') will be a 'foo # ( ( football & bar ) | ( foot & ball & bar ) )'
where variants are connected with OR operation.
Of course, phrase search should be able to use indexes.
If the configuration for text search is "simple", then this will produce
exact phrase search. Otherwise the stopwords in a phrase will be ignored
and the words in a phrase will only be matched with the stemmed lexeme.
Your solution can't be used as is, because user should use tsquery too to use an
index:
column @@ to_tsquery('phrase search') AND is_phrase_present('phrase search',
column)
First clause will be used for index scan and it will fast search a candidates.
For my application I am using this as a separate shared object. I do not
know how to expose this function from the core. Can someone explain how
to do this?
Look at contrib/ directory in pgsql's source code - make a contrib module from
your patch. As an example, look at adminpack module - it's rather simple.
Comments of your code:
1)
+#ifdef PG_MODULE_MAGIC
+PG_MODULE_MAGIC;
+#endif
That isn't needed for compiled-in in core files, it's only needed for modules.
2)
use only /**/ comments, do not use a // (C++ style) comments
--
Teodor Sigaev E-mail: teodor@sigaev.ru
WWW: http://www.sigaev.ru/
On Mon, 2008-06-02 at 19:39 +0400, Teodor Sigaev wrote:
I have attached a patch for phrase search with respect to the cvs head.
Basically it takes a a phrase (text) and a TSVector. It checks if the
relative positions of lexeme in the phrase are same as in their
positions in TSVector.Ideally, phrase search should be implemented as new operator in tsquery, say #
with optional distance. So, tsquery 'foo #2 bar' means: find all texts where
'bar' is place no far than two word from 'foo'. The complexity is about complex
boolean expressions ( 'foo #1 ( bar1 & bar2 )' ) and about several languages as
norwegian or german. German language has combining words, like a footboolbar -
and they have several variants of splitting, so result of to_tsquery('foo #
footboolbar') will be a 'foo # ( ( football & bar ) | ( foot & ball & bar ) )'
where variants are connected with OR operation.
This is far more complicated than I thought.
Of course, phrase search should be able to use indexes.
I can probably look into how to use index. Any pointers on this?
If the configuration for text search is "simple", then this will produce
exact phrase search. Otherwise the stopwords in a phrase will be ignored
and the words in a phrase will only be matched with the stemmed lexeme.Your solution can't be used as is, because user should use tsquery too to use an
index:column @@ to_tsquery('phrase search') AND is_phrase_present('phrase search',
column)First clause will be used for index scan and it will fast search a candidates.
Yes this is exactly how I am using in my application. Do you think this
will solve a lot of common case or we should try to get phrase search
1. Use index
2. Support arbitrary distance between lexemes
3. Support complex boolean queries
-Sushant.
Show quoted text
For my application I am using this as a separate shared object. I do not
know how to expose this function from the core. Can someone explain how
to do this?Look at contrib/ directory in pgsql's source code - make a contrib module from
your patch. As an example, look at adminpack module - it's rather simple.Comments of your code: 1) +#ifdef PG_MODULE_MAGIC +PG_MODULE_MAGIC; +#endifThat isn't needed for compiled-in in core files, it's only needed for modules.
2)
use only /**/ comments, do not use a // (C++ style) comments
This is far more complicated than I thought.
Of course, phrase search should be able to use indexes.
I can probably look into how to use index. Any pointers on this?
src/backend/utils/adt/tsginidx.c, if you invent operation # in tsquery then you
will have index support with minimal effort.
Yes this is exactly how I am using in my application. Do you think this
will solve a lot of common case or we should try to get phrase search
Yeah, it solves a lot of useful case, for simple use it's needed to invent
function similar to existsing plaitnto_tsquery, say phraseto_tsquery. It should
produce correct tsquery with described above operations.
--
Teodor Sigaev E-mail: teodor@sigaev.ru
WWW: http://www.sigaev.ru/
On Tue, 2008-06-03 at 22:16 +0400, Teodor Sigaev wrote:
This is far more complicated than I thought.
Of course, phrase search should be able to use indexes.
I can probably look into how to use index. Any pointers on this?
src/backend/utils/adt/tsginidx.c, if you invent operation # in tsquery then you
will have index support with minimal effort.Yes this is exactly how I am using in my application. Do you think this
will solve a lot of common case or we should try to get phrase searchYeah, it solves a lot of useful case, for simple use it's needed to invent
function similar to existsing plaitnto_tsquery, say phraseto_tsquery. It should
produce correct tsquery with described above operations.
I can add index support and support for arbitrary distance between
lexeme.
It appears to me that supporting arbitrary boolean expression will be
complicated. Can we pull out something from TSQuery?
-Sushant.
I can add index support and support for arbitrary distance between
lexeme.
It appears to me that supporting arbitrary boolean expression will be
complicated. Can we pull out something from TSQuery?
I don't very like an idea to have separated interface for phrase search. Your
patch may be a module and used by people who really wants to have a phrase search.
Introducing new operator in tsquery allows to use already existing
infrastructure of tsquery such as concatenations (&&, ||, !!), rewrite subsystem
etc. But new operation/types specially designed for phrase search makes needing
to make that work again.
--
Teodor Sigaev E-mail: teodor@sigaev.ru
WWW: http://www.sigaev.ru/
I looked at query operators for tsquery and here are some of the new
query operators for position based queries. I am just proposing some
changes and the questions I have.
1. What is the meaning of such a query operator?
foo #5 bar -> true if the document has word "foo" followed by "bar" at
5th position.
foo #<5 bar -> true if document has word "foo" followed by "bar" with in
5 positions
foo #>5 bar -> true if document has word "foo" followed by "bar" after 5
positions
then some other ways it can be used are
!(foo #<5 bar) -> true if document never has any "foo" followed by bar
with in 5 positions.
etc .....
2. How to implement such query operators?
Should we modify QueryItem to include additional distance information or
is there any other way to accomplish it?
Is the following list sufficient to accomplish this?
a. Modify to_tsquery
b. Modify TS_execute in tsvector_op.c to check new operator
Is there anything needed in rewrite subsystem?
3. Are these valid uses of the operators and if yes what would they
mean?
foo #5 (bar & cup)
If no then should the operator be applied to only two QI_VAL's?
4. If the operator only applies to two query items can we create an
index such that (foo, bar)-> documents[min distance, max distance]
How difficult it is to implement an index like this?
Thanks,
-Sushant.
Show quoted text
On Thu, 2008-06-05 at 19:37 +0400, Teodor Sigaev wrote:
I can add index support and support for arbitrary distance between
lexeme.
It appears to me that supporting arbitrary boolean expression will be
complicated. Can we pull out something from TSQuery?I don't very like an idea to have separated interface for phrase search. Your
patch may be a module and used by people who really wants to have a phrase search.Introducing new operator in tsquery allows to use already existing
infrastructure of tsquery such as concatenations (&&, ||, !!), rewrite subsystem
etc. But new operation/types specially designed for phrase search makes needing
to make that work again.
Sushant,
the problem of phrase search not in implementation, but in the theoretical
basis. tsearch is query rich and phrase search should support all query
operations, so we need algebra for query operations. We need more time
to investigate this problem, but just have no spare time for this.
If you are interesting, you might think in this direction.
Oleg
On Sat, 19 Jul 2008, Sushant Sinha wrote:
I looked at query operators for tsquery and here are some of the new
query operators for position based queries. I am just proposing some
changes and the questions I have.1. What is the meaning of such a query operator?
foo #5 bar -> true if the document has word "foo" followed by "bar" at
5th position.foo #<5 bar -> true if document has word "foo" followed by "bar" with in
5 positionsfoo #>5 bar -> true if document has word "foo" followed by "bar" after 5
positionsthen some other ways it can be used are
!(foo #<5 bar) -> true if document never has any "foo" followed by bar
with in 5 positions.etc .....
2. How to implement such query operators?
Should we modify QueryItem to include additional distance information or
is there any other way to accomplish it?Is the following list sufficient to accomplish this?
a. Modify to_tsquery
b. Modify TS_execute in tsvector_op.c to check new operatorIs there anything needed in rewrite subsystem?
3. Are these valid uses of the operators and if yes what would they
mean?foo #5 (bar & cup)
If no then should the operator be applied to only two QI_VAL's?
4. If the operator only applies to two query items can we create an
index such that (foo, bar)-> documents[min distance, max distance]
How difficult it is to implement an index like this?Thanks,
-Sushant.On Thu, 2008-06-05 at 19:37 +0400, Teodor Sigaev wrote:
I can add index support and support for arbitrary distance between
lexeme.
It appears to me that supporting arbitrary boolean expression will be
complicated. Can we pull out something from TSQuery?I don't very like an idea to have separated interface for phrase search. Your
patch may be a module and used by people who really wants to have a phrase search.Introducing new operator in tsquery allows to use already existing
infrastructure of tsquery such as concatenations (&&, ||, !!), rewrite subsystem
etc. But new operation/types specially designed for phrase search makes needing
to make that work again.
Regards,
Oleg
_____________________________________________________________
Oleg Bartunov, Research Scientist, Head of AstroNet (www.astronet.ru),
Sternberg Astronomical Institute, Moscow University, Russia
Internet: oleg@sai.msu.su, http://www.sai.msu.su/~megera/
phone: +007(495)939-16-83, +007(495)939-23-83
1. What is the meaning of such a query operator?
foo #5 bar -> true if the document has word "foo" followed by "bar" at
5th position.foo #<5 bar -> true if document has word "foo" followed by "bar" with in
5 positionsfoo #>5 bar -> true if document has word "foo" followed by "bar" after 5
positions
Sounds good, but, may be it's an overkill.
etc .....
2. How to implement such query operators?
Should we modify QueryItem to include additional distance information or
is there any other way to accomplish it?Is the following list sufficient to accomplish this?
a. Modify to_tsquery
b. Modify TS_execute in tsvector_op.c to check new operator
Exactly
Is there anything needed in rewrite subsystem?
Yes, of course - rewrite system should support that operation.
3. Are these valid uses of the operators and if yes what would they
mean?foo #5 (bar & cup)
It must support! Because of lexize might return subtsquery. For example,
russian ispell can return several lexemes: "adfg" can become a 'adf | adfs |
ad', norwegian and german languages are more complicated: "abc" -> " (ab & c) |
(a & bc) | abc"
4. If the operator only applies to two query items can we create an
index such that (foo, bar)-> documents[min distance, max distance]
How difficult it is to implement an index like this?
No, index should execute query 'foo & bar' and mark recheck flag to true to
execute 'foo #<5 bar' on original tsvector from table.
--
Teodor Sigaev E-mail: teodor@sigaev.ru
WWW: http://www.sigaev.ru/