faster ts_headline

Started by Marcin Mańk about 13 years ago · 1 message

marcin.mank@gmail.com

about 13 years ago

2 attachment(s)

Hello,
I've started implementing a system for faster headline generation. WIP
patch is attached.

The idea is to make a new type currently called hltext (different
names welcome), that stores the text along with the lexization result.
It conceptually stores an array of tuples like
(word text, type int, lexemes text[] )

A console log is also attached - it shows a 5x performance increase. The
problem is not academic: I have such long texts in an app that making 20
headlines takes 3s+.

The patch lacks documentation, regression tests, and most auxiliary
functions (especially I/O functions).

I have a question about the I/O functions of the new type. What format
to choose?

I could make the input function read something like 'english: the
text' where english is the name of the text search configuration. The
input function would do the lexizing.

I could make it read some custom format, which would contain the
tokens, token types and lexemes. Can I use flex/bison, or is there a
good reason not to, and I should make it a hand-made parser?

Finally, I could make the type actually "create type
hltext_element(word text, type int, lexemes text[] )", by manually
filling in the applicable catalogs, and make the user make columns as
hltext_element[]. Is there a nice way to manipulate objects of such a
type from within the backend? Is there an example? I suppose that in
this case storage would not be as efficient as I made it.

Which one should I choose? Other ideas?

Regards
Marcin Mańk

Attachments:

log.txt (text/plain; charset=US-ASCII; name=log.txt) Download

hltext.patch (application/octet-stream; name=hltext.patch) Download

diff --git a/src/backend/tsearch/to_tsany.c b/src/backend/tsearch/to_tsany.c
new file mode 100644
index 7a4fa93..a280d33
*** a/src/backend/tsearch/to_tsany.c
--- b/src/backend/tsearch/to_tsany.c
*************** plainto_tsquery(PG_FUNCTION_ARGS)
*** 432,434 ****
--- 432,460 ----
  										ObjectIdGetDatum(cfgId),
  										PointerGetDatum(in)));
  }
+ 
+ /*
+  * hltext: to_hltext(regconfig, text) pre-parses text for faster headlines
+  */
+ Datum
+ to_hltext_byid(PG_FUNCTION_ARGS)
+ {
+ 	Oid			cfgId = PG_GETARG_OID(0);
+ 	text	   *in = PG_GETARG_TEXT_P(1);
+ 	HLText		out = make_hltext(cfgId, VARDATA(in), VARSIZE(in) - VARHDRSZ);
+ 
+ 	PG_FREE_IF_COPY(in, 1);		/* make_hltext copied the strings it keeps */
+ 	PG_RETURN_POINTER(out);
+ }
+ 
+ Datum 
+ hltextin(PG_FUNCTION_ARGS)
+ {
+ 	elog(ERROR, "hltextin not implemented");	/* placeholder: always raises an error */
+ }
+ 
+ Datum 
+ hltextout(PG_FUNCTION_ARGS)
+ {
+ 	elog(ERROR, "hltextout not implemented");	/* placeholder: always raises an error */
+ }
\ No newline at end of file
diff --git a/src/backend/tsearch/ts_parse.c b/src/backend/tsearch/ts_parse.c
new file mode 100644
index ae59c34..49e69e2
*** a/src/backend/tsearch/ts_parse.c
--- b/src/backend/tsearch/ts_parse.c
***************
*** 16,21 ****
--- 16,22 ----
  
  #include "tsearch/ts_cache.h"
  #include "tsearch/ts_utils.h"
+ #include <assert.h>
  
  #define IGNORE_LONGLEXEME	1
  
*************** hlparsetext(Oid cfgId, HeadlineParsedTex
*** 554,590 ****
  
  		if (type > 0 && lenlemm >= MAXSTRLEN)
  		{
! #ifdef IGNORE_LONGLEXEME
! 			ereport(NOTICE,
! 					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
! 					 errmsg("word is too long to be indexed"),
! 					 errdetail("Words longer than %d characters are ignored.",
! 							   MAXSTRLEN)));
  			continue;
- #else
- 			ereport(ERROR,
- 					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
- 					 errmsg("word is too long to be indexed"),
- 					 errdetail("Words longer than %d characters are ignored.",
- 							   MAXSTRLEN)));
- #endif
  		}
  
  		LexizeAddLemm(&ldata, type, lemm, lenlemm);
  
  		do
  		{
! 			if ((norms = LexizeExec(&ldata, &lexs)) != NULL)
! 				addHLParsedLex(prs, query, lexs, norms);
! 			else
! 				addHLParsedLex(prs, query, lexs, NULL);
  		} while (norms);
  
  	} while (type > 0);
  
  	FunctionCall1(&(prsobj->prsend), PointerGetDatum(prsdata));
  }
  
  text *
  generateHeadline(HeadlineParsedText *prs)
  {
--- 555,730 ----
  
  		if (type > 0 && lenlemm >= MAXSTRLEN)
  		{
! 			/* word is too long for the lexer */
! 			hladdword(prs, lemm, lenlemm, type);
  			continue;
  		}
  
  		LexizeAddLemm(&ldata, type, lemm, lenlemm);
  
  		do
  		{
! 			norms = LexizeExec(&ldata, &lexs);
! 			addHLParsedLex(prs, query, lexs, norms);
! 		} while (norms);
! 
! 	} while (type > 0);
! 
! 	FunctionCall1(&(prsobj->prsend), PointerGetDatum(prsdata));
! }
! 
! /* Copy word plus a terminating NUL into *words at *word_pos, doubling the buffer until it fits. */
! static void append_word(char **words, int *word_pos, int *buffer_size, char *word, int word_size)
! {
! 	int			end = *word_pos + word_size + 1;	/* offset just past the NUL */
! 
! 	while (end > *buffer_size)
! 		*words = repalloc(*words, *buffer_size *= 2);
! 	memcpy(*words + *word_pos, word, word_size);
! 	(*words)[end - 1] = '\0';
! 	*word_pos = end;
! }
! 
! /* Fill prs from hlt: every token is followed by its nnorms lexemes, all NUL-terminated. */
! void
! hlparse_hltext(HeadlineParsedText *prs, TSQuery query, HLText hlt)
! {
! 	char	   *cur = (char *) (hlt->entries + hlt->nentries);	/* packed strings */
! 	int			entry;
! 
! 	for (entry = 0; entry < hlt->nentries; entry++)
! 	{
! 		int			remaining = hlt->entries[entry].nnorms;
! 		int			toklen = strlen(cur);
! 
! 		hladdword(prs, cur, toklen, hlt->entries[entry].type);
! 		cur += toklen + 1;
! 		while (remaining-- > 0)
! 		{
! 			int			normlen = strlen(cur);
! 
! 			hlfinditem(prs, query, cur, normlen);
! 			cur += normlen + 1;
! 		}
! 	}
! 	if (cur != (char *) hlt + VARSIZE(hlt))		/* must stop exactly at the datum's end */
! 		elog(ERROR, "corrupted hltext");
! }
! 
! /* Parse buf under configuration cfgId into a new HLText: entries[] followed by packed strings. */
! HLText make_hltext(Oid cfgId, char *buf, int buflen)
! {
! 	int			type,
! 				lenlemm;
! 	char	   *lemm = NULL;
! 	LexizeData	ldata;
! 	TSLexeme   *norms;
! 	ParsedLex  *lexs;
! 	TSConfigCacheEntry *cfg;
! 	TSParserCacheEntry *prsobj;
! 	void	   *prsdata;
! 	/* entries grow inside ret; strings collect in words and are appended to ret at the end */
! 	HLText ret = palloc0(sizeof(HLTextData));
! 	int entries_sz = 1;
! 	int words_sz = 64;
! 	char *words = palloc(words_sz);
! 	int word_pos = 0;
! 	int total_size;
! 	
! 	cfg = lookup_ts_config_cache(cfgId);
! 	prsobj = lookup_ts_parser_cache(cfg->prsId);
! 
! 	prsdata = (void *) DatumGetPointer(FunctionCall2(&(prsobj->prsstart),
! 													 PointerGetDatum(buf),
! 													 Int32GetDatum(buflen)));
! 
! 	LexizeInit(&ldata, cfg);
! 	
! 	/* fetch tokens until the parser returns a type that is not positive */
! 	do
! 	{
! 		type = DatumGetInt32(FunctionCall3(&(prsobj->prstoken),
! 										   PointerGetDatum(prsdata),
! 										   PointerGetDatum(&lemm),
! 										   PointerGetDatum(&lenlemm)));
! 
! 		if (type > 0 && lenlemm >= MAXSTRLEN)	/* too long: stored as an entry without lexemes */
! 		{
! 			append_word(&words, &word_pos, &words_sz, lemm, lenlemm);
! 			while(ret->nentries+1 >= entries_sz){	/* double the entry capacity */
! 				entries_sz *= 2;
! 				ret = repalloc(ret, sizeof(HLTextData) + sizeof(HLTextEntry) * (entries_sz-1));
! 			}
! 			ret->nentries++;
! 			ret->entries[ret->nentries-1].type = type;
! 			ret->entries[ret->nentries-1].nnorms = 0;
! 			continue;			
! 		}
! 		
! 		LexizeAddLemm(&ldata, type, lemm, lenlemm);
! 
! 		do
! 		{
! 			ParsedLex  *tmplexs;
! 			TSLexeme   *ptr;
! 
! 			norms = LexizeExec(&ldata, &lexs);
! 			while (lexs)
! 			{
! 				/* a ParsedLex with a positive type starts a new entry */
! 				if (lexs->type > 0)
! 				{
! 					append_word(&words, &word_pos, &words_sz, lexs->lemm, lexs->lenlemm);
! 					while(ret->nentries+1 >= entries_sz){	/* double the entry capacity */
! 						entries_sz *= 2;
! 						ret = repalloc(ret, sizeof(HLTextData) + sizeof(HLTextEntry) * (entries_sz-1));
! 					}
! 					ret->nentries++;
! 					ret->entries[ret->nentries-1].type = lexs->type;
! 					ret->entries[ret->nentries-1].nnorms = 0;
! 				}
! 				/* each lexeme in norms is stored after, and counted on, the newest entry */
! 				ptr = norms;
! 				while (ptr && ptr->lexeme)
! 				{
! 					(ret->entries[ret->nentries-1].nnorms)++;	/* NOTE(review): assumes nentries > 0 and at most 255 lexemes (uint8) -- confirm */
! 					append_word(&words, &word_pos, &words_sz, ptr->lexeme, strlen(ptr->lexeme));
! 					ptr++;
! 				}
! 				/* done with this list node */
! 				tmplexs = lexs->next;
! 				pfree(lexs);
! 				lexs = tmplexs;
! 			}
! 			/* the lexeme strings were copied by append_word, so free the originals */
! 			if (norms)
! 			{
! 				ptr = norms;
! 				while (ptr->lexeme)
! 				{
! 					pfree(ptr->lexeme);
! 					ptr++;
! 				}
! 				pfree(norms);
! 			}
  		} while (norms);
  
  	} while (type > 0);
  
  	FunctionCall1(&(prsobj->prsend), PointerGetDatum(prsdata));
+ 	/* final layout: fixed header, nentries entries, then word_pos bytes of strings */
+ 	total_size = offsetof(HLTextData, entries) + sizeof(HLTextEntry) * (ret->nentries) + word_pos;
+ 	ret = repalloc(ret, total_size);
+ 	memcpy(ret->entries + ret->nentries, words, word_pos);
+ 	pfree(words);
+ 
+ 	/* record the total length in the varlena header */
+ 	SET_VARSIZE(ret, total_size);
+ 	
+ 	return ret;	
  }
  
+ 
  text *
  generateHeadline(HeadlineParsedText *prs)
  {
diff --git a/src/backend/tsearch/wparser.c b/src/backend/tsearch/wparser.c
new file mode 100644
index f45eeb9..63c5957
*** a/src/backend/tsearch/wparser.c
--- b/src/backend/tsearch/wparser.c
*************** ts_headline_opt(PG_FUNCTION_ARGS)
*** 361,363 ****
--- 361,416 ----
  										PG_GETARG_DATUM(1),
  										PG_GETARG_DATUM(2)));
  }
+ 
+ /* ts_headline(regconfig, hltext, tsquery, text): headline from a pre-parsed hltext;
+  * hlparse_hltext replays the stored tokens instead of parsing the document again. */
+ Datum
+ ts_headline_byid_opt_hl(PG_FUNCTION_ARGS)
+ {
+ 	/* HLText is itself a pointer type, so "in" must be HLText, not HLText * */
+ 	HLText		in = (HLText) PG_DETOAST_DATUM(PG_GETARG_DATUM(1));
+ 	TSQuery		query = PG_GETARG_TSQUERY(2);
+ 	text	   *opt = (PG_NARGS() > 3 && PG_GETARG_POINTER(3)) ? PG_GETARG_TEXT_P(3) : NULL;
+ 	HeadlineParsedText prs;
+ 	List	   *prsoptions;
+ 	text	   *out;
+ 	TSConfigCacheEntry *cfg;
+ 	TSParserCacheEntry *prsobj;
+ 
+ 	cfg = lookup_ts_config_cache(PG_GETARG_OID(0));
+ 	prsobj = lookup_ts_parser_cache(cfg->prsId);
+ 
+ 	if (!OidIsValid(prsobj->headlineOid))
+ 		ereport(ERROR,
+ 				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ 		   errmsg("text search parser does not support headline creation")));
+ 
+ 	memset(&prs, 0, sizeof(HeadlineParsedText));
+ 	prs.lenwords = 32;
+ 	prs.words = (HeadlineWordEntry *) palloc(sizeof(HeadlineWordEntry) * prs.lenwords);
+ 
+ 	/* rebuild prs from the stored tokens and lexemes; no parser run is needed */
+ 	hlparse_hltext(&prs, query, in);
+ 
+ 	if (opt)
+ 		prsoptions = deserialize_deflist(PointerGetDatum(opt));
+ 	else
+ 		prsoptions = NIL;
+ 
+ 	FunctionCall3(&(prsobj->prsheadline),
+ 				  PointerGetDatum(&prs),
+ 				  PointerGetDatum(prsoptions),
+ 				  PointerGetDatum(query));
+ 
+ 	out = generateHeadline(&prs);
+ 
+ 	PG_FREE_IF_COPY(in, 1);
+ 	PG_FREE_IF_COPY(query, 2);
+ 	if (opt)
+ 		PG_FREE_IF_COPY(opt, 3);
+ 	pfree(prs.words);
+ 	pfree(prs.startsel);
+ 	pfree(prs.stopsel);
+ 
+ 	PG_RETURN_POINTER(out);
+ }
diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h
new file mode 100644
index f935eb1..edaed71
*** a/src/include/catalog/pg_proc.h
--- b/src/include/catalog/pg_proc.h
*************** DATA(insert OID =  3646 (  gtsvectorin		
*** 4179,4184 ****
--- 4179,4188 ----
  DESCR("I/O");
  DATA(insert OID =  3647 (  gtsvectorout			PGNSP PGUID 12 1 0 0 0 f f f f t f i 1 0 2275 "3642" _null_ _null_ _null_ _null_ gtsvectorout _null_ _null_ _null_ ));
  DESCR("I/O");
+ DATA(insert OID =  3780 (  hltextin			PGNSP PGUID 12 1 0 0 0 f f f f t f i 1 0 3779 "2275" _null_ _null_ _null_ _null_ hltextin _null_ _null_ _null_ ));
+ DESCR("I/O");
+ DATA(insert OID =  3781 (  hltextout			PGNSP PGUID 12 1 0 0 0 f f f f t f i 1 0 2275 "3779" _null_ _null_ _null_ _null_ hltextout _null_ _null_ _null_ ));
+ DESCR("I/O");
  
  DATA(insert OID = 3616 (  tsvector_lt			PGNSP PGUID 12 1 0 0 0 f f f f t f i 2 0 16 "3614 3614" _null_ _null_ _null_ _null_ tsvector_lt _null_ _null_ _null_ ));
  DATA(insert OID = 3617 (  tsvector_le			PGNSP PGUID 12 1 0 0 0 f f f f t f i 2 0 16 "3614 3614" _null_ _null_ _null_ _null_ tsvector_le _null_ _null_ _null_ ));
*************** DESCR("generate headline");
*** 4356,4363 ****
--- 4360,4373 ----
  DATA(insert OID = 3755 (  ts_headline	PGNSP PGUID 12 1 0 0 0 f f f f t f s 2 0 25 "25 3615" _null_ _null_ _null_ _null_ ts_headline _null_ _null_ _null_ ));
  DESCR("generate headline");
  
+ DATA(insert OID = 3783 (  ts_headline	PGNSP PGUID 12 1 0 0 0 f f f f t f i 4 0 25 "3734 3779 3615 25" _null_ _null_ _null_ _null_ ts_headline_byid_opt_hl _null_ _null_ _null_ ));
+ DESCR("generate headline");
+ 
+ 
  DATA(insert OID = 3745 (  to_tsvector		PGNSP PGUID 12 1 0 0 0 f f f f t f i 2 0 3614 "3734 25" _null_ _null_ _null_ _null_ to_tsvector_byid _null_ _null_ _null_ ));
  DESCR("transform to tsvector");
+ DATA(insert OID = 3782 (  to_hltext		PGNSP PGUID 12 1 0 0 0 f f f f t f i 2 0 3779 "3734 25" _null_ _null_ _null_ _null_ to_hltext_byid _null_ _null_ _null_ ));
+ DESCR("transform to hltext");
  DATA(insert OID = 3746 (  to_tsquery		PGNSP PGUID 12 1 0 0 0 f f f f t f i 2 0 3615 "3734 25" _null_ _null_ _null_ _null_ to_tsquery_byid _null_ _null_ _null_ ));
  DESCR("make tsquery");
  DATA(insert OID = 3747 (  plainto_tsquery	PGNSP PGUID 12 1 0 0 0 f f f f t f i 2 0 3615 "3734 25" _null_ _null_ _null_ _null_ plainto_tsquery_byid _null_ _null_ _null_ ));
diff --git a/src/include/catalog/pg_type.h b/src/include/catalog/pg_type.h
new file mode 100644
index d0fe7c5..a0f5848
*** a/src/include/catalog/pg_type.h
--- b/src/include/catalog/pg_type.h
*************** DESCR("registered text search configurat
*** 592,597 ****
--- 592,599 ----
  DATA(insert OID = 3769 ( regdictionary	PGNSP PGUID 4 t b N f t \054 0 0 3770 regdictionaryin regdictionaryout regdictionaryrecv regdictionarysend - - - i p f 0 -1 0 0 _null_ _null_ _null_ ));
  DESCR("registered text search dictionary");
  #define REGDICTIONARYOID	3769
+ DATA(insert OID = 3779 ( hltext		PGNSP PGUID -1 f b U f t \054 0 0 0 hltextin hltextout - - - - - i x f 0 -1 0 0 _null_ _null_ _null_ ));
+ DESCR("type for faster headline calculation");
  
  DATA(insert OID = 3643 ( _tsvector		PGNSP PGUID -1 f b A f t \054 0 3614 0 array_in array_out array_recv array_send - - array_typanalyze i x f 0 -1 0 0 _null_ _null_ _null_ ));
  DATA(insert OID = 3644 ( _gtsvector		PGNSP PGUID -1 f b A f t \054 0 3642 0 array_in array_out array_recv array_send - - array_typanalyze i x f 0 -1 0 0 _null_ _null_ _null_ ));
diff --git a/src/include/tsearch/ts_type.h b/src/include/tsearch/ts_type.h
new file mode 100644
index d80f7ad..e88e0cc
*** a/src/include/tsearch/ts_type.h
--- b/src/include/tsearch/ts_type.h
*************** extern Datum tsquery_rewrite_query(PG_FU
*** 296,299 ****
--- 296,328 ----
  extern Datum tsq_mcontains(PG_FUNCTION_ARGS);
  extern Datum tsq_mcontained(PG_FUNCTION_ARGS);
  
+ 
+ /* HLText */
+ 
+ typedef struct{
+ 	uint8 type;		/* token type as stored by make_hltext */
+ 	uint8 nnorms;	/* number of lexeme strings stored right after this token's text */
+ }HLTextEntry;
+ 
+ 
+ typedef struct
+ {
+ 	char		vl_len_[4];		/* varlena header (do not touch directly!) */
+ 	int32		nentries;		/* number of elements in entries[] */
+ 	HLTextEntry	entries[1];		/* nentries entries, then packed NUL-terminated strings */
+ } HLTextData;
+ 
+ typedef HLTextData *HLText;
+ 
+ /*
+  * I/O
+  */
+ extern Datum hltextin(PG_FUNCTION_ARGS);
+ extern Datum hltextout(PG_FUNCTION_ARGS);
+ 
+ 
+ 
+ extern Datum to_hltext_byid(PG_FUNCTION_ARGS);
+ extern Datum ts_headline_byid_opt_hl(PG_FUNCTION_ARGS);
+ 
  #endif   /* _PG_TSTYPE_H_ */
diff --git a/src/include/tsearch/ts_utils.h b/src/include/tsearch/ts_utils.h
new file mode 100644
index d3088fe..b334aeb
*** a/src/include/tsearch/ts_utils.h
--- b/src/include/tsearch/ts_utils.h
*************** extern void hlparsetext(Oid cfgId, Headl
*** 101,106 ****
--- 101,109 ----
  			char *buf, int32 buflen);
  extern text *generateHeadline(HeadlineParsedText *prs);
  
+ 
+ extern void hlparse_hltext(HeadlineParsedText *prs, TSQuery query, HLText hlt);
+ extern HLText make_hltext(Oid cfgId, char *buf, int buflen);
  /*
   * Common check function for tsvector @@ tsquery
   */