Request for review: tsearch2 patch

Started by Tatsuo Ishii about 19 years ago, 9 messages
#1Tatsuo Ishii
ishii@postgresql.org
1 attachment(s)

Hi,

Here are patches against tsearch2 with CVS head. Currently tsearch2
does not work with multibyte encoding which uses C locale. These
patches are intended to solve the problem by using PostgreSQL in-house
multibyte function instead of mbstowcs which does not work with C
locale. Also iswalpha etc. will not be called in case of C locale
since they are not working with it. Tested with the EUC_JP encoding
(should be working with any multibyte encodings). Existing single byte
encodings should not be broken by the patches, I did not test though.
--
Tatsuo Ishii
SRA OSS, Inc. Japan

Attachments:

tsearch2.patch (text/plain; charset=us-ascii) Download
Index: ts_locale.c
===================================================================
RCS file: /cvsroot/pgsql/contrib/tsearch2/ts_locale.c,v
retrieving revision 1.7
diff -c -r1.7 ts_locale.c
*** ts_locale.c	20 Nov 2006 14:03:30 -0000	1.7
--- ts_locale.c	1 Jan 2007 12:22:50 -0000
***************
*** 63,68 ****
--- 63,101 ----
  
  	return mbstowcs(to, from, len);
  }
+ 
+ #else	/* WIN32 */
+ 
+ size_t
+ char2wchar(wchar_t *to, const char *from, size_t len)
+ {
+ 	wchar_t *result;
+ 	size_t n;
+ 
+ 	if (to == NULL)
+ 		return 0;
+ 
+ 	if (lc_ctype_is_c)
+ 	{
+ 		/* allocate neccesary memory for "to" including NULL terminate */
+ 		result = (wchar_t *)palloc((len+1)*sizeof(wchar_t));
+ 
+ 		/* do the conversion */
+ 		n = (size_t)pg_mb2wchar_with_len(from, (pg_wchar *)result, len);
+ 		if (n > 0)
+ 		{
+ 			/* store the result */
+ 			if (n > len)
+ 				n = len;
+ 			memcpy(to, result, n*sizeof(wchar_t));
+ 			pfree(result);
+ 			*(to + n) = '\0';
+ 		}
+ 		return n;
+ 	}
+ 	return mbstowcs(to, from, len);
+ }
+ 
  #endif   /* WIN32 */
  
  int
***************
*** 70,75 ****
--- 103,113 ----
  {
  	wchar_t		character;
  
+ 	if (lc_ctype_is_c)
+ 	{
+ 		return isalpha(TOUCHAR(ptr));
+ 	}
+ 
  	char2wchar(&character, ptr, 1);
  
  	return iswalpha((wint_t) character);
***************
*** 80,85 ****
--- 118,128 ----
  {
  	wchar_t		character;
  
+ 	if (lc_ctype_is_c)
+ 	{
+ 		return isprint(TOUCHAR(ptr));
+ 	}
+ 
  	char2wchar(&character, ptr, 1);
  
  	return iswprint((wint_t) character);
***************
*** 126,132 ****
  		if ( wlen < 0 )
  			ereport(ERROR,
  					(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
! 					 errmsg("transalation failed from server encoding to wchar_t")));
  
  		Assert(wlen<=len);
  		wstr[wlen] = 0;
--- 169,175 ----
  		if ( wlen < 0 )
  			ereport(ERROR,
  					(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
! 					 errmsg("translation failed from server encoding to wchar_t")));
  
  		Assert(wlen<=len);
  		wstr[wlen] = 0;
***************
*** 152,158 ****
  		if ( wlen < 0 )
  			ereport(ERROR,
  					(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
! 					 errmsg("transalation failed from wchar_t to server encoding %d", errno)));
  		Assert(wlen<=len);
  		out[wlen]='\0';
  	}
--- 195,201 ----
  		if ( wlen < 0 )
  			ereport(ERROR,
  					(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
! 					 errmsg("translation failed from wchar_t to server encoding %d", errno)));
  		Assert(wlen<=len);
  		out[wlen]='\0';
  	}
Index: ts_locale.h
===================================================================
RCS file: /cvsroot/pgsql/contrib/tsearch2/ts_locale.h,v
retrieving revision 1.7
diff -c -r1.7 ts_locale.h
*** ts_locale.h	4 Oct 2006 00:29:47 -0000	1.7
--- ts_locale.h	1 Jan 2007 12:22:50 -0000
***************
*** 38,45 ****
  #else							/* WIN32 */
  
  /* correct mbstowcs */
- #define char2wchar mbstowcs
  #define wchar2char wcstombs
  #endif   /* WIN32 */
  
  #define t_isdigit(x)	( pg_mblen(x)==1 && isdigit( TOUCHAR(x) ) )
--- 38,46 ----
  #else							/* WIN32 */
  
  /* correct mbstowcs */
  #define wchar2char wcstombs
+ size_t		char2wchar(wchar_t *to, const char *from, size_t len);
+ 
  #endif   /* WIN32 */
  
  #define t_isdigit(x)	( pg_mblen(x)==1 && isdigit( TOUCHAR(x) ) )
***************
*** 54,59 ****
--- 55,61 ----
   * t_iseq() should be called only for ASCII symbols
   */
  #define t_iseq(x,c) ( (pg_mblen(x)==1) ? ( TOUCHAR(x) == ((unsigned char)(c)) ) : false )
+ /*#define t_iseq(x,c) ( TOUCHAR(x) == ((unsigned char)(c)))*/
  
  #define COPYCHAR(d,s)	do {				\
  	int lll = pg_mblen( s );			\
Index: wordparser/parser.c
===================================================================
RCS file: /cvsroot/pgsql/contrib/tsearch2/wordparser/parser.c,v
retrieving revision 1.11
diff -c -r1.11 parser.c
*** wordparser/parser.c	4 Oct 2006 00:29:47 -0000	1.11
--- wordparser/parser.c	1 Jan 2007 12:22:51 -0000
***************
*** 44,52 ****
  	 * Some operating systems fail with multi-byte encodings and a C locale.
  	 * Also, for a C locale there is no need to process as multibyte. From
  	 * backend/utils/adt/oracle_compat.c Teodor
  	 */
  
! 	if (prs->charmaxlen > 1 && !lc_ctype_is_c())
  	{
  		prs->usewide = true;
  		prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * prs->lenstr);
--- 44,54 ----
  	 * Some operating systems fail with multi-byte encodings and a C locale.
  	 * Also, for a C locale there is no need to process as multibyte. From
  	 * backend/utils/adt/oracle_compat.c Teodor
+ 	 *
+ 	 * This is wrong assumption. even if locale is C, multibyte is necceary.
  	 */
  
! 	if (prs->charmaxlen > 1)
  	{
  		prs->usewide = true;
  		prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * prs->lenstr);
***************
*** 92,98 ****
  static int											\
  p_is##type(TParser *prs) {									\
  	Assert( prs->state );									\
! 	return ( ( prs->usewide ) ? isw##type( (wint_t)*( prs->wstr + prs->state->poschar ) ) : \
  		is##type( (unsigned char)*( prs->str + prs->state->posbyte ) ) );		\
  }	\
  												\
--- 94,102 ----
  static int											\
  p_is##type(TParser *prs) {									\
  	Assert( prs->state );									\
! 	return ( ( prs->usewide ) ? \
! 			 (lc_ctype_is_c? is##type( 0xff & *( prs->wstr + prs->state->poschar)): \
! 			  isw##type( (wint_t)*( prs->wstr + prs->state->poschar))): \
  		is##type( (unsigned char)*( prs->str + prs->state->posbyte ) ) );		\
  }	\
  												\
***************
*** 134,141 ****
  }
  #endif   /* TS_USE_WIDE */
  
! p_iswhat(alnum)
! p_iswhat(alpha)
  p_iswhat(digit)
  p_iswhat(lower)
  p_iswhat(print)
--- 138,197 ----
  }
  #endif   /* TS_USE_WIDE */
  
! static int p_isalnum(TParser *prs) {
! 	Assert( prs->state );
! 
! 	if (prs->usewide)
! 	{
! 		unsigned int c;
! 
! 		c = *(prs->wstr + prs->state->poschar);
! 
! 		if (lc_ctype_is_c)
! 		{
! 			if (c > 0x7f)
! 				return 1;
! 			return isalnum(0xff & c);
! 		}
! 		else
! 			return iswalnum( (wint_t)*( prs->wstr + prs->state->poschar));
! 	}
! 	else
! 		return isalnum( (unsigned char)*( prs->str + prs->state->posbyte ));
! }
! 
! static int	p_isnotalnum(TParser *prs)
! {
! 	return !p_isalnum(prs);
! }
! 
! static int p_isalpha(TParser *prs) {
! 	Assert( prs->state );
! 
! 	if (prs->usewide)
! 	{
! 		unsigned int c;
! 
! 		c = *(prs->wstr + prs->state->poschar);
! 
! 		if (lc_ctype_is_c)
! 		{
! 			if (c > 0x7f)
! 				return 1;
! 			return isalpha(0xff & c);
! 		}
! 		else
! 			return iswalpha( (wint_t)*( prs->wstr + prs->state->poschar));
! 	}
! 	else
! 		return isalpha( (unsigned char)*( prs->str + prs->state->posbyte ));
! }
! 
! static int	p_isnotalpha(TParser *prs)
! {
! 	return !p_isalpha(prs);
! }
! 
  p_iswhat(digit)
  p_iswhat(lower)
  p_iswhat(print)
#2Tatsuo Ishii
ishii@postgresql.org
In reply to: Tatsuo Ishii (#1)
1 attachment(s)
Re: Request for review: tsearch2 patch

I have tested with a locale-enabled environment and found a bug. Included
is the new version of the patches.

Teodor, Oleg, what do you think about these patches?
If ok, shall I commit to CVS head?
--
Tatsuo Ishii
SRA OSS, Inc. Japan

Show quoted text

Hi,

Here are patches against tsearch2 with CVS head. Currently tsearch2
does not work with multibyte encoding which uses C locale. These
patches are intended to solve the problem by using PostgreSQL in-house
multibyte function instead of mbstowcs which does not work with C
locale. Also iswalpha etc. will not be called in case of C locale
since they are not working with it. Tested with the EUC_JP encoding
(should be working with any multibyte encodings). Existing single byte
encodings should not be broken by the patches, I did not test though.
--
Tatsuo Ishii
SRA OSS, Inc. Japan

Attachments:

tsearch.patch (text/plain; charset=us-ascii) Download
Index: ts_locale.c
===================================================================
RCS file: /cvsroot/pgsql/contrib/tsearch2/ts_locale.c,v
retrieving revision 1.7
diff -c -r1.7 ts_locale.c
*** ts_locale.c	20 Nov 2006 14:03:30 -0000	1.7
--- ts_locale.c	4 Jan 2007 12:16:00 -0000
***************
*** 63,68 ****
--- 63,101 ----
  
  	return mbstowcs(to, from, len);
  }
+ 
+ #else	/* WIN32 */
+ 
+ size_t
+ char2wchar(wchar_t *to, const char *from, size_t len)
+ {
+ 	wchar_t *result;
+ 	size_t n;
+ 
+ 	if (to == NULL)
+ 		return 0;
+ 
+ 	if (lc_ctype_is_c())
+ 	{
+ 		/* allocate neccesary memory for "to" including NULL terminate */
+ 		result = (wchar_t *)palloc((len+1)*sizeof(wchar_t));
+ 
+ 		/* do the conversion */
+ 		n = (size_t)pg_mb2wchar_with_len(from, (pg_wchar *)result, len);
+ 		if (n > 0)
+ 		{
+ 			/* store the result */
+ 			if (n > len)
+ 				n = len;
+ 			memcpy(to, result, n*sizeof(wchar_t));
+ 			pfree(result);
+ 			*(to + n) = '\0';
+ 		}
+ 		return n;
+ 	}
+ 	return mbstowcs(to, from, len);
+ }
+ 
  #endif   /* WIN32 */
  
  int
***************
*** 70,75 ****
--- 103,113 ----
  {
  	wchar_t		character;
  
+ 	if (lc_ctype_is_c())
+ 	{
+ 		return isalpha(TOUCHAR(ptr));
+ 	}
+ 
  	char2wchar(&character, ptr, 1);
  
  	return iswalpha((wint_t) character);
***************
*** 80,85 ****
--- 118,128 ----
  {
  	wchar_t		character;
  
+ 	if (lc_ctype_is_c())
+ 	{
+ 		return isprint(TOUCHAR(ptr));
+ 	}
+ 
  	char2wchar(&character, ptr, 1);
  
  	return iswprint((wint_t) character);
***************
*** 126,132 ****
  		if ( wlen < 0 )
  			ereport(ERROR,
  					(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
! 					 errmsg("transalation failed from server encoding to wchar_t")));
  
  		Assert(wlen<=len);
  		wstr[wlen] = 0;
--- 169,175 ----
  		if ( wlen < 0 )
  			ereport(ERROR,
  					(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
! 					 errmsg("translation failed from server encoding to wchar_t")));
  
  		Assert(wlen<=len);
  		wstr[wlen] = 0;
***************
*** 152,158 ****
  		if ( wlen < 0 )
  			ereport(ERROR,
  					(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
! 					 errmsg("transalation failed from wchar_t to server encoding %d", errno)));
  		Assert(wlen<=len);
  		out[wlen]='\0';
  	}
--- 195,201 ----
  		if ( wlen < 0 )
  			ereport(ERROR,
  					(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
! 					 errmsg("translation failed from wchar_t to server encoding %d", errno)));
  		Assert(wlen<=len);
  		out[wlen]='\0';
  	}
Index: ts_locale.h
===================================================================
RCS file: /cvsroot/pgsql/contrib/tsearch2/ts_locale.h,v
retrieving revision 1.7
diff -c -r1.7 ts_locale.h
*** ts_locale.h	4 Oct 2006 00:29:47 -0000	1.7
--- ts_locale.h	4 Jan 2007 12:16:00 -0000
***************
*** 38,45 ****
  #else							/* WIN32 */
  
  /* correct mbstowcs */
- #define char2wchar mbstowcs
  #define wchar2char wcstombs
  #endif   /* WIN32 */
  
  #define t_isdigit(x)	( pg_mblen(x)==1 && isdigit( TOUCHAR(x) ) )
--- 38,46 ----
  #else							/* WIN32 */
  
  /* correct mbstowcs */
  #define wchar2char wcstombs
+ size_t		char2wchar(wchar_t *to, const char *from, size_t len);
+ 
  #endif   /* WIN32 */
  
  #define t_isdigit(x)	( pg_mblen(x)==1 && isdigit( TOUCHAR(x) ) )
***************
*** 54,59 ****
--- 55,61 ----
   * t_iseq() should be called only for ASCII symbols
   */
  #define t_iseq(x,c) ( (pg_mblen(x)==1) ? ( TOUCHAR(x) == ((unsigned char)(c)) ) : false )
+ /*#define t_iseq(x,c) ( TOUCHAR(x) == ((unsigned char)(c)))*/
  
  #define COPYCHAR(d,s)	do {				\
  	int lll = pg_mblen( s );			\
Index: wordparser/parser.c
===================================================================
RCS file: /cvsroot/pgsql/contrib/tsearch2/wordparser/parser.c,v
retrieving revision 1.11
diff -c -r1.11 parser.c
*** wordparser/parser.c	4 Oct 2006 00:29:47 -0000	1.11
--- wordparser/parser.c	4 Jan 2007 12:16:01 -0000
***************
*** 44,52 ****
  	 * Some operating systems fail with multi-byte encodings and a C locale.
  	 * Also, for a C locale there is no need to process as multibyte. From
  	 * backend/utils/adt/oracle_compat.c Teodor
  	 */
  
! 	if (prs->charmaxlen > 1 && !lc_ctype_is_c())
  	{
  		prs->usewide = true;
  		prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * prs->lenstr);
--- 44,54 ----
  	 * Some operating systems fail with multi-byte encodings and a C locale.
  	 * Also, for a C locale there is no need to process as multibyte. From
  	 * backend/utils/adt/oracle_compat.c Teodor
+ 	 *
+ 	 * This is wrong assumption. even if locale is C, multibyte is necceary.
  	 */
  
! 	if (prs->charmaxlen > 1)
  	{
  		prs->usewide = true;
  		prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * prs->lenstr);
***************
*** 92,98 ****
  static int											\
  p_is##type(TParser *prs) {									\
  	Assert( prs->state );									\
! 	return ( ( prs->usewide ) ? isw##type( (wint_t)*( prs->wstr + prs->state->poschar ) ) : \
  		is##type( (unsigned char)*( prs->str + prs->state->posbyte ) ) );		\
  }	\
  												\
--- 94,102 ----
  static int											\
  p_is##type(TParser *prs) {									\
  	Assert( prs->state );									\
! 	return ( ( prs->usewide ) ? \
! 			 (lc_ctype_is_c()? is##type( 0xff & *( prs->wstr + prs->state->poschar)): \
! 			  isw##type( (wint_t)*( prs->wstr + prs->state->poschar))): \
  		is##type( (unsigned char)*( prs->str + prs->state->posbyte ) ) );		\
  }	\
  												\
***************
*** 134,141 ****
  }
  #endif   /* TS_USE_WIDE */
  
! p_iswhat(alnum)
! p_iswhat(alpha)
  p_iswhat(digit)
  p_iswhat(lower)
  p_iswhat(print)
--- 138,197 ----
  }
  #endif   /* TS_USE_WIDE */
  
! static int p_isalnum(TParser *prs) {
! 	Assert( prs->state );
! 
! 	if (prs->usewide)
! 	{
! 		unsigned int c;
! 
! 		c = *(prs->wstr + prs->state->poschar);
! 
! 		if (lc_ctype_is_c())
! 		{
! 			if (c > 0x7f)
! 				return 1;
! 			return isalnum(0xff & c);
! 		}
! 		else
! 			return iswalnum( (wint_t)*( prs->wstr + prs->state->poschar));
! 	}
! 	else
! 		return isalnum( (unsigned char)*( prs->str + prs->state->posbyte ));
! }
! 
! static int	p_isnotalnum(TParser *prs)
! {
! 	return !p_isalnum(prs);
! }
! 
! static int p_isalpha(TParser *prs) {
! 	Assert( prs->state );
! 
! 	if (prs->usewide)
! 	{
! 		unsigned int c;
! 
! 		c = *(prs->wstr + prs->state->poschar);
! 
! 		if (lc_ctype_is_c())
! 		{
! 			if (c > 0x7f)
! 				return 1;
! 			return isalpha(0xff & c);
! 		}
! 		else
! 			return iswalpha( (wint_t)*( prs->wstr + prs->state->poschar));
! 	}
! 	else
! 		return isalpha( (unsigned char)*( prs->str + prs->state->posbyte ));
! }
! 
! static int	p_isnotalpha(TParser *prs)
! {
! 	return !p_isalpha(prs);
! }
! 
  p_iswhat(digit)
  p_iswhat(lower)
  p_iswhat(print)
#3Teodor Sigaev
teodor@sigaev.ru
In reply to: Tatsuo Ishii (#2)
Re: Request for review: tsearch2 patch

Sorry for the delay, I was on holiday :)

Did you test patch on Windows platform?

Tatsuo Ishii wrote:

I have tested with a locale-enabled environment and found a bug. Included
is the new version of the patches.

Teodor, Oleg, what do you think about these patches?
If ok, shall I commit to CVS head?
--
Tatsuo Ishii
SRA OSS, Inc. Japan

Hi,

Here are patches against tsearch2 with CVS head. Currently tsearch2
does not work with multibyte encoding which uses C locale. These
patches are intended to solve the problem by using PostgreSQL in-house
multibyte function instead of mbstowcs which does not work with C
locale. Also iswalpha etc. will not be called in case of C locale
since they are not working with it. Tested with the EUC_JP encoding
(should be working with any multibyte encodings). Existing single byte
encodings should not be broken by the patches, I did not test though.
--
Tatsuo Ishii
SRA OSS, Inc. Japan

------------------------------------------------------------------------

Index: ts_locale.c
===================================================================
RCS file: /cvsroot/pgsql/contrib/tsearch2/ts_locale.c,v
retrieving revision 1.7
diff -c -r1.7 ts_locale.c
*** ts_locale.c	20 Nov 2006 14:03:30 -0000	1.7
--- ts_locale.c	4 Jan 2007 12:16:00 -0000
***************
*** 63,68 ****
--- 63,101 ----
return mbstowcs(to, from, len);
}
+ 
+ #else	/* WIN32 */
+ 
+ size_t
+ char2wchar(wchar_t *to, const char *from, size_t len)
+ {
+ 	wchar_t *result;
+ 	size_t n;
+ 
+ 	if (to == NULL)
+ 		return 0;
+ 
+ 	if (lc_ctype_is_c())
+ 	{
+ 		/* allocate neccesary memory for "to" including NULL terminate */
+ 		result = (wchar_t *)palloc((len+1)*sizeof(wchar_t));
+ 
+ 		/* do the conversion */
+ 		n = (size_t)pg_mb2wchar_with_len(from, (pg_wchar *)result, len);
+ 		if (n > 0)
+ 		{
+ 			/* store the result */
+ 			if (n > len)
+ 				n = len;
+ 			memcpy(to, result, n*sizeof(wchar_t));
+ 			pfree(result);
+ 			*(to + n) = '\0';
+ 		}
+ 		return n;
+ 	}
+ 	return mbstowcs(to, from, len);
+ }
+ 
#endif   /* WIN32 */
int
***************
*** 70,75 ****
--- 103,113 ----
{
wchar_t		character;
+ 	if (lc_ctype_is_c())
+ 	{
+ 		return isalpha(TOUCHAR(ptr));
+ 	}
+ 
char2wchar(&character, ptr, 1);
return iswalpha((wint_t) character);
***************
*** 80,85 ****
--- 118,128 ----
{
wchar_t		character;
+ 	if (lc_ctype_is_c())
+ 	{
+ 		return isprint(TOUCHAR(ptr));
+ 	}
+ 
char2wchar(&character, ptr, 1);

return iswprint((wint_t) character);
***************
*** 126,132 ****
if ( wlen < 0 )
ereport(ERROR,
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
! errmsg("transalation failed from server encoding to wchar_t")));

Assert(wlen<=len);
wstr[wlen] = 0;
--- 169,175 ----
if ( wlen < 0 )
ereport(ERROR,
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
! 					 errmsg("translation failed from server encoding to wchar_t")));
Assert(wlen<=len);
wstr[wlen] = 0;
***************
*** 152,158 ****
if ( wlen < 0 )
ereport(ERROR,
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
! 					 errmsg("transalation failed from wchar_t to server encoding %d", errno)));
Assert(wlen<=len);
out[wlen]='\0';
}
--- 195,201 ----
if ( wlen < 0 )
ereport(ERROR,
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
! 					 errmsg("translation failed from wchar_t to server encoding %d", errno)));
Assert(wlen<=len);
out[wlen]='\0';
}
Index: ts_locale.h
===================================================================
RCS file: /cvsroot/pgsql/contrib/tsearch2/ts_locale.h,v
retrieving revision 1.7
diff -c -r1.7 ts_locale.h
*** ts_locale.h	4 Oct 2006 00:29:47 -0000	1.7
--- ts_locale.h	4 Jan 2007 12:16:00 -0000
***************
*** 38,45 ****
#else							/* WIN32 */

/* correct mbstowcs */
- #define char2wchar mbstowcs
#define wchar2char wcstombs
#endif /* WIN32 */

#define t_isdigit(x)	( pg_mblen(x)==1 && isdigit( TOUCHAR(x) ) )
--- 38,46 ----
#else							/* WIN32 */
/* correct mbstowcs */
#define wchar2char wcstombs
+ size_t		char2wchar(wchar_t *to, const char *from, size_t len);
+ 
#endif   /* WIN32 */
#define t_isdigit(x)	( pg_mblen(x)==1 && isdigit( TOUCHAR(x) ) )
***************
*** 54,59 ****
--- 55,61 ----
* t_iseq() should be called only for ASCII symbols
*/
#define t_iseq(x,c) ( (pg_mblen(x)==1) ? ( TOUCHAR(x) == ((unsigned char)(c)) ) : false )
+ /*#define t_iseq(x,c) ( TOUCHAR(x) == ((unsigned char)(c)))*/
#define COPYCHAR(d,s)	do {				\
int lll = pg_mblen( s );			\
Index: wordparser/parser.c
===================================================================
RCS file: /cvsroot/pgsql/contrib/tsearch2/wordparser/parser.c,v
retrieving revision 1.11
diff -c -r1.11 parser.c
*** wordparser/parser.c	4 Oct 2006 00:29:47 -0000	1.11
--- wordparser/parser.c	4 Jan 2007 12:16:01 -0000
***************
*** 44,52 ****
* Some operating systems fail with multi-byte encodings and a C locale.
* Also, for a C locale there is no need to process as multibyte. From
* backend/utils/adt/oracle_compat.c Teodor
*/
! 	if (prs->charmaxlen > 1 && !lc_ctype_is_c())
{
prs->usewide = true;
prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * prs->lenstr);
--- 44,54 ----
* Some operating systems fail with multi-byte encodings and a C locale.
* Also, for a C locale there is no need to process as multibyte. From
* backend/utils/adt/oracle_compat.c Teodor
+ 	 *
+ 	 * This is wrong assumption. even if locale is C, multibyte is necceary.
*/
! 	if (prs->charmaxlen > 1)
{
prs->usewide = true;
prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * prs->lenstr);
***************
*** 92,98 ****
static int											\
p_is##type(TParser *prs) {									\
Assert( prs->state );									\
! 	return ( ( prs->usewide ) ? isw##type( (wint_t)*( prs->wstr + prs->state->poschar ) ) : \
is##type( (unsigned char)*( prs->str + prs->state->posbyte ) ) );		\
}	\
\
--- 94,102 ----
static int											\
p_is##type(TParser *prs) {									\
Assert( prs->state );									\
! 	return ( ( prs->usewide ) ? \
! 			 (lc_ctype_is_c()? is##type( 0xff & *( prs->wstr + prs->state->poschar)): \
! 			  isw##type( (wint_t)*( prs->wstr + prs->state->poschar))): \
is##type( (unsigned char)*( prs->str + prs->state->posbyte ) ) );		\
}	\
\
***************
*** 134,141 ****
}
#endif   /* TS_USE_WIDE */
! p_iswhat(alnum)
! p_iswhat(alpha)
p_iswhat(digit)
p_iswhat(lower)
p_iswhat(print)
--- 138,197 ----
}
#endif   /* TS_USE_WIDE */

! static int p_isalnum(TParser *prs) {
! Assert( prs->state );
!
! if (prs->usewide)
! {
! unsigned int c;
!
! c = *(prs->wstr + prs->state->poschar);
!
! if (lc_ctype_is_c())
! {
! if (c > 0x7f)
! return 1;
! return isalnum(0xff & c);
! }
! else
! return iswalnum( (wint_t)*( prs->wstr + prs->state->poschar));
! }
! else
! return isalnum( (unsigned char)*( prs->str + prs->state->posbyte ));
! }
!
! static int p_isnotalnum(TParser *prs)
! {
! return !p_isalnum(prs);
! }
!
! static int p_isalpha(TParser *prs) {
! Assert( prs->state );
!
! if (prs->usewide)
! {
! unsigned int c;
!
! c = *(prs->wstr + prs->state->poschar);
!
! if (lc_ctype_is_c())
! {
! if (c > 0x7f)
! return 1;
! return isalpha(0xff & c);
! }
! else
! return iswalpha( (wint_t)*( prs->wstr + prs->state->poschar));
! }
! else
! return isalpha( (unsigned char)*( prs->str + prs->state->posbyte ));
! }
!
! static int p_isnotalpha(TParser *prs)
! {
! return !p_isalpha(prs);
! }
!
p_iswhat(digit)
p_iswhat(lower)
p_iswhat(print)

------------------------------------------------------------------------

---------------------------(end of broadcast)---------------------------
TIP 9: In versions below 8.0, the planner will ignore your desire to
choose an index scan if your joining column's datatypes do not
match

--
Teodor Sigaev E-mail: teodor@sigaev.ru
WWW: http://www.sigaev.ru/

#4Tatsuo Ishii
ishii@sraoss.co.jp
In reply to: Teodor Sigaev (#3)
Re: Request for review: tsearch2 patch

Sorry for the delay, I was on holiday :)

Did you test patch on Windows platform?

No. I myself do not use the Windows platform.

Do you have any concerns about Windows regarding my patches?
--
Tatsuo Ishii
SRA OSS, Inc. Japan

Show quoted text

Tatsuo Ishii wrote:

I have tested with a locale-enabled environment and found a bug. Included
is the new version of the patches.

Teodor, Oleg, what do you think about these patches?
If ok, shall I commit to CVS head?
--
Tatsuo Ishii
SRA OSS, Inc. Japan

Hi,

Here are patches against tsearch2 with CVS head. Currently tsearch2
does not work with multibyte encoding which uses C locale. These
patches are intended to solve the problem by using PostgreSQL in-house
multibyte function instead of mbstowcs which does not work with C
locale. Also iswalpha etc. will not be called in case of C locale
since they are not working with it. Tested with the EUC_JP encoding
(should be working with any multibyte encodings). Existing single byte
encodings should not be broken by the patches, I did not test though.
--
Tatsuo Ishii
SRA OSS, Inc. Japan

------------------------------------------------------------------------

Index: ts_locale.c
===================================================================
RCS file: /cvsroot/pgsql/contrib/tsearch2/ts_locale.c,v
retrieving revision 1.7
diff -c -r1.7 ts_locale.c
*** ts_locale.c	20 Nov 2006 14:03:30 -0000	1.7
--- ts_locale.c	4 Jan 2007 12:16:00 -0000
***************
*** 63,68 ****
--- 63,101 ----
return mbstowcs(to, from, len);
}
+ 
+ #else	/* WIN32 */
+ 
+ size_t
+ char2wchar(wchar_t *to, const char *from, size_t len)
+ {
+ 	wchar_t *result;
+ 	size_t n;
+ 
+ 	if (to == NULL)
+ 		return 0;
+ 
+ 	if (lc_ctype_is_c())
+ 	{
+ 		/* allocate neccesary memory for "to" including NULL terminate */
+ 		result = (wchar_t *)palloc((len+1)*sizeof(wchar_t));
+ 
+ 		/* do the conversion */
+ 		n = (size_t)pg_mb2wchar_with_len(from, (pg_wchar *)result, len);
+ 		if (n > 0)
+ 		{
+ 			/* store the result */
+ 			if (n > len)
+ 				n = len;
+ 			memcpy(to, result, n*sizeof(wchar_t));
+ 			pfree(result);
+ 			*(to + n) = '\0';
+ 		}
+ 		return n;
+ 	}
+ 	return mbstowcs(to, from, len);
+ }
+ 
#endif   /* WIN32 */
int
***************
*** 70,75 ****
--- 103,113 ----
{
wchar_t		character;
+ 	if (lc_ctype_is_c())
+ 	{
+ 		return isalpha(TOUCHAR(ptr));
+ 	}
+ 
char2wchar(&character, ptr, 1);
return iswalpha((wint_t) character);
***************
*** 80,85 ****
--- 118,128 ----
{
wchar_t		character;
+ 	if (lc_ctype_is_c())
+ 	{
+ 		return isprint(TOUCHAR(ptr));
+ 	}
+ 
char2wchar(&character, ptr, 1);

return iswprint((wint_t) character);
***************
*** 126,132 ****
if ( wlen < 0 )
ereport(ERROR,
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
! errmsg("transalation failed from server encoding to wchar_t")));

Assert(wlen<=len);
wstr[wlen] = 0;
--- 169,175 ----
if ( wlen < 0 )
ereport(ERROR,
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
! 					 errmsg("translation failed from server encoding to wchar_t")));
Assert(wlen<=len);
wstr[wlen] = 0;
***************
*** 152,158 ****
if ( wlen < 0 )
ereport(ERROR,
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
! 					 errmsg("transalation failed from wchar_t to server encoding %d", errno)));
Assert(wlen<=len);
out[wlen]='\0';
}
--- 195,201 ----
if ( wlen < 0 )
ereport(ERROR,
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
! 					 errmsg("translation failed from wchar_t to server encoding %d", errno)));
Assert(wlen<=len);
out[wlen]='\0';
}
Index: ts_locale.h
===================================================================
RCS file: /cvsroot/pgsql/contrib/tsearch2/ts_locale.h,v
retrieving revision 1.7
diff -c -r1.7 ts_locale.h
*** ts_locale.h	4 Oct 2006 00:29:47 -0000	1.7
--- ts_locale.h	4 Jan 2007 12:16:00 -0000
***************
*** 38,45 ****
#else							/* WIN32 */

/* correct mbstowcs */
- #define char2wchar mbstowcs
#define wchar2char wcstombs
#endif /* WIN32 */

#define t_isdigit(x)	( pg_mblen(x)==1 && isdigit( TOUCHAR(x) ) )
--- 38,46 ----
#else							/* WIN32 */
/* correct mbstowcs */
#define wchar2char wcstombs
+ size_t		char2wchar(wchar_t *to, const char *from, size_t len);
+ 
#endif   /* WIN32 */
#define t_isdigit(x)	( pg_mblen(x)==1 && isdigit( TOUCHAR(x) ) )
***************
*** 54,59 ****
--- 55,61 ----
* t_iseq() should be called only for ASCII symbols
*/
#define t_iseq(x,c) ( (pg_mblen(x)==1) ? ( TOUCHAR(x) == ((unsigned char)(c)) ) : false )
+ /*#define t_iseq(x,c) ( TOUCHAR(x) == ((unsigned char)(c)))*/
#define COPYCHAR(d,s)	do {				\
int lll = pg_mblen( s );			\
Index: wordparser/parser.c
===================================================================
RCS file: /cvsroot/pgsql/contrib/tsearch2/wordparser/parser.c,v
retrieving revision 1.11
diff -c -r1.11 parser.c
*** wordparser/parser.c	4 Oct 2006 00:29:47 -0000	1.11
--- wordparser/parser.c	4 Jan 2007 12:16:01 -0000
***************
*** 44,52 ****
* Some operating systems fail with multi-byte encodings and a C locale.
* Also, for a C locale there is no need to process as multibyte. From
* backend/utils/adt/oracle_compat.c Teodor
*/
! 	if (prs->charmaxlen > 1 && !lc_ctype_is_c())
{
prs->usewide = true;
prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * prs->lenstr);
--- 44,54 ----
* Some operating systems fail with multi-byte encodings and a C locale.
* Also, for a C locale there is no need to process as multibyte. From
* backend/utils/adt/oracle_compat.c Teodor
+ 	 *
+ 	 * This is wrong assumption. even if locale is C, multibyte is necceary.
*/
! 	if (prs->charmaxlen > 1)
{
prs->usewide = true;
prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * prs->lenstr);
***************
*** 92,98 ****
static int											\
p_is##type(TParser *prs) {									\
Assert( prs->state );									\
! 	return ( ( prs->usewide ) ? isw##type( (wint_t)*( prs->wstr + prs->state->poschar ) ) : \
is##type( (unsigned char)*( prs->str + prs->state->posbyte ) ) );		\
}	\
\
--- 94,102 ----
static int											\
p_is##type(TParser *prs) {									\
Assert( prs->state );									\
! 	return ( ( prs->usewide ) ? \
! 			 (lc_ctype_is_c()? is##type( 0xff & *( prs->wstr + prs->state->poschar)): \
! 			  isw##type( (wint_t)*( prs->wstr + prs->state->poschar))): \
is##type( (unsigned char)*( prs->str + prs->state->posbyte ) ) );		\
}	\
\
***************
*** 134,141 ****
}
#endif   /* TS_USE_WIDE */
! p_iswhat(alnum)
! p_iswhat(alpha)
p_iswhat(digit)
p_iswhat(lower)
p_iswhat(print)
--- 138,197 ----
}
#endif   /* TS_USE_WIDE */

! static int p_isalnum(TParser *prs) {
! Assert( prs->state );
!
! if (prs->usewide)
! {
! unsigned int c;
!
! c = *(prs->wstr + prs->state->poschar);
!
! if (lc_ctype_is_c())
! {
! if (c > 0x7f)
! return 1;
! return isalnum(0xff & c);
! }
! else
! return iswalnum( (wint_t)*( prs->wstr + prs->state->poschar));
! }
! else
! return isalnum( (unsigned char)*( prs->str + prs->state->posbyte ));
! }
!
! static int p_isnotalnum(TParser *prs)
! {
! return !p_isalnum(prs);
! }
!
! static int p_isalpha(TParser *prs) {
! Assert( prs->state );
!
! if (prs->usewide)
! {
! unsigned int c;
!
! c = *(prs->wstr + prs->state->poschar);
!
! if (lc_ctype_is_c())
! {
! if (c > 0x7f)
! return 1;
! return isalpha(0xff & c);
! }
! else
! return iswalpha( (wint_t)*( prs->wstr + prs->state->poschar));
! }
! else
! return isalpha( (unsigned char)*( prs->str + prs->state->posbyte ));
! }
!
! static int p_isnotalpha(TParser *prs)
! {
! return !p_isalpha(prs);
! }
!
p_iswhat(digit)
p_iswhat(lower)
p_iswhat(print)

------------------------------------------------------------------------

---------------------------(end of broadcast)---------------------------
TIP 9: In versions below 8.0, the planner will ignore your desire to
choose an index scan if your joining column's datatypes do not
match

--
Teodor Sigaev E-mail: teodor@sigaev.ru
WWW: http://www.sigaev.ru/

#5Teodor Sigaev
teodor@sigaev.ru
In reply to: Tatsuo Ishii (#2)
Re: Request for review: tsearch2 patch

I have tested with a locale-enabled environment and found a bug. Included
is the new version of the patches.

Your patch causes crash on tsearch2's installcheck with 'initdb -E UTF8 --locale
C', simple way to reproduce:
# select to_tsquery('default', '''New York''');
server closed the connection unexpectedly
This probably means the server terminated abnormally
before or while processing the request.
The connection to the server was lost. Attempting reset: Failed.

! static int p_isalnum(TParser *prs) {

...

! if (lc_ctype_is_c())
! {
! if (c > 0x7f)
! return 1;

I have some doubts that any character greater than 0x7f is an alpha symbol.
Is it a simple assumption or a workaround?

--
Teodor Sigaev E-mail: teodor@sigaev.ru
WWW: http://www.sigaev.ru/

#6Tatsuo Ishii
ishii@sraoss.co.jp
In reply to: Teodor Sigaev (#5)
Re: Request for review: tsearch2 patch

From: Teodor Sigaev <teodor@sigaev.ru>
Subject: Re: [HACKERS] Request for review: tsearch2 patch
Date: Wed, 10 Jan 2007 18:50:44 +0300
Message-ID: <45A50B54.6090608@sigaev.ru>

I have tested with a locale-enabled environment and found a bug. Included
is the new version of the patches.

Your patch causes a crash in tsearch2's installcheck with 'initdb -E UTF8 --locale
C'. A simple way to reproduce:
# select to_tsquery('default', '''New York''');
server closed the connection unexpectedly
This probably means the server terminated abnormally
before or while processing the request.
The connection to the server was lost. Attempting reset: Failed.

It seems it's a bug in the original tsearch2. Here is the patch.

------------------------------------------------------------------
*** wordparser/parser.c~	2007-01-07 09:54:39.000000000 +0900
--- wordparser/parser.c	2007-01-11 10:33:41.000000000 +0900
***************
*** 51,57 ****
  	if (prs->charmaxlen > 1)
  	{
  		prs->usewide = true;
! 		prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * prs->lenstr);
  		prs->lenwstr = char2wchar(prs->wstr, prs->str, prs->lenstr);
  	}
  	else
--- 51,57 ----
  	if (prs->charmaxlen > 1)
  	{
  		prs->usewide = true;
! 		prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * (prs->lenstr+1));
  		prs->lenwstr = char2wchar(prs->wstr, prs->str, prs->lenstr);
  	}
  	else
------------------------------------------------------------------

! static int p_isalnum(TParser *prs) {

...

! if (lc_ctype_is_c())
! {
! if (c > 0x7f)
! return 1;

I have some doubts that any character greater than 0x7f is an alpha symbol.
Is it a simple assumption or a workaround?

Yeah, it's a workaround. Since there's no concept other than
alpha/numeric/latin in tsearch2, Asian characters have to fall into
one of them.
--
Tatsuo Ishii
SRA OSS, Inc. Japan

#7Tatsuo Ishii
ishii@postgresql.org
In reply to: Teodor Sigaev (#5)
Re: Request for review: tsearch2 patch

I have tested with a locale-enabled environment and found a bug. Included
is the new version of the patches.

Your patch causes a crash in tsearch2's installcheck with 'initdb -E UTF8 --locale
C'. A simple way to reproduce:
# select to_tsquery('default', '''New York''');
server closed the connection unexpectedly
This probably means the server terminated abnormally
before or while processing the request.
The connection to the server was lost. Attempting reset: Failed.

It seems it's a bug in the original tsearch2. Here is the patch.

------------------------------------------------------------------
*** wordparser/parser.c~	2007-01-07 09:54:39.000000000 +0900
--- wordparser/parser.c	2007-01-11 10:33:41.000000000 +0900
***************
*** 51,57 ****
  	if (prs->charmaxlen > 1)
  	{
  		prs->usewide = true;
! 		prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * prs->lenstr);
  		prs->lenwstr = char2wchar(prs->wstr, prs->str, prs->lenstr);
  	}
  	else
--- 51,57 ----
  	if (prs->charmaxlen > 1)
  	{
  		prs->usewide = true;
! 		prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * (prs->lenstr+1));
  		prs->lenwstr = char2wchar(prs->wstr, prs->str, prs->lenstr);
  	}
  	else
------------------------------------------------------------------

! static int p_isalnum(TParser *prs) {

...

! if (lc_ctype_is_c())
! {
! if (c > 0x7f)
! return 1;

I have some doubts that any character greater than 0x7f is an alpha symbol.
Is it a simple assumption or a workaround?

Yeah, it's a workaround. Since there's no concept other than
alpha/numeric/latin in tsearch2, Asian characters have to fall into
one of them.
--
Tatsuo Ishii
SRA OSS, Inc. Japan

#8Teodor Sigaev
teodor@sigaev.ru
In reply to: Tatsuo Ishii (#6)
1 attachment(s)
Re: Request for review: tsearch2 patch

Yeah, it's a workaround. Since there's no concept other than
alpha/numeric/latin in tsearch2, Asian characters have to fall into
one of them.

Ok, I see.

Please test the attached patch - if it is good then I'll commit it on Monday to the
HEAD and 8.2 branches.

PS. Magnus, may I ask you to test under Windows? Thank you.

--
Teodor Sigaev E-mail: teodor@sigaev.ru
WWW: http://www.sigaev.ru/

Attachments:

tsearch2.patchtext/plain; name=tsearch2.patchDownload
diff -c -r -N ../tsearch2.orig/ts_locale.c ./ts_locale.c
*** ../tsearch2.orig/ts_locale.c	Fri Jan 12 10:53:11 2007
--- ./ts_locale.c	Fri Jan 12 18:10:27 2007
***************
*** 12,24 ****
  size_t
  wchar2char(char *to, const wchar_t *from, size_t len)
  {
  	if (GetDatabaseEncoding() == PG_UTF8)
  	{
  		int			r;
  
- 		if (len == 0)
- 			return 0;
- 
  		r = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, len,
  								NULL, NULL);
  
--- 12,24 ----
  size_t
  wchar2char(char *to, const wchar_t *from, size_t len)
  {
+ 	if (len == 0)
+ 		return 0;
+ 
  	if (GetDatabaseEncoding() == PG_UTF8)
  	{
  		int			r;
  
  		r = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, len,
  								NULL, NULL);
  
***************
*** 34,50 ****
  
  	return wcstombs(to, from, len);
  }
  
  size_t
  char2wchar(wchar_t *to, const char *from, size_t len)
  {
  	if (GetDatabaseEncoding() == PG_UTF8)
  	{
  		int			r;
  
- 		if (len == 0)
- 			return 0;
- 
  		r = MultiByteToWideChar(CP_UTF8, 0, from, len, to, len);
  
  		if (!r)
--- 34,52 ----
  
  	return wcstombs(to, from, len);
  }
+ #endif   /* WIN32 */
  
  size_t
  char2wchar(wchar_t *to, const char *from, size_t len)
  {
+ 	if (len == 0)
+ 		return 0;
+ 
+ #ifdef WIN32
  	if (GetDatabaseEncoding() == PG_UTF8)
  	{
  		int			r;
  
  		r = MultiByteToWideChar(CP_UTF8, 0, from, len, to, len);
  
  		if (!r)
***************
*** 60,88 ****
  
  		return r;
  	}
  
  	return mbstowcs(to, from, len);
  }
- #endif   /* WIN32 */
  
  int
  _t_isalpha(const char *ptr)
  {
! 	wchar_t		character;
  
! 	char2wchar(&character, ptr, 1);
  
! 	return iswalpha((wint_t) character);
  }
  
  int
  _t_isprint(const char *ptr)
  {
! 	wchar_t		character;
  
! 	char2wchar(&character, ptr, 1);
  
! 	return iswprint((wint_t) character);
  }
  #endif   /* TS_USE_WIDE */
  
--- 62,105 ----
  
  		return r;
  	}
+ 	else 
+ #endif /* WIN32 */
+ 	if ( lc_ctype_is_c() )
+ 	{
+ 		/*
+ 		 * pg_mb2wchar_with_len always adds trailing '\0', so 
+ 		 * 'to' should be allocated with sufficient space 
+ 		 */
+ 		return pg_mb2wchar_with_len(from, (pg_wchar *)to, len);
+ 	}
  
  	return mbstowcs(to, from, len);
  }
  
  int
  _t_isalpha(const char *ptr)
  {
! 	wchar_t		character[2];
! 
! 	if (lc_ctype_is_c())
! 		return isalpha(TOUCHAR(ptr));
  
! 	char2wchar(character, ptr, 1);
  
! 	return iswalpha((wint_t) *character);
  }
  
  int
  _t_isprint(const char *ptr)
  {
! 	wchar_t		character[2];
! 
! 	if (lc_ctype_is_c())
! 		return isprint(TOUCHAR(ptr));
  
! 	char2wchar(character, ptr, 1);
  
! 	return iswprint((wint_t) *character);
  }
  #endif   /* TS_USE_WIDE */
  
***************
*** 126,132 ****
  		if ( wlen < 0 )
  			ereport(ERROR,
  					(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
! 					 errmsg("transalation failed from server encoding to wchar_t")));
  
  		Assert(wlen<=len);
  		wstr[wlen] = 0;
--- 143,149 ----
  		if ( wlen < 0 )
  			ereport(ERROR,
  					(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
! 					 errmsg("translation failed from server encoding to wchar_t")));
  
  		Assert(wlen<=len);
  		wstr[wlen] = 0;
***************
*** 152,158 ****
  		if ( wlen < 0 )
  			ereport(ERROR,
  					(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
! 					 errmsg("transalation failed from wchar_t to server encoding %d", errno)));
  		Assert(wlen<=len);
  		out[wlen]='\0';
  	}
--- 169,175 ----
  		if ( wlen < 0 )
  			ereport(ERROR,
  					(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
! 					 errmsg("translation failed from wchar_t to server encoding %d", errno)));
  		Assert(wlen<=len);
  		out[wlen]='\0';
  	}
diff -c -r -N ../tsearch2.orig/ts_locale.h ./ts_locale.h
*** ../tsearch2.orig/ts_locale.h	Fri Jan 12 10:53:11 2007
--- ./ts_locale.h	Fri Jan 12 18:10:19 2007
***************
*** 30,45 ****
  #define TOUCHAR(x)	(*((unsigned char*)(x)))
  
  #ifdef TS_USE_WIDE
  
  #ifdef WIN32
  
  size_t		wchar2char(char *to, const wchar_t *from, size_t len);
! size_t		char2wchar(wchar_t *to, const char *from, size_t len);
  #else							/* WIN32 */
  
! /* correct mbstowcs */
! #define char2wchar mbstowcs
  #define wchar2char wcstombs
  #endif   /* WIN32 */
  
  #define t_isdigit(x)	( pg_mblen(x)==1 && isdigit( TOUCHAR(x) ) )
--- 30,46 ----
  #define TOUCHAR(x)	(*((unsigned char*)(x)))
  
  #ifdef TS_USE_WIDE
+ size_t		char2wchar(wchar_t *to, const char *from, size_t len);
  
  #ifdef WIN32
  
  size_t		wchar2char(char *to, const wchar_t *from, size_t len);
! 
  #else							/* WIN32 */
  
! /* correct wcstombs */
  #define wchar2char wcstombs
+ 
  #endif   /* WIN32 */
  
  #define t_isdigit(x)	( pg_mblen(x)==1 && isdigit( TOUCHAR(x) ) )
***************
*** 55,64 ****
   */
  #define t_iseq(x,c) ( (pg_mblen(x)==1) ? ( TOUCHAR(x) == ((unsigned char)(c)) ) : false )
  
! #define COPYCHAR(d,s)	do {				\
! 	int lll = pg_mblen( s );			\
! 							\
! 	while( lll-- )					\
  		TOUCHAR((d)+lll) = TOUCHAR((s)+lll);	\
  } while(0)
  
--- 56,65 ----
   */
  #define t_iseq(x,c) ( (pg_mblen(x)==1) ? ( TOUCHAR(x) == ((unsigned char)(c)) ) : false )
  
! #define COPYCHAR(d,s)	do {					\
! 	int lll = pg_mblen( s );					\
! 												\
! 	while( lll-- )								\
  		TOUCHAR((d)+lll) = TOUCHAR((s)+lll);	\
  } while(0)
  
diff -c -r -N ../tsearch2.orig/tsearch2.patch ./tsearch2.patch
*** ../tsearch2.orig/tsearch2.patch	Thu Jan  1 03:00:00 1970
--- ./tsearch2.patch	Fri Jan 12 18:12:30 2007
***************
*** 0 ****
--- 1,243 ----
+ diff -c -r -N ../tsearch2.orig/ts_locale.c ./ts_locale.c
+ *** ../tsearch2.orig/ts_locale.c	Fri Jan 12 10:53:11 2007
+ --- ./ts_locale.c	Fri Jan 12 18:10:27 2007
+ ***************
+ *** 12,24 ****
+   size_t
+   wchar2char(char *to, const wchar_t *from, size_t len)
+   {
+   	if (GetDatabaseEncoding() == PG_UTF8)
+   	{
+   		int			r;
+   
+ - 		if (len == 0)
+ - 			return 0;
+ - 
+   		r = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, len,
+   								NULL, NULL);
+   
+ --- 12,24 ----
+   size_t
+   wchar2char(char *to, const wchar_t *from, size_t len)
+   {
+ + 	if (len == 0)
+ + 		return 0;
+ + 
+   	if (GetDatabaseEncoding() == PG_UTF8)
+   	{
+   		int			r;
+   
+   		r = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, len,
+   								NULL, NULL);
+   
+ ***************
+ *** 34,50 ****
+   
+   	return wcstombs(to, from, len);
+   }
+   
+   size_t
+   char2wchar(wchar_t *to, const char *from, size_t len)
+   {
+   	if (GetDatabaseEncoding() == PG_UTF8)
+   	{
+   		int			r;
+   
+ - 		if (len == 0)
+ - 			return 0;
+ - 
+   		r = MultiByteToWideChar(CP_UTF8, 0, from, len, to, len);
+   
+   		if (!r)
+ --- 34,52 ----
+   
+   	return wcstombs(to, from, len);
+   }
+ + #endif   /* WIN32 */
+   
+   size_t
+   char2wchar(wchar_t *to, const char *from, size_t len)
+   {
+ + 	if (len == 0)
+ + 		return 0;
+ + 
+ + #ifdef WIN32
+   	if (GetDatabaseEncoding() == PG_UTF8)
+   	{
+   		int			r;
+   
+   		r = MultiByteToWideChar(CP_UTF8, 0, from, len, to, len);
+   
+   		if (!r)
+ ***************
+ *** 60,88 ****
+   
+   		return r;
+   	}
+   
+   	return mbstowcs(to, from, len);
+   }
+ - #endif   /* WIN32 */
+   
+   int
+   _t_isalpha(const char *ptr)
+   {
+ ! 	wchar_t		character;
+   
+ ! 	char2wchar(&character, ptr, 1);
+   
+ ! 	return iswalpha((wint_t) character);
+   }
+   
+   int
+   _t_isprint(const char *ptr)
+   {
+ ! 	wchar_t		character;
+   
+ ! 	char2wchar(&character, ptr, 1);
+   
+ ! 	return iswprint((wint_t) character);
+   }
+   #endif   /* TS_USE_WIDE */
+   
+ --- 62,105 ----
+   
+   		return r;
+   	}
+ + 	else 
+ + #endif /* WIN32 */
+ + 	if ( lc_ctype_is_c() )
+ + 	{
+ + 		/*
+ + 		 * pg_mb2wchar_with_len always adds trailing '\0', so 
+ + 		 * 'to' should be allocated with sufficient space 
+ + 		 */
+ + 		return pg_mb2wchar_with_len(from, (pg_wchar *)to, len);
+ + 	}
+   
+   	return mbstowcs(to, from, len);
+   }
+   
+   int
+   _t_isalpha(const char *ptr)
+   {
+ ! 	wchar_t		character[2];
+ ! 
+ ! 	if (lc_ctype_is_c())
+ ! 		return isalpha(TOUCHAR(ptr));
+   
+ ! 	char2wchar(character, ptr, 1);
+   
+ ! 	return iswalpha((wint_t) *character);
+   }
+   
+   int
+   _t_isprint(const char *ptr)
+   {
+ ! 	wchar_t		character[2];
+ ! 
+ ! 	if (lc_ctype_is_c())
+ ! 		return isprint(TOUCHAR(ptr));
+   
+ ! 	char2wchar(character, ptr, 1);
+   
+ ! 	return iswprint((wint_t) *character);
+   }
+   #endif   /* TS_USE_WIDE */
+   
+ ***************
+ *** 126,132 ****
+   		if ( wlen < 0 )
+   			ereport(ERROR,
+   					(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
+ ! 					 errmsg("transalation failed from server encoding to wchar_t")));
+   
+   		Assert(wlen<=len);
+   		wstr[wlen] = 0;
+ --- 143,149 ----
+   		if ( wlen < 0 )
+   			ereport(ERROR,
+   					(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
+ ! 					 errmsg("translation failed from server encoding to wchar_t")));
+   
+   		Assert(wlen<=len);
+   		wstr[wlen] = 0;
+ ***************
+ *** 152,158 ****
+   		if ( wlen < 0 )
+   			ereport(ERROR,
+   					(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
+ ! 					 errmsg("transalation failed from wchar_t to server encoding %d", errno)));
+   		Assert(wlen<=len);
+   		out[wlen]='\0';
+   	}
+ --- 169,175 ----
+   		if ( wlen < 0 )
+   			ereport(ERROR,
+   					(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
+ ! 					 errmsg("translation failed from wchar_t to server encoding %d", errno)));
+   		Assert(wlen<=len);
+   		out[wlen]='\0';
+   	}
+ diff -c -r -N ../tsearch2.orig/ts_locale.h ./ts_locale.h
+ *** ../tsearch2.orig/ts_locale.h	Fri Jan 12 10:53:11 2007
+ --- ./ts_locale.h	Fri Jan 12 18:10:19 2007
+ ***************
+ *** 30,45 ****
+   #define TOUCHAR(x)	(*((unsigned char*)(x)))
+   
+   #ifdef TS_USE_WIDE
+   
+   #ifdef WIN32
+   
+   size_t		wchar2char(char *to, const wchar_t *from, size_t len);
+ ! size_t		char2wchar(wchar_t *to, const char *from, size_t len);
+   #else							/* WIN32 */
+   
+ ! /* correct mbstowcs */
+ ! #define char2wchar mbstowcs
+   #define wchar2char wcstombs
+   #endif   /* WIN32 */
+   
+   #define t_isdigit(x)	( pg_mblen(x)==1 && isdigit( TOUCHAR(x) ) )
+ --- 30,46 ----
+   #define TOUCHAR(x)	(*((unsigned char*)(x)))
+   
+   #ifdef TS_USE_WIDE
+ + size_t		char2wchar(wchar_t *to, const char *from, size_t len);
+   
+   #ifdef WIN32
+   
+   size_t		wchar2char(char *to, const wchar_t *from, size_t len);
+ ! 
+   #else							/* WIN32 */
+   
+ ! /* correct wcstombs */
+   #define wchar2char wcstombs
+ + 
+   #endif   /* WIN32 */
+   
+   #define t_isdigit(x)	( pg_mblen(x)==1 && isdigit( TOUCHAR(x) ) )
+ ***************
+ *** 55,64 ****
+    */
+   #define t_iseq(x,c) ( (pg_mblen(x)==1) ? ( TOUCHAR(x) == ((unsigned char)(c)) ) : false )
+   
+ ! #define COPYCHAR(d,s)	do {				\
+ ! 	int lll = pg_mblen( s );			\
+ ! 							\
+ ! 	while( lll-- )					\
+   		TOUCHAR((d)+lll) = TOUCHAR((s)+lll);	\
+   } while(0)
+   
+ --- 56,65 ----
+    */
+   #define t_iseq(x,c) ( (pg_mblen(x)==1) ? ( TOUCHAR(x) == ((unsigned char)(c)) ) : false )
+   
+ ! #define COPYCHAR(d,s)	do {					\
+ ! 	int lll = pg_mblen( s );					\
+ ! 												\
+ ! 	while( lll-- )								\
+   		TOUCHAR((d)+lll) = TOUCHAR((s)+lll);	\
+   } while(0)
+   
diff -c -r -N ../tsearch2.orig/wordparser/parser.c ./wordparser/parser.c
*** ../tsearch2.orig/wordparser/parser.c	Fri Jan 12 10:53:11 2007
--- ./wordparser/parser.c	Fri Jan 12 18:10:38 2007
***************
*** 40,55 ****
  #ifdef TS_USE_WIDE
  
  	/*
! 	 * Use wide char code only when max encoding length > 1 and ctype != C.
! 	 * Some operating systems fail with multi-byte encodings and a C locale.
! 	 * Also, for a C locale there is no need to process as multibyte. From
! 	 * backend/utils/adt/oracle_compat.c Teodor
  	 */
  
! 	if (prs->charmaxlen > 1 && !lc_ctype_is_c())
  	{
  		prs->usewide = true;
! 		prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * prs->lenstr);
  		prs->lenwstr = char2wchar(prs->wstr, prs->str, prs->lenstr);
  	}
  	else
--- 40,52 ----
  #ifdef TS_USE_WIDE
  
  	/*
! 	 * Use wide char code only when max encoding length > 1.
  	 */
  
! 	if (prs->charmaxlen > 1)
  	{
  		prs->usewide = true;
! 		prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * (prs->lenstr+1));
  		prs->lenwstr = char2wchar(prs->wstr, prs->str, prs->lenstr);
  	}
  	else
***************
*** 83,107 ****
  
  /*
   * defining support function, equvalent is* macroses, but
!  * working with any possible encodings and locales
   */
  
  #ifdef TS_USE_WIDE
  
! #define p_iswhat(type)										\
! static int											\
! p_is##type(TParser *prs) {									\
! 	Assert( prs->state );									\
! 	return ( ( prs->usewide ) ? isw##type( (wint_t)*( prs->wstr + prs->state->poschar ) ) : \
! 		is##type( (unsigned char)*( prs->str + prs->state->posbyte ) ) );		\
! }	\
! 												\
! static int											\
! p_isnot##type(TParser *prs) {									\
! 	return !p_is##type(prs);								\
  }
  
  
  
  /* p_iseq should be used only for ascii symbols */
  
--- 80,178 ----
  
  /*
   * defining support function, equvalent is* macroses, but
!  * working with any possible encodings and locales. Note,
!  * that with multibyte encoding and C-locale isw* function may fail
!  * or give wrong result. Note 2: multibyte encoding and C-locale 
!  * often are used for Asian languages.
   */
  
  #ifdef TS_USE_WIDE
  
! #define p_iswhat(type)														\
! static int																	\
! p_is##type(TParser *prs) {													\
! 	Assert( prs->state );													\
! 	if ( prs->usewide )														\
! 	{																		\
! 		if ( lc_ctype_is_c() )												\
! 			return is##type( 0xff & *( prs->wstr + prs->state->poschar) );	\
! 																			\
! 		return isw##type( *(wint_t*)( prs->wstr + prs->state->poschar ) );	\
! 	}																		\
! 																			\
! 	return is##type( *(unsigned char*)( prs->str + prs->state->posbyte ) );	\
! }																			\
! 																			\
! static int																	\
! p_isnot##type(TParser *prs) {												\
! 	return !p_is##type(prs);												\
  }
  
+ static int 
+ p_isalnum(TParser *prs)
+ {
+ 	Assert( prs->state );
+ 
+ 	if (prs->usewide)
+ 	{
+ 		if (lc_ctype_is_c())
+ 		{
+ 			unsigned int c = *(unsigned int*)(prs->wstr + prs->state->poschar);
+ 
+ 			/*
+ 			 * any non-ascii symbol with multibyte encoding
+ 			 * with C-locale is an alpha character
+ 			 */
+ 			if ( c > 0x7f )
+ 				return 1;
+ 
+ 			return isalnum(0xff & c);
+ 		}
+ 
+ 		return iswalnum( (wint_t)*( prs->wstr + prs->state->poschar));
+ 	}
+ 
+ 	return isalnum( *(unsigned char*)( prs->str + prs->state->posbyte ));
+ }
  
+ static int
+ p_isnotalnum(TParser *prs)
+ {
+ 	return !p_isalnum(prs);
+ }
+ 
+ static int 
+ p_isalpha(TParser *prs)
+ {
+ 	Assert( prs->state );
+ 
+ 	if (prs->usewide)
+ 	{
+ 		if (lc_ctype_is_c())
+ 		{
+ 			unsigned int c = *(prs->wstr + prs->state->poschar);
+ 
+ 			/*
+ 			 * any non-ascii symbol with multibyte encoding
+ 			 * with C-locale is an alpha character
+ 			 */
+ 			if ( c > 0x7f )
+ 				return 1;
+ 
+ 			return isalpha(0xff & c);
+ 		}
+ 
+ 		return iswalpha( (wint_t)*( prs->wstr + prs->state->poschar));
+ 	}
+ 
+ 	return isalpha( *(unsigned char*)( prs->str + prs->state->posbyte ));
+ }
+ 
+ static int
+ p_isnotalpha(TParser *prs)
+ {
+ 	return !p_isalpha(prs);
+ }
  
  /* p_iseq should be used only for ascii symbols */
  
***************
*** 111,128 ****
  	Assert(prs->state);
  	return ((prs->state->charlen == 1 && *(prs->str + prs->state->posbyte) == c)) ? 1 : 0;
  }
  #else							/* TS_USE_WIDE */
  
! #define p_iswhat(type)										\
! static int											\
! p_is##type(TParser *prs) {									\
! 	Assert( prs->state );									\
! 	return is##type( (unsigned char)*( prs->str + prs->state->posbyte ) );			\
! }	\
! 												\
! static int											\
! p_isnot##type(TParser *prs) {									\
! 	return !p_is##type(prs);								\
  }
  
  
--- 182,200 ----
  	Assert(prs->state);
  	return ((prs->state->charlen == 1 && *(prs->str + prs->state->posbyte) == c)) ? 1 : 0;
  }
+ 
  #else							/* TS_USE_WIDE */
  
! #define p_iswhat(type)														\
! static int																	\
! p_is##type(TParser *prs) {													\
! 	Assert( prs->state );													\
! 	return is##type( (unsigned char)*( prs->str + prs->state->posbyte ) );	\
! }																			\
! 																			\
! static int																	\
! p_isnot##type(TParser *prs) {												\
! 	return !p_is##type(prs);												\
  }
  
  
***************
*** 132,141 ****
  	Assert(prs->state);
  	return (*(prs->str + prs->state->posbyte) == c) ? 1 : 0;
  }
- #endif   /* TS_USE_WIDE */
  
  p_iswhat(alnum)
  p_iswhat(alpha)
  p_iswhat(digit)
  p_iswhat(lower)
  p_iswhat(print)
--- 204,215 ----
  	Assert(prs->state);
  	return (*(prs->str + prs->state->posbyte) == c) ? 1 : 0;
  }
  
  p_iswhat(alnum)
  p_iswhat(alpha)
+ 
+ #endif   /* TS_USE_WIDE */
+ 
  p_iswhat(digit)
  p_iswhat(lower)
  p_iswhat(print)
#9Tatsuo Ishii
ishii@postgresql.org
In reply to: Teodor Sigaev (#8)
Re: Request for review: tsearch2 patch

Yeah, it's a workaround. Since there's no concept other than
alpha/numeric/latin in tsearch2, Asian characters have to fall into
one of them.

Ok, I see.

Please test the attached patch - if it is good then I'll commit it on Monday to the
HEAD and 8.2 branches.

I have tested on a Linux box running PostgreSQL 8.2.1 (C locale,
EUC_JP encoding), and it worked great!

BTW, is your patch supposed to work with PostgreSQL 8.1?
--
Tatsuo Ishii
SRA OSS, Inc. Japan

Show quoted text

PS. Magnus, may I ask you to test under Windows? Thank you.

--
Teodor Sigaev E-mail: teodor@sigaev.ru
WWW: http://www.sigaev.ru/