Request for review: tsearch2 patch
Hi,
Here are patches against tsearch2 with CVS head. Currently tsearch2
does not work with multibyte encoding which uses C locale. These
patches are intended to solve the problem by using PostgreSQL in-house
multibyte function instead of mbstowcs which does not work with C
locale. Also iswalpha etc. will not be called in case of C locale
since they are not working with it. Tested with the EUC_JP encoding
(should be working with any multibyte encodings). Existing single-byte
encodings should not be broken by the patches, though I did not test them.
--
Tatsuo Ishii
SRA OSS, Inc. Japan
Attachments:
tsearch2.patchtext/plain; charset=us-asciiDownload
Index: ts_locale.c
===================================================================
RCS file: /cvsroot/pgsql/contrib/tsearch2/ts_locale.c,v
retrieving revision 1.7
diff -c -r1.7 ts_locale.c
*** ts_locale.c 20 Nov 2006 14:03:30 -0000 1.7
--- ts_locale.c 1 Jan 2007 12:22:50 -0000
***************
*** 63,68 ****
--- 63,101 ----
return mbstowcs(to, from, len);
}
+
+ #else /* WIN32 */
+
+ size_t
+ char2wchar(wchar_t *to, const char *from, size_t len)
+ {
+ wchar_t *result;
+ size_t n;
+
+ if (to == NULL)
+ return 0;
+
+ if (lc_ctype_is_c)
+ {
+ /* allocate neccesary memory for "to" including NULL terminate */
+ result = (wchar_t *)palloc((len+1)*sizeof(wchar_t));
+
+ /* do the conversion */
+ n = (size_t)pg_mb2wchar_with_len(from, (pg_wchar *)result, len);
+ if (n > 0)
+ {
+ /* store the result */
+ if (n > len)
+ n = len;
+ memcpy(to, result, n*sizeof(wchar_t));
+ pfree(result);
+ *(to + n) = '\0';
+ }
+ return n;
+ }
+ return mbstowcs(to, from, len);
+ }
+
#endif /* WIN32 */
int
***************
*** 70,75 ****
--- 103,113 ----
{
wchar_t character;
+ if (lc_ctype_is_c)
+ {
+ return isalpha(TOUCHAR(ptr));
+ }
+
char2wchar(&character, ptr, 1);
return iswalpha((wint_t) character);
***************
*** 80,85 ****
--- 118,128 ----
{
wchar_t character;
+ if (lc_ctype_is_c)
+ {
+ return isprint(TOUCHAR(ptr));
+ }
+
char2wchar(&character, ptr, 1);
return iswprint((wint_t) character);
***************
*** 126,132 ****
if ( wlen < 0 )
ereport(ERROR,
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
! errmsg("transalation failed from server encoding to wchar_t")));
Assert(wlen<=len);
wstr[wlen] = 0;
--- 169,175 ----
if ( wlen < 0 )
ereport(ERROR,
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
! errmsg("translation failed from server encoding to wchar_t")));
Assert(wlen<=len);
wstr[wlen] = 0;
***************
*** 152,158 ****
if ( wlen < 0 )
ereport(ERROR,
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
! errmsg("transalation failed from wchar_t to server encoding %d", errno)));
Assert(wlen<=len);
out[wlen]='\0';
}
--- 195,201 ----
if ( wlen < 0 )
ereport(ERROR,
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
! errmsg("translation failed from wchar_t to server encoding %d", errno)));
Assert(wlen<=len);
out[wlen]='\0';
}
Index: ts_locale.h
===================================================================
RCS file: /cvsroot/pgsql/contrib/tsearch2/ts_locale.h,v
retrieving revision 1.7
diff -c -r1.7 ts_locale.h
*** ts_locale.h 4 Oct 2006 00:29:47 -0000 1.7
--- ts_locale.h 1 Jan 2007 12:22:50 -0000
***************
*** 38,45 ****
#else /* WIN32 */
/* correct mbstowcs */
- #define char2wchar mbstowcs
#define wchar2char wcstombs
#endif /* WIN32 */
#define t_isdigit(x) ( pg_mblen(x)==1 && isdigit( TOUCHAR(x) ) )
--- 38,46 ----
#else /* WIN32 */
/* correct mbstowcs */
#define wchar2char wcstombs
+ size_t char2wchar(wchar_t *to, const char *from, size_t len);
+
#endif /* WIN32 */
#define t_isdigit(x) ( pg_mblen(x)==1 && isdigit( TOUCHAR(x) ) )
***************
*** 54,59 ****
--- 55,61 ----
* t_iseq() should be called only for ASCII symbols
*/
#define t_iseq(x,c) ( (pg_mblen(x)==1) ? ( TOUCHAR(x) == ((unsigned char)(c)) ) : false )
+ /*#define t_iseq(x,c) ( TOUCHAR(x) == ((unsigned char)(c)))*/
#define COPYCHAR(d,s) do { \
int lll = pg_mblen( s ); \
Index: wordparser/parser.c
===================================================================
RCS file: /cvsroot/pgsql/contrib/tsearch2/wordparser/parser.c,v
retrieving revision 1.11
diff -c -r1.11 parser.c
*** wordparser/parser.c 4 Oct 2006 00:29:47 -0000 1.11
--- wordparser/parser.c 1 Jan 2007 12:22:51 -0000
***************
*** 44,52 ****
* Some operating systems fail with multi-byte encodings and a C locale.
* Also, for a C locale there is no need to process as multibyte. From
* backend/utils/adt/oracle_compat.c Teodor
*/
! if (prs->charmaxlen > 1 && !lc_ctype_is_c())
{
prs->usewide = true;
prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * prs->lenstr);
--- 44,54 ----
* Some operating systems fail with multi-byte encodings and a C locale.
* Also, for a C locale there is no need to process as multibyte. From
* backend/utils/adt/oracle_compat.c Teodor
+ *
+ * This is wrong assumption. even if locale is C, multibyte is necceary.
*/
! if (prs->charmaxlen > 1)
{
prs->usewide = true;
prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * prs->lenstr);
***************
*** 92,98 ****
static int \
p_is##type(TParser *prs) { \
Assert( prs->state ); \
! return ( ( prs->usewide ) ? isw##type( (wint_t)*( prs->wstr + prs->state->poschar ) ) : \
is##type( (unsigned char)*( prs->str + prs->state->posbyte ) ) ); \
} \
\
--- 94,102 ----
static int \
p_is##type(TParser *prs) { \
Assert( prs->state ); \
! return ( ( prs->usewide ) ? \
! (lc_ctype_is_c? is##type( 0xff & *( prs->wstr + prs->state->poschar)): \
! isw##type( (wint_t)*( prs->wstr + prs->state->poschar))): \
is##type( (unsigned char)*( prs->str + prs->state->posbyte ) ) ); \
} \
\
***************
*** 134,141 ****
}
#endif /* TS_USE_WIDE */
! p_iswhat(alnum)
! p_iswhat(alpha)
p_iswhat(digit)
p_iswhat(lower)
p_iswhat(print)
--- 138,197 ----
}
#endif /* TS_USE_WIDE */
! static int p_isalnum(TParser *prs) {
! Assert( prs->state );
!
! if (prs->usewide)
! {
! unsigned int c;
!
! c = *(prs->wstr + prs->state->poschar);
!
! if (lc_ctype_is_c)
! {
! if (c > 0x7f)
! return 1;
! return isalnum(0xff & c);
! }
! else
! return iswalnum( (wint_t)*( prs->wstr + prs->state->poschar));
! }
! else
! return isalnum( (unsigned char)*( prs->str + prs->state->posbyte ));
! }
!
! static int p_isnotalnum(TParser *prs)
! {
! return !p_isalnum(prs);
! }
!
! static int p_isalpha(TParser *prs) {
! Assert( prs->state );
!
! if (prs->usewide)
! {
! unsigned int c;
!
! c = *(prs->wstr + prs->state->poschar);
!
! if (lc_ctype_is_c)
! {
! if (c > 0x7f)
! return 1;
! return isalpha(0xff & c);
! }
! else
! return iswalpha( (wint_t)*( prs->wstr + prs->state->poschar));
! }
! else
! return isalpha( (unsigned char)*( prs->str + prs->state->posbyte ));
! }
!
! static int p_isnotalpha(TParser *prs)
! {
! return !p_isalpha(prs);
! }
!
p_iswhat(digit)
p_iswhat(lower)
p_iswhat(print)
I have tested with a locale-enabled environment and found a bug. Included
is the new version of the patches.
Teodor, Oleg, what do you think about these patches?
If ok, shall I commit to CVS head?
--
Tatsuo Ishii
SRA OSS, Inc. Japan
Show quoted text
Hi,
Here are patches against tsearch2 with CVS head. Currently tsearch2
does not work with multibyte encoding which uses C locale. These
patches are intended to solve the problem by using PostgreSQL in-house
multibyte function instead of mbstowcs which does not work with C
locale. Also iswalpha etc. will not be called in case of C locale
since they are not working with it. Tested with the EUC_JP encoding
(should be working with any multibye encodings). Existing single byte
encodings should not be broken by the patches, I did not test though.
--
Tatsuo Ishii
SRA OSS, Inc. Japan
Attachments:
tsearch.patchtext/plain; charset=us-asciiDownload
Index: ts_locale.c
===================================================================
RCS file: /cvsroot/pgsql/contrib/tsearch2/ts_locale.c,v
retrieving revision 1.7
diff -c -r1.7 ts_locale.c
*** ts_locale.c 20 Nov 2006 14:03:30 -0000 1.7
--- ts_locale.c 4 Jan 2007 12:16:00 -0000
***************
*** 63,68 ****
--- 63,101 ----
return mbstowcs(to, from, len);
}
+
+ #else /* WIN32 */
+
+ size_t
+ char2wchar(wchar_t *to, const char *from, size_t len)
+ {
+ wchar_t *result;
+ size_t n;
+
+ if (to == NULL)
+ return 0;
+
+ if (lc_ctype_is_c())
+ {
+ /* allocate neccesary memory for "to" including NULL terminate */
+ result = (wchar_t *)palloc((len+1)*sizeof(wchar_t));
+
+ /* do the conversion */
+ n = (size_t)pg_mb2wchar_with_len(from, (pg_wchar *)result, len);
+ if (n > 0)
+ {
+ /* store the result */
+ if (n > len)
+ n = len;
+ memcpy(to, result, n*sizeof(wchar_t));
+ pfree(result);
+ *(to + n) = '\0';
+ }
+ return n;
+ }
+ return mbstowcs(to, from, len);
+ }
+
#endif /* WIN32 */
int
***************
*** 70,75 ****
--- 103,113 ----
{
wchar_t character;
+ if (lc_ctype_is_c())
+ {
+ return isalpha(TOUCHAR(ptr));
+ }
+
char2wchar(&character, ptr, 1);
return iswalpha((wint_t) character);
***************
*** 80,85 ****
--- 118,128 ----
{
wchar_t character;
+ if (lc_ctype_is_c())
+ {
+ return isprint(TOUCHAR(ptr));
+ }
+
char2wchar(&character, ptr, 1);
return iswprint((wint_t) character);
***************
*** 126,132 ****
if ( wlen < 0 )
ereport(ERROR,
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
! errmsg("transalation failed from server encoding to wchar_t")));
Assert(wlen<=len);
wstr[wlen] = 0;
--- 169,175 ----
if ( wlen < 0 )
ereport(ERROR,
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
! errmsg("translation failed from server encoding to wchar_t")));
Assert(wlen<=len);
wstr[wlen] = 0;
***************
*** 152,158 ****
if ( wlen < 0 )
ereport(ERROR,
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
! errmsg("transalation failed from wchar_t to server encoding %d", errno)));
Assert(wlen<=len);
out[wlen]='\0';
}
--- 195,201 ----
if ( wlen < 0 )
ereport(ERROR,
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
! errmsg("translation failed from wchar_t to server encoding %d", errno)));
Assert(wlen<=len);
out[wlen]='\0';
}
Index: ts_locale.h
===================================================================
RCS file: /cvsroot/pgsql/contrib/tsearch2/ts_locale.h,v
retrieving revision 1.7
diff -c -r1.7 ts_locale.h
*** ts_locale.h 4 Oct 2006 00:29:47 -0000 1.7
--- ts_locale.h 4 Jan 2007 12:16:00 -0000
***************
*** 38,45 ****
#else /* WIN32 */
/* correct mbstowcs */
- #define char2wchar mbstowcs
#define wchar2char wcstombs
#endif /* WIN32 */
#define t_isdigit(x) ( pg_mblen(x)==1 && isdigit( TOUCHAR(x) ) )
--- 38,46 ----
#else /* WIN32 */
/* correct mbstowcs */
#define wchar2char wcstombs
+ size_t char2wchar(wchar_t *to, const char *from, size_t len);
+
#endif /* WIN32 */
#define t_isdigit(x) ( pg_mblen(x)==1 && isdigit( TOUCHAR(x) ) )
***************
*** 54,59 ****
--- 55,61 ----
* t_iseq() should be called only for ASCII symbols
*/
#define t_iseq(x,c) ( (pg_mblen(x)==1) ? ( TOUCHAR(x) == ((unsigned char)(c)) ) : false )
+ /*#define t_iseq(x,c) ( TOUCHAR(x) == ((unsigned char)(c)))*/
#define COPYCHAR(d,s) do { \
int lll = pg_mblen( s ); \
Index: wordparser/parser.c
===================================================================
RCS file: /cvsroot/pgsql/contrib/tsearch2/wordparser/parser.c,v
retrieving revision 1.11
diff -c -r1.11 parser.c
*** wordparser/parser.c 4 Oct 2006 00:29:47 -0000 1.11
--- wordparser/parser.c 4 Jan 2007 12:16:01 -0000
***************
*** 44,52 ****
* Some operating systems fail with multi-byte encodings and a C locale.
* Also, for a C locale there is no need to process as multibyte. From
* backend/utils/adt/oracle_compat.c Teodor
*/
! if (prs->charmaxlen > 1 && !lc_ctype_is_c())
{
prs->usewide = true;
prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * prs->lenstr);
--- 44,54 ----
* Some operating systems fail with multi-byte encodings and a C locale.
* Also, for a C locale there is no need to process as multibyte. From
* backend/utils/adt/oracle_compat.c Teodor
+ *
+ * This is wrong assumption. even if locale is C, multibyte is necceary.
*/
! if (prs->charmaxlen > 1)
{
prs->usewide = true;
prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * prs->lenstr);
***************
*** 92,98 ****
static int \
p_is##type(TParser *prs) { \
Assert( prs->state ); \
! return ( ( prs->usewide ) ? isw##type( (wint_t)*( prs->wstr + prs->state->poschar ) ) : \
is##type( (unsigned char)*( prs->str + prs->state->posbyte ) ) ); \
} \
\
--- 94,102 ----
static int \
p_is##type(TParser *prs) { \
Assert( prs->state ); \
! return ( ( prs->usewide ) ? \
! (lc_ctype_is_c()? is##type( 0xff & *( prs->wstr + prs->state->poschar)): \
! isw##type( (wint_t)*( prs->wstr + prs->state->poschar))): \
is##type( (unsigned char)*( prs->str + prs->state->posbyte ) ) ); \
} \
\
***************
*** 134,141 ****
}
#endif /* TS_USE_WIDE */
! p_iswhat(alnum)
! p_iswhat(alpha)
p_iswhat(digit)
p_iswhat(lower)
p_iswhat(print)
--- 138,197 ----
}
#endif /* TS_USE_WIDE */
! static int p_isalnum(TParser *prs) {
! Assert( prs->state );
!
! if (prs->usewide)
! {
! unsigned int c;
!
! c = *(prs->wstr + prs->state->poschar);
!
! if (lc_ctype_is_c())
! {
! if (c > 0x7f)
! return 1;
! return isalnum(0xff & c);
! }
! else
! return iswalnum( (wint_t)*( prs->wstr + prs->state->poschar));
! }
! else
! return isalnum( (unsigned char)*( prs->str + prs->state->posbyte ));
! }
!
! static int p_isnotalnum(TParser *prs)
! {
! return !p_isalnum(prs);
! }
!
! static int p_isalpha(TParser *prs) {
! Assert( prs->state );
!
! if (prs->usewide)
! {
! unsigned int c;
!
! c = *(prs->wstr + prs->state->poschar);
!
! if (lc_ctype_is_c())
! {
! if (c > 0x7f)
! return 1;
! return isalpha(0xff & c);
! }
! else
! return iswalpha( (wint_t)*( prs->wstr + prs->state->poschar));
! }
! else
! return isalpha( (unsigned char)*( prs->str + prs->state->posbyte ));
! }
!
! static int p_isnotalpha(TParser *prs)
! {
! return !p_isalpha(prs);
! }
!
p_iswhat(digit)
p_iswhat(lower)
p_iswhat(print)
Sorry for the delay, I was on holiday :)
Did you test the patch on the Windows platform?
Tatsuo Ishii wrote:
I have tested with a locale-enabled environment and found a bug. Included
is the new version of the patches.
Teodor, Oleg, what do you think about these patches?
If ok, shall I commit to CVS head?
--
Tatsuo Ishii
SRA OSS, Inc. Japan
Hi,
Here are patches against tsearch2 with CVS head. Currently tsearch2
does not work with multibyte encoding which uses C locale. These
patches are intended to solve the problem by using PostgreSQL in-house
multibyte function instead of mbstowcs which does not work with C
locale. Also iswalpha etc. will not be called in case of C locale
since they are not working with it. Tested with the EUC_JP encoding
(should be working with any multibye encodings). Existing single byte
encodings should not be broken by the patches, I did not test though.
--
Tatsuo Ishii
SRA OSS, Inc. Japan------------------------------------------------------------------------
Index: ts_locale.c =================================================================== RCS file: /cvsroot/pgsql/contrib/tsearch2/ts_locale.c,v retrieving revision 1.7 diff -c -r1.7 ts_locale.c *** ts_locale.c 20 Nov 2006 14:03:30 -0000 1.7 --- ts_locale.c 4 Jan 2007 12:16:00 -0000 *************** *** 63,68 **** --- 63,101 ----return mbstowcs(to, from, len); } + + #else /* WIN32 */ + + size_t + char2wchar(wchar_t *to, const char *from, size_t len) + { + wchar_t *result; + size_t n; + + if (to == NULL) + return 0; + + if (lc_ctype_is_c()) + { + /* allocate neccesary memory for "to" including NULL terminate */ + result = (wchar_t *)palloc((len+1)*sizeof(wchar_t)); + + /* do the conversion */ + n = (size_t)pg_mb2wchar_with_len(from, (pg_wchar *)result, len); + if (n > 0) + { + /* store the result */ + if (n > len) + n = len; + memcpy(to, result, n*sizeof(wchar_t)); + pfree(result); + *(to + n) = '\0'; + } + return n; + } + return mbstowcs(to, from, len); + } + #endif /* WIN32 */int *************** *** 70,75 **** --- 103,113 ---- { wchar_t character;+ if (lc_ctype_is_c()) + { + return isalpha(TOUCHAR(ptr)); + } + char2wchar(&character, ptr, 1);return iswalpha((wint_t) character); *************** *** 80,85 **** --- 118,128 ---- { wchar_t character;+ if (lc_ctype_is_c()) + { + return isprint(TOUCHAR(ptr)); + } + char2wchar(&character, ptr, 1);return iswprint((wint_t) character);
***************
*** 126,132 ****
if ( wlen < 0 )
ereport(ERROR,
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
! errmsg("transalation failed from server encoding to wchar_t")));Assert(wlen<=len); wstr[wlen] = 0; --- 169,175 ---- if ( wlen < 0 ) ereport(ERROR, (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), ! errmsg("translation failed from server encoding to wchar_t")));Assert(wlen<=len); wstr[wlen] = 0; *************** *** 152,158 **** if ( wlen < 0 ) ereport(ERROR, (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), ! errmsg("transalation failed from wchar_t to server encoding %d", errno))); Assert(wlen<=len); out[wlen]='\0'; } --- 195,201 ---- if ( wlen < 0 ) ereport(ERROR, (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), ! errmsg("translation failed from wchar_t to server encoding %d", errno))); Assert(wlen<=len); out[wlen]='\0'; } Index: ts_locale.h =================================================================== RCS file: /cvsroot/pgsql/contrib/tsearch2/ts_locale.h,v retrieving revision 1.7 diff -c -r1.7 ts_locale.h *** ts_locale.h 4 Oct 2006 00:29:47 -0000 1.7 --- ts_locale.h 4 Jan 2007 12:16:00 -0000 *************** *** 38,45 **** #else /* WIN32 *//* correct mbstowcs */
- #define char2wchar mbstowcs
#define wchar2char wcstombs
#endif /* WIN32 */#define t_isdigit(x) ( pg_mblen(x)==1 && isdigit( TOUCHAR(x) ) ) --- 38,46 ---- #else /* WIN32 *//* correct mbstowcs */ #define wchar2char wcstombs + size_t char2wchar(wchar_t *to, const char *from, size_t len); + #endif /* WIN32 */#define t_isdigit(x) ( pg_mblen(x)==1 && isdigit( TOUCHAR(x) ) ) *************** *** 54,59 **** --- 55,61 ---- * t_iseq() should be called only for ASCII symbols */ #define t_iseq(x,c) ( (pg_mblen(x)==1) ? ( TOUCHAR(x) == ((unsigned char)(c)) ) : false ) + /*#define t_iseq(x,c) ( TOUCHAR(x) == ((unsigned char)(c)))*/#define COPYCHAR(d,s) do { \ int lll = pg_mblen( s ); \ Index: wordparser/parser.c =================================================================== RCS file: /cvsroot/pgsql/contrib/tsearch2/wordparser/parser.c,v retrieving revision 1.11 diff -c -r1.11 parser.c *** wordparser/parser.c 4 Oct 2006 00:29:47 -0000 1.11 --- wordparser/parser.c 4 Jan 2007 12:16:01 -0000 *************** *** 44,52 **** * Some operating systems fail with multi-byte encodings and a C locale. * Also, for a C locale there is no need to process as multibyte. From * backend/utils/adt/oracle_compat.c Teodor */! if (prs->charmaxlen > 1 && !lc_ctype_is_c()) { prs->usewide = true; prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * prs->lenstr); --- 44,54 ---- * Some operating systems fail with multi-byte encodings and a C locale. * Also, for a C locale there is no need to process as multibyte. From * backend/utils/adt/oracle_compat.c Teodor + * + * This is wrong assumption. even if locale is C, multibyte is necceary. */! if (prs->charmaxlen > 1) { prs->usewide = true; prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * prs->lenstr); *************** *** 92,98 **** static int \ p_is##type(TParser *prs) { \ Assert( prs->state ); \ ! return ( ( prs->usewide ) ? 
isw##type( (wint_t)*( prs->wstr + prs->state->poschar ) ) : \ is##type( (unsigned char)*( prs->str + prs->state->posbyte ) ) ); \ } \ \ --- 94,102 ---- static int \ p_is##type(TParser *prs) { \ Assert( prs->state ); \ ! return ( ( prs->usewide ) ? \ ! (lc_ctype_is_c()? is##type( 0xff & *( prs->wstr + prs->state->poschar)): \ ! isw##type( (wint_t)*( prs->wstr + prs->state->poschar))): \ is##type( (unsigned char)*( prs->str + prs->state->posbyte ) ) ); \ } \ \ *************** *** 134,141 **** } #endif /* TS_USE_WIDE */! p_iswhat(alnum) ! p_iswhat(alpha) p_iswhat(digit) p_iswhat(lower) p_iswhat(print) --- 138,197 ---- } #endif /* TS_USE_WIDE */! static int p_isalnum(TParser *prs) {
! Assert( prs->state );
!
! if (prs->usewide)
! {
! unsigned int c;
!
! c = *(prs->wstr + prs->state->poschar);
!
! if (lc_ctype_is_c())
! {
! if (c > 0x7f)
! return 1;
! return isalnum(0xff & c);
! }
! else
! return iswalnum( (wint_t)*( prs->wstr + prs->state->poschar));
! }
! else
! return isalnum( (unsigned char)*( prs->str + prs->state->posbyte ));
! }
!
! static int p_isnotalnum(TParser *prs)
! {
! return !p_isalnum(prs);
! }
!
! static int p_isalpha(TParser *prs) {
! Assert( prs->state );
!
! if (prs->usewide)
! {
! unsigned int c;
!
! c = *(prs->wstr + prs->state->poschar);
!
! if (lc_ctype_is_c())
! {
! if (c > 0x7f)
! return 1;
! return isalpha(0xff & c);
! }
! else
! return iswalpha( (wint_t)*( prs->wstr + prs->state->poschar));
! }
! else
! return isalpha( (unsigned char)*( prs->str + prs->state->posbyte ));
! }
!
! static int p_isnotalpha(TParser *prs)
! {
! return !p_isalpha(prs);
! }
!
p_iswhat(digit)
p_iswhat(lower)
p_iswhat(print)------------------------------------------------------------------------
---------------------------(end of broadcast)---------------------------
TIP 9: In versions below 8.0, the planner will ignore your desire to
choose an index scan if your joining column's datatypes do not
match
--
Teodor Sigaev E-mail: teodor@sigaev.ru
WWW: http://www.sigaev.ru/
Sorry for delay, I was on holidays :)
Did you test patch on Windows platform?
No. I myself do not use the Windows platform.
Do you have any concerns about Windows regarding my patches?
--
Tatsuo Ishii
SRA OSS, Inc. Japan
Show quoted text
Tatsuo Ishii wrote:
I have tested with local-enabled environment and found a bug. Included
is the new version of patches.Teodor, Oleg, what do you think about these patches?
If ok, shall I commit to CVS head?
--
Tatsuo Ishii
SRA OSS, Inc. JapanHi,
Here are patches against tsearch2 with CVS head. Currently tsearch2
does not work with multibyte encoding which uses C locale. These
patches are intended to solve the problem by using PostgreSQL in-house
multibyte function instead of mbstowcs which does not work with C
locale. Also iswalpha etc. will not be called in case of C locale
since they are not working with it. Tested with the EUC_JP encoding
(should be working with any multibye encodings). Existing single byte
encodings should not be broken by the patches, I did not test though.
--
Tatsuo Ishii
SRA OSS, Inc. Japan------------------------------------------------------------------------
Index: ts_locale.c =================================================================== RCS file: /cvsroot/pgsql/contrib/tsearch2/ts_locale.c,v retrieving revision 1.7 diff -c -r1.7 ts_locale.c *** ts_locale.c 20 Nov 2006 14:03:30 -0000 1.7 --- ts_locale.c 4 Jan 2007 12:16:00 -0000 *************** *** 63,68 **** --- 63,101 ----return mbstowcs(to, from, len); } + + #else /* WIN32 */ + + size_t + char2wchar(wchar_t *to, const char *from, size_t len) + { + wchar_t *result; + size_t n; + + if (to == NULL) + return 0; + + if (lc_ctype_is_c()) + { + /* allocate neccesary memory for "to" including NULL terminate */ + result = (wchar_t *)palloc((len+1)*sizeof(wchar_t)); + + /* do the conversion */ + n = (size_t)pg_mb2wchar_with_len(from, (pg_wchar *)result, len); + if (n > 0) + { + /* store the result */ + if (n > len) + n = len; + memcpy(to, result, n*sizeof(wchar_t)); + pfree(result); + *(to + n) = '\0'; + } + return n; + } + return mbstowcs(to, from, len); + } + #endif /* WIN32 */int *************** *** 70,75 **** --- 103,113 ---- { wchar_t character;+ if (lc_ctype_is_c()) + { + return isalpha(TOUCHAR(ptr)); + } + char2wchar(&character, ptr, 1);return iswalpha((wint_t) character); *************** *** 80,85 **** --- 118,128 ---- { wchar_t character;+ if (lc_ctype_is_c()) + { + return isprint(TOUCHAR(ptr)); + } + char2wchar(&character, ptr, 1);return iswprint((wint_t) character);
***************
*** 126,132 ****
if ( wlen < 0 )
ereport(ERROR,
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
! errmsg("transalation failed from server encoding to wchar_t")));Assert(wlen<=len); wstr[wlen] = 0; --- 169,175 ---- if ( wlen < 0 ) ereport(ERROR, (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), ! errmsg("translation failed from server encoding to wchar_t")));Assert(wlen<=len); wstr[wlen] = 0; *************** *** 152,158 **** if ( wlen < 0 ) ereport(ERROR, (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), ! errmsg("transalation failed from wchar_t to server encoding %d", errno))); Assert(wlen<=len); out[wlen]='\0'; } --- 195,201 ---- if ( wlen < 0 ) ereport(ERROR, (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), ! errmsg("translation failed from wchar_t to server encoding %d", errno))); Assert(wlen<=len); out[wlen]='\0'; } Index: ts_locale.h =================================================================== RCS file: /cvsroot/pgsql/contrib/tsearch2/ts_locale.h,v retrieving revision 1.7 diff -c -r1.7 ts_locale.h *** ts_locale.h 4 Oct 2006 00:29:47 -0000 1.7 --- ts_locale.h 4 Jan 2007 12:16:00 -0000 *************** *** 38,45 **** #else /* WIN32 *//* correct mbstowcs */
- #define char2wchar mbstowcs
#define wchar2char wcstombs
#endif /* WIN32 */#define t_isdigit(x) ( pg_mblen(x)==1 && isdigit( TOUCHAR(x) ) ) --- 38,46 ---- #else /* WIN32 *//* correct mbstowcs */ #define wchar2char wcstombs + size_t char2wchar(wchar_t *to, const char *from, size_t len); + #endif /* WIN32 */#define t_isdigit(x) ( pg_mblen(x)==1 && isdigit( TOUCHAR(x) ) ) *************** *** 54,59 **** --- 55,61 ---- * t_iseq() should be called only for ASCII symbols */ #define t_iseq(x,c) ( (pg_mblen(x)==1) ? ( TOUCHAR(x) == ((unsigned char)(c)) ) : false ) + /*#define t_iseq(x,c) ( TOUCHAR(x) == ((unsigned char)(c)))*/#define COPYCHAR(d,s) do { \ int lll = pg_mblen( s ); \ Index: wordparser/parser.c =================================================================== RCS file: /cvsroot/pgsql/contrib/tsearch2/wordparser/parser.c,v retrieving revision 1.11 diff -c -r1.11 parser.c *** wordparser/parser.c 4 Oct 2006 00:29:47 -0000 1.11 --- wordparser/parser.c 4 Jan 2007 12:16:01 -0000 *************** *** 44,52 **** * Some operating systems fail with multi-byte encodings and a C locale. * Also, for a C locale there is no need to process as multibyte. From * backend/utils/adt/oracle_compat.c Teodor */! if (prs->charmaxlen > 1 && !lc_ctype_is_c()) { prs->usewide = true; prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * prs->lenstr); --- 44,54 ---- * Some operating systems fail with multi-byte encodings and a C locale. * Also, for a C locale there is no need to process as multibyte. From * backend/utils/adt/oracle_compat.c Teodor + * + * This is wrong assumption. even if locale is C, multibyte is necceary. */! if (prs->charmaxlen > 1) { prs->usewide = true; prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * prs->lenstr); *************** *** 92,98 **** static int \ p_is##type(TParser *prs) { \ Assert( prs->state ); \ ! return ( ( prs->usewide ) ? 
isw##type( (wint_t)*( prs->wstr + prs->state->poschar ) ) : \ is##type( (unsigned char)*( prs->str + prs->state->posbyte ) ) ); \ } \ \ --- 94,102 ---- static int \ p_is##type(TParser *prs) { \ Assert( prs->state ); \ ! return ( ( prs->usewide ) ? \ ! (lc_ctype_is_c()? is##type( 0xff & *( prs->wstr + prs->state->poschar)): \ ! isw##type( (wint_t)*( prs->wstr + prs->state->poschar))): \ is##type( (unsigned char)*( prs->str + prs->state->posbyte ) ) ); \ } \ \ *************** *** 134,141 **** } #endif /* TS_USE_WIDE */! p_iswhat(alnum) ! p_iswhat(alpha) p_iswhat(digit) p_iswhat(lower) p_iswhat(print) --- 138,197 ---- } #endif /* TS_USE_WIDE */! static int p_isalnum(TParser *prs) {
! Assert( prs->state );
!
! if (prs->usewide)
! {
! unsigned int c;
!
! c = *(prs->wstr + prs->state->poschar);
!
! if (lc_ctype_is_c())
! {
! if (c > 0x7f)
! return 1;
! return isalnum(0xff & c);
! }
! else
! return iswalnum( (wint_t)*( prs->wstr + prs->state->poschar));
! }
! else
! return isalnum( (unsigned char)*( prs->str + prs->state->posbyte ));
! }
!
! static int p_isnotalnum(TParser *prs)
! {
! return !p_isalnum(prs);
! }
!
! static int p_isalpha(TParser *prs) {
! Assert( prs->state );
!
! if (prs->usewide)
! {
! unsigned int c;
!
! c = *(prs->wstr + prs->state->poschar);
!
! if (lc_ctype_is_c())
! {
! if (c > 0x7f)
! return 1;
! return isalpha(0xff & c);
! }
! else
! return iswalpha( (wint_t)*( prs->wstr + prs->state->poschar));
! }
! else
! return isalpha( (unsigned char)*( prs->str + prs->state->posbyte ));
! }
!
! static int p_isnotalpha(TParser *prs)
! {
! return !p_isalpha(prs);
! }
!
p_iswhat(digit)
p_iswhat(lower)
p_iswhat(print)------------------------------------------------------------------------
---------------------------(end of broadcast)---------------------------
TIP 9: In versions below 8.0, the planner will ignore your desire to
choose an index scan if your joining column's datatypes do not
match--
Teodor Sigaev E-mail: teodor@sigaev.ru
WWW: http://www.sigaev.ru/
I have tested with local-enabled environment and found a bug. Included
is the new version of patches.
Your patch causes a crash in tsearch2's installcheck with 'initdb -E UTF8 --locale
C'; a simple way to reproduce:
# select to_tsquery('default', '''New York''');
server closed the connection unexpectedly
This probably means the server terminated abnormally
before or while processing the request.
The connection to the server was lost. Attempting reset: Failed.
! static int p_isalnum(TParser *prs) {
...
! if (lc_ctype_is_c())
! {
! if (c > 0x7f)
! return 1;
I have some doubts that any character greater than 0x7f is an alpha symbol.
Is it a simple assumption or a workaround?
--
Teodor Sigaev E-mail: teodor@sigaev.ru
WWW: http://www.sigaev.ru/
From: Teodor Sigaev <teodor@sigaev.ru>
Subject: Re: [HACKERS] Request for review: tsearch2 patch
Date: Wed, 10 Jan 2007 18:50:44 +0300
Message-ID: <45A50B54.6090608@sigaev.ru>
I have tested with a locale-enabled environment and found a bug. Included
is the new version of the patches.
Your patch causes a crash in tsearch2's installcheck with 'initdb -E UTF8 --locale
C'; a simple way to reproduce:
# select to_tsquery('default', '''New York''');
server closed the connection unexpectedly
This probably means the server terminated abnormally
before or while processing the request.
The connection to the server was lost. Attempting reset: Failed.
It seems it's a bug in the original tsearch2. Here is the patch.
------------------------------------------------------------------
*** wordparser/parser.c~ 2007-01-07 09:54:39.000000000 +0900
--- wordparser/parser.c 2007-01-11 10:33:41.000000000 +0900
***************
*** 51,57 ****
if (prs->charmaxlen > 1)
{
prs->usewide = true;
! prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * prs->lenstr);
prs->lenwstr = char2wchar(prs->wstr, prs->str, prs->lenstr);
}
else
--- 51,57 ----
if (prs->charmaxlen > 1)
{
prs->usewide = true;
! prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * (prs->lenstr+1));
prs->lenwstr = char2wchar(prs->wstr, prs->str, prs->lenstr);
}
else
------------------------------------------------------------------
! static int p_isalnum(TParser *prs) {
...
! if (lc_ctype_is_c())
! {
! if (c > 0x7f)
! return 1;
I have some doubts that any character greater than 0x7f is an alpha symbol.
Is it a simple assumption or a workaround?
Yeah, it's a workaround. Since there's no concept other than
alpha/numeric/latin in tsearch2, Asian characters have to fall into
one of them.
--
Tatsuo Ishii
SRA OSS, Inc. Japan
I have tested with local-enabled environment and found a bug. Included
is the new version of patches.Your patch causes crash on tsearch2's installcheck with 'initdb -E UTF8 --locale
C', simple way to reproduce:
# select to_tsquery('default', '''New York''');
server closed the connection unexpectedly
This probably means the server terminated abnormally
before or while processing the request.
The connection to the server was lost. Attempting reset: Failed.
It seems it's a bug with original tsearch2. Here is the patches.
------------------------------------------------------------------
*** wordparser/parser.c~ 2007-01-07 09:54:39.000000000 +0900
--- wordparser/parser.c 2007-01-11 10:33:41.000000000 +0900
***************
*** 51,57 ****
if (prs->charmaxlen > 1)
{
prs->usewide = true;
! prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * prs->lenstr);
prs->lenwstr = char2wchar(prs->wstr, prs->str, prs->lenstr);
}
else
--- 51,57 ----
if (prs->charmaxlen > 1)
{
prs->usewide = true;
! prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * (prs->lenstr+1));
prs->lenwstr = char2wchar(prs->wstr, prs->str, prs->lenstr);
}
else
------------------------------------------------------------------
! static int p_isalnum(TParser *prs) {
...
! if (lc_ctype_is_c())
! {
! if (c > 0x7f)
! return 1;I have some some doubts that any character greater than 0x7f is an alpha symbol.
Is it simple assumption or workaround?
Yeah, it's a workaround. Since there's no concept other than
alpha/numeric/latin in tsearch2, Asian characters have to be fall in
one of them.
--
Tatsuo Ishii
SRA OSS, Inc. Japan
Yeah, it's a workaround. Since there's no concept other than
alpha/numeric/latin in tsearch2, Asian characters have to be fall in
one of them.
Ok, I see.
Please test the attached patch - if it is good then I'll commit it on Monday to the HEAD
and 8.2 branches.
PS. Magnus, may I ask you to test under Windows? Thank you.
--
Teodor Sigaev E-mail: teodor@sigaev.ru
WWW: http://www.sigaev.ru/
Attachments:
tsearch2.patchtext/plain; name=tsearch2.patchDownload
diff -c -r -N ../tsearch2.orig/ts_locale.c ./ts_locale.c
*** ../tsearch2.orig/ts_locale.c Fri Jan 12 10:53:11 2007
--- ./ts_locale.c Fri Jan 12 18:10:27 2007
***************
*** 12,24 ****
size_t
wchar2char(char *to, const wchar_t *from, size_t len)
{
if (GetDatabaseEncoding() == PG_UTF8)
{
int r;
- if (len == 0)
- return 0;
-
r = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, len,
NULL, NULL);
--- 12,24 ----
size_t
wchar2char(char *to, const wchar_t *from, size_t len)
{
+ if (len == 0)
+ return 0;
+
if (GetDatabaseEncoding() == PG_UTF8)
{
int r;
r = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, len,
NULL, NULL);
***************
*** 34,50 ****
return wcstombs(to, from, len);
}
size_t
char2wchar(wchar_t *to, const char *from, size_t len)
{
if (GetDatabaseEncoding() == PG_UTF8)
{
int r;
- if (len == 0)
- return 0;
-
r = MultiByteToWideChar(CP_UTF8, 0, from, len, to, len);
if (!r)
--- 34,52 ----
return wcstombs(to, from, len);
}
+ #endif /* WIN32 */
size_t
char2wchar(wchar_t *to, const char *from, size_t len)
{
+ if (len == 0)
+ return 0;
+
+ #ifdef WIN32
if (GetDatabaseEncoding() == PG_UTF8)
{
int r;
r = MultiByteToWideChar(CP_UTF8, 0, from, len, to, len);
if (!r)
***************
*** 60,88 ****
return r;
}
return mbstowcs(to, from, len);
}
- #endif /* WIN32 */
int
_t_isalpha(const char *ptr)
{
! wchar_t character;
! char2wchar(&character, ptr, 1);
! return iswalpha((wint_t) character);
}
int
_t_isprint(const char *ptr)
{
! wchar_t character;
! char2wchar(&character, ptr, 1);
! return iswprint((wint_t) character);
}
#endif /* TS_USE_WIDE */
--- 62,105 ----
return r;
}
+ else
+ #endif /* WIN32 */
+ if ( lc_ctype_is_c() )
+ {
+ /*
+ * pg_mb2wchar_with_len always adds trailing '\0', so
+ * 'to' should be allocated with sufficient space
+ */
+ return pg_mb2wchar_with_len(from, (pg_wchar *)to, len);
+ }
return mbstowcs(to, from, len);
}
int
_t_isalpha(const char *ptr)
{
! wchar_t character[2];
!
! if (lc_ctype_is_c())
! return isalpha(TOUCHAR(ptr));
! char2wchar(character, ptr, 1);
! return iswalpha((wint_t) *character);
}
int
_t_isprint(const char *ptr)
{
! wchar_t character[2];
!
! if (lc_ctype_is_c())
! return isprint(TOUCHAR(ptr));
! char2wchar(character, ptr, 1);
! return iswprint((wint_t) *character);
}
#endif /* TS_USE_WIDE */
***************
*** 126,132 ****
if ( wlen < 0 )
ereport(ERROR,
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
! errmsg("transalation failed from server encoding to wchar_t")));
Assert(wlen<=len);
wstr[wlen] = 0;
--- 143,149 ----
if ( wlen < 0 )
ereport(ERROR,
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
! errmsg("translation failed from server encoding to wchar_t")));
Assert(wlen<=len);
wstr[wlen] = 0;
***************
*** 152,158 ****
if ( wlen < 0 )
ereport(ERROR,
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
! errmsg("transalation failed from wchar_t to server encoding %d", errno)));
Assert(wlen<=len);
out[wlen]='\0';
}
--- 169,175 ----
if ( wlen < 0 )
ereport(ERROR,
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
! errmsg("translation failed from wchar_t to server encoding %d", errno)));
Assert(wlen<=len);
out[wlen]='\0';
}
diff -c -r -N ../tsearch2.orig/ts_locale.h ./ts_locale.h
*** ../tsearch2.orig/ts_locale.h Fri Jan 12 10:53:11 2007
--- ./ts_locale.h Fri Jan 12 18:10:19 2007
***************
*** 30,45 ****
#define TOUCHAR(x) (*((unsigned char*)(x)))
#ifdef TS_USE_WIDE
#ifdef WIN32
size_t wchar2char(char *to, const wchar_t *from, size_t len);
! size_t char2wchar(wchar_t *to, const char *from, size_t len);
#else /* WIN32 */
! /* correct mbstowcs */
! #define char2wchar mbstowcs
#define wchar2char wcstombs
#endif /* WIN32 */
#define t_isdigit(x) ( pg_mblen(x)==1 && isdigit( TOUCHAR(x) ) )
--- 30,46 ----
#define TOUCHAR(x) (*((unsigned char*)(x)))
#ifdef TS_USE_WIDE
+ size_t char2wchar(wchar_t *to, const char *from, size_t len);
#ifdef WIN32
size_t wchar2char(char *to, const wchar_t *from, size_t len);
!
#else /* WIN32 */
! /* correct wcstombs */
#define wchar2char wcstombs
+
#endif /* WIN32 */
#define t_isdigit(x) ( pg_mblen(x)==1 && isdigit( TOUCHAR(x) ) )
***************
*** 55,64 ****
*/
#define t_iseq(x,c) ( (pg_mblen(x)==1) ? ( TOUCHAR(x) == ((unsigned char)(c)) ) : false )
! #define COPYCHAR(d,s) do { \
! int lll = pg_mblen( s ); \
! \
! while( lll-- ) \
TOUCHAR((d)+lll) = TOUCHAR((s)+lll); \
} while(0)
--- 56,65 ----
*/
#define t_iseq(x,c) ( (pg_mblen(x)==1) ? ( TOUCHAR(x) == ((unsigned char)(c)) ) : false )
! #define COPYCHAR(d,s) do { \
! int lll = pg_mblen( s ); \
! \
! while( lll-- ) \
TOUCHAR((d)+lll) = TOUCHAR((s)+lll); \
} while(0)
diff -c -r -N ../tsearch2.orig/tsearch2.patch ./tsearch2.patch
*** ../tsearch2.orig/tsearch2.patch Thu Jan 1 03:00:00 1970
--- ./tsearch2.patch Fri Jan 12 18:12:30 2007
***************
*** 0 ****
--- 1,243 ----
+ diff -c -r -N ../tsearch2.orig/ts_locale.c ./ts_locale.c
+ *** ../tsearch2.orig/ts_locale.c Fri Jan 12 10:53:11 2007
+ --- ./ts_locale.c Fri Jan 12 18:10:27 2007
+ ***************
+ *** 12,24 ****
+ size_t
+ wchar2char(char *to, const wchar_t *from, size_t len)
+ {
+ if (GetDatabaseEncoding() == PG_UTF8)
+ {
+ int r;
+
+ - if (len == 0)
+ - return 0;
+ -
+ r = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, len,
+ NULL, NULL);
+
+ --- 12,24 ----
+ size_t
+ wchar2char(char *to, const wchar_t *from, size_t len)
+ {
+ + if (len == 0)
+ + return 0;
+ +
+ if (GetDatabaseEncoding() == PG_UTF8)
+ {
+ int r;
+
+ r = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, len,
+ NULL, NULL);
+
+ ***************
+ *** 34,50 ****
+
+ return wcstombs(to, from, len);
+ }
+
+ size_t
+ char2wchar(wchar_t *to, const char *from, size_t len)
+ {
+ if (GetDatabaseEncoding() == PG_UTF8)
+ {
+ int r;
+
+ - if (len == 0)
+ - return 0;
+ -
+ r = MultiByteToWideChar(CP_UTF8, 0, from, len, to, len);
+
+ if (!r)
+ --- 34,52 ----
+
+ return wcstombs(to, from, len);
+ }
+ + #endif /* WIN32 */
+
+ size_t
+ char2wchar(wchar_t *to, const char *from, size_t len)
+ {
+ + if (len == 0)
+ + return 0;
+ +
+ + #ifdef WIN32
+ if (GetDatabaseEncoding() == PG_UTF8)
+ {
+ int r;
+
+ r = MultiByteToWideChar(CP_UTF8, 0, from, len, to, len);
+
+ if (!r)
+ ***************
+ *** 60,88 ****
+
+ return r;
+ }
+
+ return mbstowcs(to, from, len);
+ }
+ - #endif /* WIN32 */
+
+ int
+ _t_isalpha(const char *ptr)
+ {
+ ! wchar_t character;
+
+ ! char2wchar(&character, ptr, 1);
+
+ ! return iswalpha((wint_t) character);
+ }
+
+ int
+ _t_isprint(const char *ptr)
+ {
+ ! wchar_t character;
+
+ ! char2wchar(&character, ptr, 1);
+
+ ! return iswprint((wint_t) character);
+ }
+ #endif /* TS_USE_WIDE */
+
+ --- 62,105 ----
+
+ return r;
+ }
+ + else
+ + #endif /* WIN32 */
+ + if ( lc_ctype_is_c() )
+ + {
+ + /*
+ + * pg_mb2wchar_with_len always adds trailing '\0', so
+ + * 'to' should be allocated with sufficient space
+ + */
+ + return pg_mb2wchar_with_len(from, (pg_wchar *)to, len);
+ + }
+
+ return mbstowcs(to, from, len);
+ }
+
+ int
+ _t_isalpha(const char *ptr)
+ {
+ ! wchar_t character[2];
+ !
+ ! if (lc_ctype_is_c())
+ ! return isalpha(TOUCHAR(ptr));
+
+ ! char2wchar(character, ptr, 1);
+
+ ! return iswalpha((wint_t) *character);
+ }
+
+ int
+ _t_isprint(const char *ptr)
+ {
+ ! wchar_t character[2];
+ !
+ ! if (lc_ctype_is_c())
+ ! return isprint(TOUCHAR(ptr));
+
+ ! char2wchar(character, ptr, 1);
+
+ ! return iswprint((wint_t) *character);
+ }
+ #endif /* TS_USE_WIDE */
+
+ ***************
+ *** 126,132 ****
+ if ( wlen < 0 )
+ ereport(ERROR,
+ (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
+ ! errmsg("transalation failed from server encoding to wchar_t")));
+
+ Assert(wlen<=len);
+ wstr[wlen] = 0;
+ --- 143,149 ----
+ if ( wlen < 0 )
+ ereport(ERROR,
+ (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
+ ! errmsg("translation failed from server encoding to wchar_t")));
+
+ Assert(wlen<=len);
+ wstr[wlen] = 0;
+ ***************
+ *** 152,158 ****
+ if ( wlen < 0 )
+ ereport(ERROR,
+ (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
+ ! errmsg("transalation failed from wchar_t to server encoding %d", errno)));
+ Assert(wlen<=len);
+ out[wlen]='\0';
+ }
+ --- 169,175 ----
+ if ( wlen < 0 )
+ ereport(ERROR,
+ (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
+ ! errmsg("translation failed from wchar_t to server encoding %d", errno)));
+ Assert(wlen<=len);
+ out[wlen]='\0';
+ }
+ diff -c -r -N ../tsearch2.orig/ts_locale.h ./ts_locale.h
+ *** ../tsearch2.orig/ts_locale.h Fri Jan 12 10:53:11 2007
+ --- ./ts_locale.h Fri Jan 12 18:10:19 2007
+ ***************
+ *** 30,45 ****
+ #define TOUCHAR(x) (*((unsigned char*)(x)))
+
+ #ifdef TS_USE_WIDE
+
+ #ifdef WIN32
+
+ size_t wchar2char(char *to, const wchar_t *from, size_t len);
+ ! size_t char2wchar(wchar_t *to, const char *from, size_t len);
+ #else /* WIN32 */
+
+ ! /* correct mbstowcs */
+ ! #define char2wchar mbstowcs
+ #define wchar2char wcstombs
+ #endif /* WIN32 */
+
+ #define t_isdigit(x) ( pg_mblen(x)==1 && isdigit( TOUCHAR(x) ) )
+ --- 30,46 ----
+ #define TOUCHAR(x) (*((unsigned char*)(x)))
+
+ #ifdef TS_USE_WIDE
+ + size_t char2wchar(wchar_t *to, const char *from, size_t len);
+
+ #ifdef WIN32
+
+ size_t wchar2char(char *to, const wchar_t *from, size_t len);
+ !
+ #else /* WIN32 */
+
+ ! /* correct wcstombs */
+ #define wchar2char wcstombs
+ +
+ #endif /* WIN32 */
+
+ #define t_isdigit(x) ( pg_mblen(x)==1 && isdigit( TOUCHAR(x) ) )
+ ***************
+ *** 55,64 ****
+ */
+ #define t_iseq(x,c) ( (pg_mblen(x)==1) ? ( TOUCHAR(x) == ((unsigned char)(c)) ) : false )
+
+ ! #define COPYCHAR(d,s) do { \
+ ! int lll = pg_mblen( s ); \
+ ! \
+ ! while( lll-- ) \
+ TOUCHAR((d)+lll) = TOUCHAR((s)+lll); \
+ } while(0)
+
+ --- 56,65 ----
+ */
+ #define t_iseq(x,c) ( (pg_mblen(x)==1) ? ( TOUCHAR(x) == ((unsigned char)(c)) ) : false )
+
+ ! #define COPYCHAR(d,s) do { \
+ ! int lll = pg_mblen( s ); \
+ ! \
+ ! while( lll-- ) \
+ TOUCHAR((d)+lll) = TOUCHAR((s)+lll); \
+ } while(0)
+
diff -c -r -N ../tsearch2.orig/wordparser/parser.c ./wordparser/parser.c
*** ../tsearch2.orig/wordparser/parser.c Fri Jan 12 10:53:11 2007
--- ./wordparser/parser.c Fri Jan 12 18:10:38 2007
***************
*** 40,55 ****
#ifdef TS_USE_WIDE
/*
! * Use wide char code only when max encoding length > 1 and ctype != C.
! * Some operating systems fail with multi-byte encodings and a C locale.
! * Also, for a C locale there is no need to process as multibyte. From
! * backend/utils/adt/oracle_compat.c Teodor
*/
! if (prs->charmaxlen > 1 && !lc_ctype_is_c())
{
prs->usewide = true;
! prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * prs->lenstr);
prs->lenwstr = char2wchar(prs->wstr, prs->str, prs->lenstr);
}
else
--- 40,52 ----
#ifdef TS_USE_WIDE
/*
! * Use wide char code only when max encoding length > 1.
*/
! if (prs->charmaxlen > 1)
{
prs->usewide = true;
! prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * (prs->lenstr+1));
prs->lenwstr = char2wchar(prs->wstr, prs->str, prs->lenstr);
}
else
***************
*** 83,107 ****
/*
* defining support function, equvalent is* macroses, but
! * working with any possible encodings and locales
*/
#ifdef TS_USE_WIDE
! #define p_iswhat(type) \
! static int \
! p_is##type(TParser *prs) { \
! Assert( prs->state ); \
! return ( ( prs->usewide ) ? isw##type( (wint_t)*( prs->wstr + prs->state->poschar ) ) : \
! is##type( (unsigned char)*( prs->str + prs->state->posbyte ) ) ); \
! } \
! \
! static int \
! p_isnot##type(TParser *prs) { \
! return !p_is##type(prs); \
}
/* p_iseq should be used only for ascii symbols */
--- 80,178 ----
/*
* defining support function, equvalent is* macroses, but
! * working with any possible encodings and locales. Note,
! * that with multibyte encoding and C-locale isw* function may fail
! * or give wrong result. Note 2: multibyte encoding and C-locale
! * often are used for Asian languages.
*/
#ifdef TS_USE_WIDE
! #define p_iswhat(type) \
! static int \
! p_is##type(TParser *prs) { \
! Assert( prs->state ); \
! if ( prs->usewide ) \
! { \
! if ( lc_ctype_is_c() ) \
! return is##type( 0xff & *( prs->wstr + prs->state->poschar) ); \
! \
! return isw##type( *(wint_t*)( prs->wstr + prs->state->poschar ) ); \
! } \
! \
! return is##type( *(unsigned char*)( prs->str + prs->state->posbyte ) ); \
! } \
! \
! static int \
! p_isnot##type(TParser *prs) { \
! return !p_is##type(prs); \
}
+ static int
+ p_isalnum(TParser *prs)
+ {
+ Assert( prs->state );
+
+ if (prs->usewide)
+ {
+ if (lc_ctype_is_c())
+ {
+ unsigned int c = *(unsigned int*)(prs->wstr + prs->state->poschar);
+
+ /*
+ * any non-ascii symbol with multibyte encoding
+ * with C-locale is an alpha character
+ */
+ if ( c > 0x7f )
+ return 1;
+
+ return isalnum(0xff & c);
+ }
+
+ return iswalnum( (wint_t)*( prs->wstr + prs->state->poschar));
+ }
+
+ return isalnum( *(unsigned char*)( prs->str + prs->state->posbyte ));
+ }
+ static int
+ p_isnotalnum(TParser *prs)
+ {
+ return !p_isalnum(prs);
+ }
+
+ static int
+ p_isalpha(TParser *prs)
+ {
+ Assert( prs->state );
+
+ if (prs->usewide)
+ {
+ if (lc_ctype_is_c())
+ {
+ unsigned int c = *(prs->wstr + prs->state->poschar);
+
+ /*
+ * any non-ascii symbol with multibyte encoding
+ * with C-locale is an alpha character
+ */
+ if ( c > 0x7f )
+ return 1;
+
+ return isalpha(0xff & c);
+ }
+
+ return iswalpha( (wint_t)*( prs->wstr + prs->state->poschar));
+ }
+
+ return isalpha( *(unsigned char*)( prs->str + prs->state->posbyte ));
+ }
+
+ static int
+ p_isnotalpha(TParser *prs)
+ {
+ return !p_isalpha(prs);
+ }
/* p_iseq should be used only for ascii symbols */
***************
*** 111,128 ****
Assert(prs->state);
return ((prs->state->charlen == 1 && *(prs->str + prs->state->posbyte) == c)) ? 1 : 0;
}
#else /* TS_USE_WIDE */
! #define p_iswhat(type) \
! static int \
! p_is##type(TParser *prs) { \
! Assert( prs->state ); \
! return is##type( (unsigned char)*( prs->str + prs->state->posbyte ) ); \
! } \
! \
! static int \
! p_isnot##type(TParser *prs) { \
! return !p_is##type(prs); \
}
--- 182,200 ----
Assert(prs->state);
return ((prs->state->charlen == 1 && *(prs->str + prs->state->posbyte) == c)) ? 1 : 0;
}
+
#else /* TS_USE_WIDE */
! #define p_iswhat(type) \
! static int \
! p_is##type(TParser *prs) { \
! Assert( prs->state ); \
! return is##type( (unsigned char)*( prs->str + prs->state->posbyte ) ); \
! } \
! \
! static int \
! p_isnot##type(TParser *prs) { \
! return !p_is##type(prs); \
}
***************
*** 132,141 ****
Assert(prs->state);
return (*(prs->str + prs->state->posbyte) == c) ? 1 : 0;
}
- #endif /* TS_USE_WIDE */
p_iswhat(alnum)
p_iswhat(alpha)
p_iswhat(digit)
p_iswhat(lower)
p_iswhat(print)
--- 204,215 ----
Assert(prs->state);
return (*(prs->str + prs->state->posbyte) == c) ? 1 : 0;
}
p_iswhat(alnum)
p_iswhat(alpha)
+
+ #endif /* TS_USE_WIDE */
+
p_iswhat(digit)
p_iswhat(lower)
p_iswhat(print)
Yeah, it's a workaround. Since there's no concept other than
alpha/numeric/latin in tsearch2, Asian characters have to fall into
one of them.
Ok, I see.
Please test the attached patch. If it is good, I'll commit it on Monday to the
HEAD and 8.2 branches.
I have tested on a Linux box running PostgreSQL 8.2.1 (C locale,
EUC_JP encoding), and it worked great!
BTW, is your patch supposed to work with PostgreSQL 8.1?
--
Tatsuo Ishii
SRA OSS, Inc. Japan
Show quoted text
PS. Magnus, may I ask you to test under Windows? Thank you.
--
Teodor Sigaev E-mail: teodor@sigaev.ru
WWW: http://www.sigaev.ru/