fix for multi-byte partial truncating
For the varchar(n)/char(n) types, an input string is silently truncated
if it is longer than n. A multi-byte character consists of several bytes,
and these must not be split apart. Truncating unconditionally at a byte
offset can leave a partial multi-byte character at the end of the string.
The attached patches should fix the problem.
Index: backend/utils/adt/varchar.c
===================================================================
RCS file: /usr/local/cvsroot/pgsql/src/backend/utils/adt/varchar.c,v
retrieving revision 1.39
diff -c -r1.39 varchar.c
*** varchar.c 1998/09/01 04:32:53 1.39
--- varchar.c 1998/09/24 09:03:37
***************
*** 147,153 ****
--- 147,160 ----
if ((len == -1) || (len == VARSIZE(s)))
return s;
+ #ifdef MULTIBYTE
+ /* truncate multi-byte string in a way not to break
+ multi-byte boundary */
+ rlen = pg_mbcliplen(VARDATA(s), len - VARHDRSZ, len - VARHDRSZ);
+ len = rlen + VARHDRSZ;
+ #else
rlen = len - VARHDRSZ;
+ #endif
if (rlen > 4096)
elog(ERROR, "bpchar: length of char() must be less than 4096");
***************
*** 367,373 ****
--- 374,387 ----
/* only reach here if we need to truncate string... */
+ #ifdef MULTIBYTE
+ /* truncate multi-byte string in a way not to break
+ multi-byte boundary */
+ len = pg_mbcliplen(VARDATA(s), slen - VARHDRSZ, slen - VARHDRSZ);
+ slen = len + VARHDRSZ;
+ #else
len = slen - VARHDRSZ;
+ #endif
if (len > 4096)
elog(ERROR, "varchar: length of varchar() must be less than 4096");
Index: backend/utils/mb/mbutils.c
===================================================================
RCS file: /usr/local/cvsroot/pgsql/src/backend/utils/mb/mbutils.c,v
retrieving revision 1.3
diff -c -r1.3 mbutils.c
*** mbutils.c 1998/09/01 04:33:22 1.3
--- mbutils.c 1998/09/24 09:03:38
***************
*** 202,207 ****
--- 202,235 ----
}
/*
+ * returns the length of a multi-byte string
+ * (not necessarily NULL terminated)
+ * that is not longer than limit.
+ * this function does not break multi-byte word boundary.
+ */
+ int
+ pg_mbcliplen(const unsigned char *mbstr, int len, int limit)
+ {
+ int clen = 0;
+ int l;
+
+ while (*mbstr && len > 0)
+ {
+ l = pg_mblen(mbstr);
+ if ((clen + l) > limit) {
+ break;
+ }
+ clen += l;
+ if (clen == limit) {
+ break;
+ }
+ len -= l;
+ mbstr += l;
+ }
+ return (clen);
+ }
+
+ /*
* fuctions for utils/init
*/
static int DatabaseEncoding = MULTIBYTE;
Index: include/mb/pg_wchar.h
===================================================================
RCS file: /usr/local/cvsroot/pgsql/src/include/mb/pg_wchar.h,v
retrieving revision 1.4
diff -c -r1.4 pg_wchar.h
*** pg_wchar.h 1998/09/01 04:36:34 1.4
--- pg_wchar.h 1998/09/24 09:03:42
***************
*** 103,108 ****
--- 103,109 ----
extern int pg_mic_mblen(const unsigned char *);
extern int pg_mbstrlen(const unsigned char *);
extern int pg_mbstrlen_with_len(const unsigned char *, int);
+ extern int pg_mbcliplen(const unsigned char *, int, int);
extern pg_encoding_conv_tbl *pg_get_encent_by_encoding(int);
extern bool show_client_encoding(void);
extern bool reset_client_encoding(void);
Import Notes
Reply to msg id not found: YourmessageofThu24Sep1998060356GMT.3609E0CC.5C0CD61A@alumni.caltech.edu
Applied, but for some reason patch did not like the normal cvs/rcs diff
format. Not sure why. Please check to see it is OK. Looks OK here.
For the varchar(n)/char(n) types, an input string is silently truncated
if it is longer than n. A multi-byte character consists of several bytes,
and these must not be split apart. Truncating unconditionally at a byte
offset can leave a partial multi-byte character at the end of the string.
The attached patches should fix the problem.
Index: backend/utils/adt/varchar.c
===================================================================
RCS file: /usr/local/cvsroot/pgsql/src/backend/utils/adt/varchar.c,v
retrieving revision 1.39
diff -c -r1.39 varchar.c
*** varchar.c 1998/09/01 04:32:53 1.39
--- varchar.c 1998/09/24 09:03:37
***************
*** 147,153 ****
--- 147,160 ----
if ((len == -1) || (len == VARSIZE(s)))
return s;
+ #ifdef MULTIBYTE
+ /* truncate multi-byte string in a way not to break
+ multi-byte boundary */
+ rlen = pg_mbcliplen(VARDATA(s), len - VARHDRSZ, len - VARHDRSZ);
+ len = rlen + VARHDRSZ;
+ #else
rlen = len - VARHDRSZ;
+ #endif
if (rlen > 4096)
elog(ERROR, "bpchar: length of char() must be less than 4096");
***************
*** 367,373 ****
--- 374,387 ----
/* only reach here if we need to truncate string... */
+ #ifdef MULTIBYTE
+ /* truncate multi-byte string in a way not to break
+ multi-byte boundary */
+ len = pg_mbcliplen(VARDATA(s), slen - VARHDRSZ, slen - VARHDRSZ);
+ slen = len + VARHDRSZ;
+ #else
len = slen - VARHDRSZ;
+ #endif
if (len > 4096)
elog(ERROR, "varchar: length of varchar() must be less than 4096");
Index: backend/utils/mb/mbutils.c
===================================================================
RCS file: /usr/local/cvsroot/pgsql/src/backend/utils/mb/mbutils.c,v
retrieving revision 1.3
diff -c -r1.3 mbutils.c
*** mbutils.c 1998/09/01 04:33:22 1.3
--- mbutils.c 1998/09/24 09:03:38
***************
*** 202,207 ****
--- 202,235 ----
}
/*
+ * returns the length of a multi-byte string
+ * (not necessarily NULL terminated)
+ * that is not longer than limit.
+ * this function does not break multi-byte word boundary.
+ */
+ int
+ pg_mbcliplen(const unsigned char *mbstr, int len, int limit)
+ {
+ int clen = 0;
+ int l;
+
+ while (*mbstr && len > 0)
+ {
+ l = pg_mblen(mbstr);
+ if ((clen + l) > limit) {
+ break;
+ }
+ clen += l;
+ if (clen == limit) {
+ break;
+ }
+ len -= l;
+ mbstr += l;
+ }
+ return (clen);
+ }
+
+ /*
* fuctions for utils/init
*/
static int DatabaseEncoding = MULTIBYTE;
Index: include/mb/pg_wchar.h
===================================================================
RCS file: /usr/local/cvsroot/pgsql/src/include/mb/pg_wchar.h,v
retrieving revision 1.4
diff -c -r1.4 pg_wchar.h
*** pg_wchar.h 1998/09/01 04:36:34 1.4
--- pg_wchar.h 1998/09/24 09:03:42
***************
*** 103,108 ****
--- 103,109 ----
extern int pg_mic_mblen(const unsigned char *);
extern int pg_mbstrlen(const unsigned char *);
extern int pg_mbstrlen_with_len(const unsigned char *, int);
+ extern int pg_mbcliplen(const unsigned char *, int, int);
extern pg_encoding_conv_tbl *pg_get_encent_by_encoding(int);
extern bool show_client_encoding(void);
extern bool reset_client_encoding(void);
--
Bruce Momjian | maillist@candle.pha.pa.us
830 Blythe Avenue | http://www.op.net/~candle
Drexel Hill, Pennsylvania 19026 | (610) 353-9879(w)
+ If your life is a hard drive, | (610) 853-3000(h)
+ Christ can be your backup. |
Applied, but for some reason patch did not like the normal cvs/rcs diff
format. Not sure why. Please check to see it is OK. Looks OK here.
Thank you, Bruce. Everything seems OK too.
But I found a mistake in my patches: bpchar no longer pads with blanks!
Could you apply the following patch to backend/utils/adt/varchar.c?
(The diff is against the current source tree.)
*** varchar.c.orig Fri Sep 25 15:12:34 1998
--- varchar.c Fri Sep 25 17:59:47 1998
***************
*** 147,160 ****
if ((len == -1) || (len == VARSIZE(s)))
return s;
- #ifdef MULTIBYTE
- /* truncate multi-byte string in a way not to break
- multi-byte boundary */
- rlen = pg_mbcliplen(VARDATA(s), len - VARHDRSZ, len - VARHDRSZ);
- len = rlen + VARHDRSZ;
- #else
rlen = len - VARHDRSZ;
- #endif
if (rlen > 4096)
elog(ERROR, "bpchar: length of char() must be less than 4096");
--- 147,153 ----
***************
*** 167,173 ****
--- 160,172 ----
result = (char *) palloc(len);
VARSIZE(result) = len;
r = VARDATA(result);
+ #ifdef MULTIBYTE
+ /* truncate multi-byte string in a way not to break
+ multi-byte boundary */
+ slen = pg_mbcliplen(VARDATA(s), rlen, rlen);
+ #else
slen = VARSIZE(s) - VARHDRSZ;
+ #endif
s = VARDATA(s);
#ifdef STRINGDEBUG
Import Notes
Reply to msg id not found: YourmessageofThu24Sep1998214715-0400.199809250147.VAA22725@candle.pha.pa.us | Resolved by subject fallback
Applied.
Applied, but for some reason patch did not like the normal cvs/rcs diff
format. Not sure why. Please check to see it is OK. Looks OK here.
Thank you, Bruce. Everything seems OK too.
But I found a mistake with my patches. bpchar does not pad blanks
anymore! Could you apply following patches to
backend/utils/adt/varchar.c? (the diff is against the current source
tree)
*** varchar.c.orig Fri Sep 25 15:12:34 1998
--- varchar.c Fri Sep 25 17:59:47 1998
***************
*** 147,160 ****
if ((len == -1) || (len == VARSIZE(s)))
return s;
- #ifdef MULTIBYTE
- /* truncate multi-byte string in a way not to break
- multi-byte boundary */
- rlen = pg_mbcliplen(VARDATA(s), len - VARHDRSZ, len - VARHDRSZ);
- len = rlen + VARHDRSZ;
- #else
rlen = len - VARHDRSZ;
- #endif
if (rlen > 4096)
elog(ERROR, "bpchar: length of char() must be less than 4096");
--- 147,153 ----
***************
*** 167,173 ****
--- 160,172 ----
result = (char *) palloc(len);
VARSIZE(result) = len;
r = VARDATA(result);
+ #ifdef MULTIBYTE
+ /* truncate multi-byte string in a way not to break
+ multi-byte boundary */
+ slen = pg_mbcliplen(VARDATA(s), rlen, rlen);
+ #else
slen = VARSIZE(s) - VARHDRSZ;
+ #endif
s = VARDATA(s);
#ifdef STRINGDEBUG
--
Bruce Momjian | http://www.op.net/~candle
maillist@candle.pha.pa.us | (610) 853-3000
+ If your life is a hard drive, | 830 Blythe Avenue
+ Christ can be your backup. | Drexel Hill, Pennsylvania 19026