fix for multi-byte partial truncating

Started by Tatsuo Ishii over 27 years ago, 4 messages
#1 Tatsuo Ishii
t-ishii@sra.co.jp

For the varchar(n)/char(n) types, an input string is silently truncated if it
is longer than n. A multi-byte character consists of several bytes, and
they must not be split apart. Truncating unconditionally at a byte count
can cut through a multi-byte character and leave a partial byte sequence.

Attached patches should fix the problem.

Index: backend/utils/adt/varchar.c
===================================================================
RCS file: /usr/local/cvsroot/pgsql/src/backend/utils/adt/varchar.c,v
retrieving revision 1.39
diff -c -r1.39 varchar.c
*** varchar.c	1998/09/01 04:32:53	1.39
--- varchar.c	1998/09/24 09:03:37
***************
*** 147,153 ****
--- 147,160 ----
  	if ((len == -1) || (len == VARSIZE(s)))
  		return s;
+ #ifdef MULTIBYTE
+ 	/* truncate multi-byte string in a way not to break
+ 	   multi-byte boundary */
+ 	rlen = pg_mbcliplen(VARDATA(s), len - VARHDRSZ, len - VARHDRSZ);
+ 	len = rlen + VARHDRSZ;
+ #else
  	rlen = len - VARHDRSZ;
+ #endif
  	if (rlen > 4096)
  		elog(ERROR, "bpchar: length of char() must be less than 4096");
***************
*** 367,373 ****
--- 374,387 ----

/* only reach here if we need to truncate string... */

+ #ifdef MULTIBYTE
+ 	/* truncate multi-byte string in a way not to break
+ 	   multi-byte boundary */
+ 	len = pg_mbcliplen(VARDATA(s), slen - VARHDRSZ, slen - VARHDRSZ);
+ 	slen = len + VARHDRSZ;
+ #else
  	len = slen - VARHDRSZ;
+ #endif
  	if (len > 4096)
  		elog(ERROR, "varchar: length of varchar() must be less than 4096");
Index: backend/utils/mb/mbutils.c
===================================================================
RCS file: /usr/local/cvsroot/pgsql/src/backend/utils/mb/mbutils.c,v
retrieving revision 1.3
diff -c -r1.3 mbutils.c
*** mbutils.c	1998/09/01 04:33:22	1.3
--- mbutils.c	1998/09/24 09:03:38
***************
*** 202,207 ****
--- 202,235 ----
  }
  /*
+  * returns the length of a multi-byte string
+  * (not necessarily  NULL terminated)
+  * that is not longer than limit.
+  * this function does not break multi-byte word boundary.
+  */
+ int
+ pg_mbcliplen(const unsigned char *mbstr, int len, int limit)
+ {
+ 	int			clen = 0;
+ 	int			l;
+ 
+ 	while (*mbstr &&  len > 0)
+ 	{
+ 		l = pg_mblen(mbstr);
+ 		if ((clen + l) > limit) {
+ 			break;
+ 		}
+ 		clen += l;
+ 		if (clen == limit) {
+ 			break;
+ 		}
+ 		len -= l;
+ 		mbstr += l;
+ 	}
+ 	return (clen);
+ }
+ 
+ /*
   * fuctions for utils/init
   */
  static int	DatabaseEncoding = MULTIBYTE;
Index: include/mb/pg_wchar.h
===================================================================
RCS file: /usr/local/cvsroot/pgsql/src/include/mb/pg_wchar.h,v
retrieving revision 1.4
diff -c -r1.4 pg_wchar.h
*** pg_wchar.h	1998/09/01 04:36:34	1.4
--- pg_wchar.h	1998/09/24 09:03:42
***************
*** 103,108 ****
--- 103,109 ----
  extern int	pg_mic_mblen(const unsigned char *);
  extern int	pg_mbstrlen(const unsigned char *);
  extern int	pg_mbstrlen_with_len(const unsigned char *, int);
+ extern int	pg_mbcliplen(const unsigned char *, int, int);
  extern pg_encoding_conv_tbl *pg_get_encent_by_encoding(int);
  extern bool show_client_encoding(void);
  extern bool reset_client_encoding(void);
#2 Bruce Momjian
maillist@candle.pha.pa.us
In reply to: Tatsuo Ishii (#1)
Re: [HACKERS] fix for multi-byte partial truncating

Applied, but for some reason patch did not like the normal cvs/rcs diff
format. Not sure why. Please check to see it is OK. Looks OK here.

For varchar(n)/char(n) type, input string is silently truncated if it
is longer than n. A multi-byte letter consists of several bytes and
they should not be divided into pieces. Unconditional truncating
multi-byte letters would make partial multi-byte bytes.

Attached patches should fix the problem.

Index: backend/utils/adt/varchar.c
===================================================================
RCS file: /usr/local/cvsroot/pgsql/src/backend/utils/adt/varchar.c,v
retrieving revision 1.39
diff -c -r1.39 varchar.c
*** varchar.c	1998/09/01 04:32:53	1.39
--- varchar.c	1998/09/24 09:03:37
***************
*** 147,153 ****
--- 147,160 ----
if ((len == -1) || (len == VARSIZE(s)))
return s;
+ #ifdef MULTIBYTE
+ 	/* truncate multi-byte string in a way not to break
+ 	   multi-byte boundary */
+ 	rlen = pg_mbcliplen(VARDATA(s), len - VARHDRSZ, len - VARHDRSZ);
+ 	len = rlen + VARHDRSZ;
+ #else
rlen = len - VARHDRSZ;
+ #endif
if (rlen > 4096)
elog(ERROR, "bpchar: length of char() must be less than 4096");
***************
*** 367,373 ****
--- 374,387 ----

/* only reach here if we need to truncate string... */

+ #ifdef MULTIBYTE
+ 	/* truncate multi-byte string in a way not to break
+ 	   multi-byte boundary */
+ 	len = pg_mbcliplen(VARDATA(s), slen - VARHDRSZ, slen - VARHDRSZ);
+ 	slen = len + VARHDRSZ;
+ #else
len = slen - VARHDRSZ;
+ #endif
if (len > 4096)
elog(ERROR, "varchar: length of varchar() must be less than 4096");
Index: backend/utils/mb/mbutils.c
===================================================================
RCS file: /usr/local/cvsroot/pgsql/src/backend/utils/mb/mbutils.c,v
retrieving revision 1.3
diff -c -r1.3 mbutils.c
*** mbutils.c	1998/09/01 04:33:22	1.3
--- mbutils.c	1998/09/24 09:03:38
***************
*** 202,207 ****
--- 202,235 ----
}
/*
+  * returns the length of a multi-byte string
+  * (not necessarily  NULL terminated)
+  * that is not longer than limit.
+  * this function does not break multi-byte word boundary.
+  */
+ int
+ pg_mbcliplen(const unsigned char *mbstr, int len, int limit)
+ {
+ 	int			clen = 0;
+ 	int			l;
+ 
+ 	while (*mbstr &&  len > 0)
+ 	{
+ 		l = pg_mblen(mbstr);
+ 		if ((clen + l) > limit) {
+ 			break;
+ 		}
+ 		clen += l;
+ 		if (clen == limit) {
+ 			break;
+ 		}
+ 		len -= l;
+ 		mbstr += l;
+ 	}
+ 	return (clen);
+ }
+ 
+ /*
* fuctions for utils/init
*/
static int	DatabaseEncoding = MULTIBYTE;
Index: include/mb/pg_wchar.h
===================================================================
RCS file: /usr/local/cvsroot/pgsql/src/include/mb/pg_wchar.h,v
retrieving revision 1.4
diff -c -r1.4 pg_wchar.h
*** pg_wchar.h	1998/09/01 04:36:34	1.4
--- pg_wchar.h	1998/09/24 09:03:42
***************
*** 103,108 ****
--- 103,109 ----
extern int	pg_mic_mblen(const unsigned char *);
extern int	pg_mbstrlen(const unsigned char *);
extern int	pg_mbstrlen_with_len(const unsigned char *, int);
+ extern int	pg_mbcliplen(const unsigned char *, int, int);
extern pg_encoding_conv_tbl *pg_get_encent_by_encoding(int);
extern bool show_client_encoding(void);
extern bool reset_client_encoding(void);
-- 
  Bruce Momjian                        |  maillist@candle.pha.pa.us
  830 Blythe Avenue                    |  http://www.op.net/~candle
  Drexel Hill, Pennsylvania 19026      |  (610) 353-9879(w)
  +  If your life is a hard drive,     |  (610) 853-3000(h)
  +  Christ can be your backup.        |  
#3 Tatsuo Ishii
t-ishii@sra.co.jp
In reply to: Bruce Momjian (#2)
Re: [HACKERS] fix for multi-byte partial truncating

> Applied, but for some reason patch did not like the normal cvs/rcs diff
> format. Not sure why. Please check to see it is OK. Looks OK here.

Thank you, Bruce. Everything seems OK too.

But I found a mistake in my patches: bpchar no longer pads with blanks!
Could you apply the following patch to
backend/utils/adt/varchar.c? (The diff is against the current source
tree.)

*** varchar.c.orig	Fri Sep 25 15:12:34 1998
--- varchar.c	Fri Sep 25 17:59:47 1998
***************
*** 147,160 ****
  	if ((len == -1) || (len == VARSIZE(s)))
  		return s;

- #ifdef MULTIBYTE
- /* truncate multi-byte string in a way not to break
- multi-byte boundary */
- rlen = pg_mbcliplen(VARDATA(s), len - VARHDRSZ, len - VARHDRSZ);
- len = rlen + VARHDRSZ;
- #else
rlen = len - VARHDRSZ;
- #endif

  	if (rlen > 4096)
  		elog(ERROR, "bpchar: length of char() must be less than 4096");
--- 147,153 ----
***************
*** 167,173 ****
--- 160,172 ----
  	result = (char *) palloc(len);
  	VARSIZE(result) = len;
  	r = VARDATA(result);
+ #ifdef MULTIBYTE
+ 	/* truncate multi-byte string in a way not to break
+ 	   multi-byte boundary */
+ 	slen = pg_mbcliplen(VARDATA(s), rlen, rlen);
+ #else
  	slen = VARSIZE(s) - VARHDRSZ;
+ #endif
  	s = VARDATA(s);

#ifdef STRINGDEBUG

#4 Bruce Momjian
maillist@candle.pha.pa.us
In reply to: Tatsuo Ishii (#3)
Re: [HACKERS] fix for multi-byte partial truncating

Applied.

Applied, but for some reason patch did not like the normal cvs/rcs diff
format. Not sure why. Please check to see it is OK. Looks OK here.

Thank you, Bruce. Everything seems OK too.

But I found a mistake with my patches. bpchar does not pad blanks
anymore! Could you apply following patches to
backend/utils/adt/varchar.c? (the diff is against the current source
tree)

*** varchar.c.orig	Fri Sep 25 15:12:34 1998
--- varchar.c	Fri Sep 25 17:59:47 1998
***************
*** 147,160 ****
if ((len == -1) || (len == VARSIZE(s)))
return s;

- #ifdef MULTIBYTE
- /* truncate multi-byte string in a way not to break
- multi-byte boundary */
- rlen = pg_mbcliplen(VARDATA(s), len - VARHDRSZ, len - VARHDRSZ);
- len = rlen + VARHDRSZ;
- #else
rlen = len - VARHDRSZ;
- #endif

if (rlen > 4096)
elog(ERROR, "bpchar: length of char() must be less than 4096");
--- 147,153 ----
***************
*** 167,173 ****
--- 160,172 ----
result = (char *) palloc(len);
VARSIZE(result) = len;
r = VARDATA(result);
+ #ifdef MULTIBYTE
+ 	/* truncate multi-byte string in a way not to break
+ 	   multi-byte boundary */
+ 	slen = pg_mbcliplen(VARDATA(s), rlen, rlen);
+ #else
slen = VARSIZE(s) - VARHDRSZ;
+ #endif
s = VARDATA(s);

#ifdef STRINGDEBUG

-- 
  Bruce Momjian                        |  http://www.op.net/~candle
  maillist@candle.pha.pa.us            |  (610) 853-3000
  +  If your life is a hard drive,     |  830 Blythe Avenue
  +  Christ can be your backup.        |  Drexel Hill, Pennsylvania 19026