proposal: unescape_text function

Started by Pavel Stehuleover 5 years ago29 messages

pavel.stehule@gmail.com

over 5 years ago

There is one user request for unescape function in core.

https://stackoverflow.com/questions/20124393/convert-escaped-unicode-character-back-to-actual-character-in-postgresql/20125412?noredirect=1#comment110502526_20125412

This request is about possibility that we do with string literal via
functional interface instead string literals only

I wrote plpgsql function, but built in function can be simpler:

CREATE OR REPLACE FUNCTION public.unescape(text, text)
RETURNS text
LANGUAGE plpgsql
AS $function$
DECLARE result text;
BEGIN
EXECUTE format('SELECT U&%s UESCAPE %s',
quote_literal(replace($1, '\u','^')),
quote_literal($2)) INTO result;
RETURN result;
END;
$function$

postgres=# select unescape('Odpov\u011Bdn\u00E1 osoba','^');
unescape -----------------
Odpovědná osoba(1 row)

What do you think about this?

Regards

Pavel

Pavel Stehule

pavel.stehule@gmail.com

over 5 years ago

In reply to: Pavel Stehule (#1)

1 attachment(s)

Re: proposal: unescape_text function

po 22. 6. 2020 v 5:48 odesílatel Pavel Stehule <pavel.stehule@gmail.com>
napsal:

Hi

There is one user request for unescape function in core.

https://stackoverflow.com/questions/20124393/convert-escaped-unicode-character-back-to-actual-character-in-postgresql/20125412?noredirect=1#comment110502526_20125412

This request is about possibility that we do with string literal via
functional interface instead string literals only

I wrote plpgsql function, but built in function can be simpler:

CREATE OR REPLACE FUNCTION public.unescape(text, text)
RETURNS text
LANGUAGE plpgsql
AS $function$
DECLARE result text;
BEGIN
EXECUTE format('SELECT U&%s UESCAPE %s',
quote_literal(replace($1, '\u','^')),
quote_literal($2)) INTO result;
RETURN result;
END;
$function$

postgres=# select unescape('Odpov\u011Bdn\u00E1 osoba','^');
unescape -----------------
Odpovědná osoba(1 row)

What do you think about this?

I changed the name to more accurately "unicode_unescape". Patch is assigned

Regards

Pavel

Show quoted text

Regards

Pavel

Attachments:

unicode_unescape.patchtext/x-patch; charset=UTF-8; name=unicode_unescape.patchDownload

diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index b7c450ea29..365ea17946 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -3533,6 +3533,24 @@ repeat('Pg', 4) <returnvalue>PgPgPgPg</returnvalue>
        </para></entry>
       </row>
 
+      <row>
+       <entry role="func_table_entry"><para role="func_signature">
+        <indexterm>
+         <primary>unicode_unescape</primary>
+        </indexterm>
+        <function>unicode_unescape</function> ( <parameter>string</parameter> <type>text</type>
+        <optional>, <parameter>escape_char</parameter> <type>text</type> </optional> )
+        <returnvalue>text</returnvalue>
+       </para>
+       <para>
+        Evaluate escaped unicode chars (4 or 6 digits) to chars.
+       </para>
+       <para>
+        <literal>unicode_unescape('\0441\043B\043E\043D')</literal>
+        <returnvalue>слон</returnvalue>
+       </para></entry>
+      </row>
+
      </tbody>
     </tgroup>
    </table>
diff --git a/src/backend/parser/parser.c b/src/backend/parser/parser.c
index be86eb37fe..c7f94298c1 100644
--- a/src/backend/parser/parser.c
+++ b/src/backend/parser/parser.c
@@ -26,7 +26,6 @@
 #include "parser/parser.h"
 #include "parser/scansup.h"
 
-static bool check_uescapechar(unsigned char escape);
 static char *str_udeescape(const char *str, char escape,
 						   int position, core_yyscan_t yyscanner);
 
@@ -278,44 +277,6 @@ base_yylex(YYSTYPE *lvalp, YYLTYPE *llocp, core_yyscan_t yyscanner)
 	return cur_token;
 }
 
-/* convert hex digit (caller should have verified that) to value */
-static unsigned int
-hexval(unsigned char c)
-{
-	if (c >= '0' && c <= '9')
-		return c - '0';
-	if (c >= 'a' && c <= 'f')
-		return c - 'a' + 0xA;
-	if (c >= 'A' && c <= 'F')
-		return c - 'A' + 0xA;
-	elog(ERROR, "invalid hexadecimal digit");
-	return 0;					/* not reached */
-}
-
-/* is Unicode code point acceptable? */
-static void
-check_unicode_value(pg_wchar c)
-{
-	if (!is_valid_unicode_codepoint(c))
-		ereport(ERROR,
-				(errcode(ERRCODE_SYNTAX_ERROR),
-				 errmsg("invalid Unicode escape value")));
-}
-
-/* is 'escape' acceptable as Unicode escape character (UESCAPE syntax) ? */
-static bool
-check_uescapechar(unsigned char escape)
-{
-	if (isxdigit(escape)
-		|| escape == '+'
-		|| escape == '\''
-		|| escape == '"'
-		|| scanner_isspace(escape))
-		return false;
-	else
-		return true;
-}
-
 /*
  * Process Unicode escapes in "str", producing a palloc'd plain string
  *
diff --git a/src/backend/parser/scansup.c b/src/backend/parser/scansup.c
index 18169ec4f4..5a39edf450 100644
--- a/src/backend/parser/scansup.c
+++ b/src/backend/parser/scansup.c
@@ -228,3 +228,41 @@ scanner_isspace(char ch)
 		return true;
 	return false;
 }
+
+/* convert hex digit (caller should have verified that) to value */
+unsigned int
+hexval(unsigned char c)
+{
+	if (c >= '0' && c <= '9')
+		return c - '0';
+	if (c >= 'a' && c <= 'f')
+		return c - 'a' + 0xA;
+	if (c >= 'A' && c <= 'F')
+		return c - 'A' + 0xA;
+	elog(ERROR, "invalid hexadecimal digit");
+	return 0;					/* not reached */
+}
+
+/* is Unicode code point acceptable? */
+Oid
+check_unicode_value(pg_wchar c)
+{
+	if (!is_valid_unicode_codepoint(c))
+		ereport(ERROR,
+				(errcode(ERRCODE_SYNTAX_ERROR),
+				 errmsg("invalid Unicode escape value")));
+}
+
+/* is 'escape' acceptable as Unicode escape character (UESCAPE syntax) ? */
+bool
+check_uescapechar(unsigned char escape)
+{
+	if (isxdigit(escape)
+		|| escape == '+'
+		|| escape == '\''
+		|| escape == '"'
+		|| scanner_isspace(escape))
+		return false;
+	else
+		return true;
+}
diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c
index 2eaabd6231..2934a1d9da 100644
--- a/src/backend/utils/adt/varlena.c
+++ b/src/backend/utils/adt/varlena.c
@@ -6139,3 +6139,202 @@ unicode_is_normalized(PG_FUNCTION_ARGS)
 
 	PG_RETURN_BOOL(result);
 }
+
+/*
+ * Process Unicode escapes in "str"
+ *
+ * escape: the escape character to use
+ */
+static void
+udeescape(StringInfo str, const char *instr, size_t len, char escape)
+{
+	pg_wchar	pair_first = 0;
+	char		cbuf[MAX_UNICODE_EQUIVALENT_STRING + 1];
+
+	while (len > 0)
+	{
+		if (instr[0] == escape)
+		{
+			if (len >= 2 &&
+				instr[1] == escape)
+			{
+				if (pair_first)
+					goto invalid_pair;
+				appendStringInfoChar(str, escape);
+				instr += 2;
+				len -= 2;
+			}
+			else if (len >= 5 &&
+					 isxdigit((unsigned char) instr[1]) &&
+					 isxdigit((unsigned char) instr[2]) &&
+					 isxdigit((unsigned char) instr[3]) &&
+					 isxdigit((unsigned char) instr[4]))
+			{
+				pg_wchar	unicode;
+
+				unicode = (hexval(instr[1]) << 12) +
+					(hexval(instr[2]) << 8) +
+					(hexval(instr[3]) << 4) +
+					hexval(instr[4]);
+				check_unicode_value(unicode);
+				if (pair_first)
+				{
+					if (is_utf16_surrogate_second(unicode))
+					{
+						unicode = surrogate_pair_to_codepoint(pair_first, unicode);
+						pair_first = 0;
+					}
+					else
+						goto invalid_pair;
+				}
+				else if (is_utf16_surrogate_second(unicode))
+					goto invalid_pair;
+
+				if (is_utf16_surrogate_first(unicode))
+					pair_first = unicode;
+				else
+				{
+					pg_unicode_to_server(unicode, (unsigned char *) cbuf);
+					appendStringInfoString(str, cbuf);
+				}
+				instr += 5;
+				len -= 5;
+			}
+			else if (len >= 8 &&
+					 instr[1] == '+' &&
+					 isxdigit((unsigned char) instr[2]) &&
+					 isxdigit((unsigned char) instr[3]) &&
+					 isxdigit((unsigned char) instr[4]) &&
+					 isxdigit((unsigned char) instr[5]) &&
+					 isxdigit((unsigned char) instr[6]) &&
+					 isxdigit((unsigned char) instr[7]))
+			{
+				pg_wchar	unicode;
+
+				unicode = (hexval(instr[2]) << 20) +
+					(hexval(instr[3]) << 16) +
+					(hexval(instr[4]) << 12) +
+					(hexval(instr[5]) << 8) +
+					(hexval(instr[6]) << 4) +
+					hexval(instr[7]);
+				check_unicode_value(unicode);
+				if (pair_first)
+				{
+					if (is_utf16_surrogate_second(unicode))
+					{
+						unicode = surrogate_pair_to_codepoint(pair_first, unicode);
+						pair_first = 0;
+					}
+					else
+						goto invalid_pair;
+				}
+				else if (is_utf16_surrogate_second(unicode))
+					goto invalid_pair;
+
+				if (is_utf16_surrogate_first(unicode))
+					pair_first = unicode;
+				else
+				{
+					pg_unicode_to_server(unicode, (unsigned char *) cbuf);
+					appendStringInfoString(str, cbuf);
+				}
+				instr += 8;
+				len -= 8;
+			}
+			else
+				ereport(ERROR,
+						(errcode(ERRCODE_SYNTAX_ERROR),
+						 errmsg("invalid Unicode escape"),
+						 errhint("Unicode escapes must be \\XXXX or \\+XXXXXX.")));
+		}
+		else
+		{
+			if (pair_first)
+				goto invalid_pair;
+
+			appendStringInfoChar(str, *instr++);
+			len--;
+		}
+	}
+
+	/* unfinished surrogate pair? */
+	if (pair_first)
+		goto invalid_pair;
+
+	return;
+
+invalid_pair:
+	ereport(ERROR,
+			(errcode(ERRCODE_SYNTAX_ERROR),
+			 errmsg("invalid Unicode surrogate pair")));
+}
+
+/*
+ * Unescape unicode strings
+ */
+Datum
+unicode_uescape_with_escape_char(PG_FUNCTION_ARGS)
+{
+	StringInfoData		str;
+	text	   *input_text;
+	text	   *escchr_text;
+	text	   *result;
+	const char *escchr_ptr;
+
+	/* when input string is NULL, then result is NULL too */
+	if (PG_ARGISNULL(0))
+		PG_RETURN_NULL();
+
+	input_text = PG_GETARG_TEXT_PP(0);
+
+	if (PG_ARGISNULL(1))
+		ereport(ERROR,
+				(errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
+				 errmsg("null value not allowed for escape char")));
+
+	escchr_text = PG_GETARG_TEXT_PP(1);
+	escchr_ptr = VARDATA_ANY(escchr_text);
+
+	if (VARSIZE_ANY_EXHDR(escchr_text) == 1 && !check_uescapechar(*escchr_ptr))
+		ereport(ERROR,
+				(errcode(ERRCODE_SYNTAX_ERROR),
+				 errmsg("invalid Unicode escape character")));
+
+	initStringInfo(&str);
+
+	udeescape(&str,
+			  VARDATA_ANY(input_text),
+			  VARSIZE_ANY_EXHDR(input_text),
+			  *escchr_ptr);
+
+	result = cstring_to_text_with_len(str.data, str.len);
+	pfree(str.data);
+
+	PG_RETURN_TEXT_P(result);
+}
+
+Datum
+unicode_uescape(PG_FUNCTION_ARGS)
+{
+	StringInfoData		str;
+	text	   *input_text;
+	text	   *result;
+
+	/* when input string is NULL, then result is NULL too */
+	if (PG_ARGISNULL(0))
+		PG_RETURN_NULL();
+
+	input_text = PG_GETARG_TEXT_PP(0);
+
+	initStringInfo(&str);
+
+	udeescape(&str,
+			  VARDATA_ANY(input_text),
+			  VARSIZE_ANY_EXHDR(input_text),
+			  '\\');
+
+	result = cstring_to_text_with_len(str.data, str.len);
+	pfree(str.data);
+
+	PG_RETURN_TEXT_P(result);
+}
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index 61f2c2f5b4..42792fca3c 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -10936,4 +10936,11 @@
   proname => 'is_normalized', prorettype => 'bool', proargtypes => 'text text',
   prosrc => 'unicode_is_normalized' },
 
+{ oid => '1136', descr => 'unescape Unicode chars in strings',
+  proname => 'unicode_unescape', prorettype => 'text', proargtypes => 'text text',
+  proisstrict => 'f', prosrc => 'unicode_uescape_with_escape_char' },
+
+{ oid => '1137', descr => 'unescape Unicode chars in strings',
+  proname => 'unicode_unescape', prorettype => 'text', proargtypes => 'text',
+  proisstrict => 't', prosrc => 'unicode_uescape' }
 ]
diff --git a/src/include/parser/scansup.h b/src/include/parser/scansup.h
index 7a6ee529ae..e1dc7b8a2a 100644
--- a/src/include/parser/scansup.h
+++ b/src/include/parser/scansup.h
@@ -15,6 +15,8 @@
 #ifndef SCANSUP_H
 #define SCANSUP_H
 
+#include "mb/pg_wchar.h"
+
 extern char *scanstr(const char *s);
 
 extern char *downcase_truncate_identifier(const char *ident, int len,
@@ -27,4 +29,10 @@ extern void truncate_identifier(char *ident, int len, bool warn);
 
 extern bool scanner_isspace(char ch);
 
+extern unsigned int hexval(unsigned char c);
+
+extern Oid check_unicode_value(pg_wchar c);
+
+extern bool check_uescapechar(unsigned char escape);
+
 #endif							/* SCANSUP_H */
diff --git a/src/test/regress/expected/unicode.out b/src/test/regress/expected/unicode.out
index 2a1e903696..deb67b566b 100644
--- a/src/test/regress/expected/unicode.out
+++ b/src/test/regress/expected/unicode.out
@@ -79,3 +79,15 @@ ORDER BY num;
 
 SELECT is_normalized('abc', 'def');  -- run-time error
 ERROR:  invalid normalization form: def
+SELECT unicode_unescape('\0441\043B\043E\043D');
+ unicode_unescape 
+------------------
+ слон
+(1 row)
+
+SELECT unicode_unescape('d!0061t!+000061', '!');
+ unicode_unescape 
+------------------
+ data
+(1 row)
+
diff --git a/src/test/regress/sql/unicode.sql b/src/test/regress/sql/unicode.sql
index ccfc6fa77a..fd99031a1a 100644
--- a/src/test/regress/sql/unicode.sql
+++ b/src/test/regress/sql/unicode.sql
@@ -30,3 +30,6 @@ FROM
 ORDER BY num;
 
 SELECT is_normalized('abc', 'def');  -- run-time error
+
+SELECT unicode_unescape('\0441\043B\043E\043D');
+SELECT unicode_unescape('d!0061t!+000061', '!');

Daniel Gustafsson

daniel@yesql.se

over 5 years ago

In reply to: Pavel Stehule (#2)

Re: proposal: unescape_text function

On 23 Jun 2020, at 11:51, Pavel Stehule <pavel.stehule@gmail.com> wrote:

I changed the name to more accurately "unicode_unescape". Patch is assigned

You've made this function return Oid, where it used to be void. Was that a
copy-paste mistake? Else the code needs fixing as it doesn't return an Oid.

+Oid
+check_unicode_value(pg_wchar c)
+{
+   if (!is_valid_unicode_codepoint(c))
+       ereport(ERROR,
+               (errcode(ERRCODE_SYNTAX_ERROR),
+                errmsg("invalid Unicode escape value")));
+}

cheers ./daniel

Pavel Stehule

pavel.stehule@gmail.com

over 5 years ago

In reply to: Daniel Gustafsson (#3)

1 attachment(s)

Re: proposal: unescape_text function

čt 2. 7. 2020 v 17:27 odesílatel Daniel Gustafsson <daniel@yesql.se> napsal:

On 23 Jun 2020, at 11:51, Pavel Stehule <pavel.stehule@gmail.com> wrote:

I changed the name to more accurately "unicode_unescape". Patch is

assigned

You've made this function return Oid, where it used to be void. Was that a
copy-paste mistake? Else the code needs fixing as it doesn't return an Oid.
+Oid
+check_unicode_value(pg_wchar c)
+{
+   if (!is_valid_unicode_codepoint(c))
+       ereport(ERROR,
+               (errcode(ERRCODE_SYNTAX_ERROR),
+                errmsg("invalid Unicode escape value")));
+}

yes, it is my error

I am sending fixed patch

Thank you for check

Pavel

cheers ./daniel

Show quoted text

Attachments:

unicode_uescape-20200702.patchtext/x-patch; charset=UTF-8; name=unicode_uescape-20200702.patchDownload

diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index f065856535..6aecdf1641 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -3539,6 +3539,24 @@ repeat('Pg', 4) <returnvalue>PgPgPgPg</returnvalue>
        </para></entry>
       </row>
 
+      <row>
+       <entry role="func_table_entry"><para role="func_signature">
+        <indexterm>
+         <primary>unicode_unescape</primary>
+        </indexterm>
+        <function>unicode_unescape</function> ( <parameter>string</parameter> <type>text</type>
+        <optional>, <parameter>escape_char</parameter> <type>text</type> </optional> )
+        <returnvalue>text</returnvalue>
+       </para>
+       <para>
+        Evaluate escaped unicode chars (4 or 6 digits) to chars.
+       </para>
+       <para>
+        <literal>unicode_unescape('\0441\043B\043E\043D')</literal>
+        <returnvalue>слон</returnvalue>
+       </para></entry>
+      </row>
+
      </tbody>
     </tgroup>
    </table>
diff --git a/src/backend/parser/parser.c b/src/backend/parser/parser.c
index be86eb37fe..c7f94298c1 100644
--- a/src/backend/parser/parser.c
+++ b/src/backend/parser/parser.c
@@ -26,7 +26,6 @@
 #include "parser/parser.h"
 #include "parser/scansup.h"
 
-static bool check_uescapechar(unsigned char escape);
 static char *str_udeescape(const char *str, char escape,
 						   int position, core_yyscan_t yyscanner);
 
@@ -278,44 +277,6 @@ base_yylex(YYSTYPE *lvalp, YYLTYPE *llocp, core_yyscan_t yyscanner)
 	return cur_token;
 }
 
-/* convert hex digit (caller should have verified that) to value */
-static unsigned int
-hexval(unsigned char c)
-{
-	if (c >= '0' && c <= '9')
-		return c - '0';
-	if (c >= 'a' && c <= 'f')
-		return c - 'a' + 0xA;
-	if (c >= 'A' && c <= 'F')
-		return c - 'A' + 0xA;
-	elog(ERROR, "invalid hexadecimal digit");
-	return 0;					/* not reached */
-}
-
-/* is Unicode code point acceptable? */
-static void
-check_unicode_value(pg_wchar c)
-{
-	if (!is_valid_unicode_codepoint(c))
-		ereport(ERROR,
-				(errcode(ERRCODE_SYNTAX_ERROR),
-				 errmsg("invalid Unicode escape value")));
-}
-
-/* is 'escape' acceptable as Unicode escape character (UESCAPE syntax) ? */
-static bool
-check_uescapechar(unsigned char escape)
-{
-	if (isxdigit(escape)
-		|| escape == '+'
-		|| escape == '\''
-		|| escape == '"'
-		|| scanner_isspace(escape))
-		return false;
-	else
-		return true;
-}
-
 /*
  * Process Unicode escapes in "str", producing a palloc'd plain string
  *
diff --git a/src/backend/parser/scansup.c b/src/backend/parser/scansup.c
index cac70d5df7..9d3173bc6d 100644
--- a/src/backend/parser/scansup.c
+++ b/src/backend/parser/scansup.c
@@ -218,3 +218,41 @@ scanner_isspace(char ch)
 		return true;
 	return false;
 }
+
+/* convert hex digit (caller should have verified that) to value */
+unsigned int
+hexval(unsigned char c)
+{
+	if (c >= '0' && c <= '9')
+		return c - '0';
+	if (c >= 'a' && c <= 'f')
+		return c - 'a' + 0xA;
+	if (c >= 'A' && c <= 'F')
+		return c - 'A' + 0xA;
+	elog(ERROR, "invalid hexadecimal digit");
+	return 0;					/* not reached */
+}
+
+/* is Unicode code point acceptable? */
+void
+check_unicode_value(pg_wchar c)
+{
+	if (!is_valid_unicode_codepoint(c))
+		ereport(ERROR,
+				(errcode(ERRCODE_SYNTAX_ERROR),
+				 errmsg("invalid Unicode escape value")));
+}
+
+/* is 'escape' acceptable as Unicode escape character (UESCAPE syntax) ? */
+bool
+check_uescapechar(unsigned char escape)
+{
+	if (isxdigit(escape)
+		|| escape == '+'
+		|| escape == '\''
+		|| escape == '"'
+		|| scanner_isspace(escape))
+		return false;
+	else
+		return true;
+}
diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c
index df10bfb906..ce8373c417 100644
--- a/src/backend/utils/adt/varlena.c
+++ b/src/backend/utils/adt/varlena.c
@@ -6139,3 +6139,202 @@ unicode_is_normalized(PG_FUNCTION_ARGS)
 
 	PG_RETURN_BOOL(result);
 }
+
+/*
+ * Process Unicode escapes in "str"
+ *
+ * escape: the escape character to use
+ */
+static void
+udeescape(StringInfo str, const char *instr, size_t len, char escape)
+{
+	pg_wchar	pair_first = 0;
+	char		cbuf[MAX_UNICODE_EQUIVALENT_STRING + 1];
+
+	while (len > 0)
+	{
+		if (instr[0] == escape)
+		{
+			if (len >= 2 &&
+				instr[1] == escape)
+			{
+				if (pair_first)
+					goto invalid_pair;
+				appendStringInfoChar(str, escape);
+				instr += 2;
+				len -= 2;
+			}
+			else if (len >= 5 &&
+					 isxdigit((unsigned char) instr[1]) &&
+					 isxdigit((unsigned char) instr[2]) &&
+					 isxdigit((unsigned char) instr[3]) &&
+					 isxdigit((unsigned char) instr[4]))
+			{
+				pg_wchar	unicode;
+
+				unicode = (hexval(instr[1]) << 12) +
+					(hexval(instr[2]) << 8) +
+					(hexval(instr[3]) << 4) +
+					hexval(instr[4]);
+				check_unicode_value(unicode);
+				if (pair_first)
+				{
+					if (is_utf16_surrogate_second(unicode))
+					{
+						unicode = surrogate_pair_to_codepoint(pair_first, unicode);
+						pair_first = 0;
+					}
+					else
+						goto invalid_pair;
+				}
+				else if (is_utf16_surrogate_second(unicode))
+					goto invalid_pair;
+
+				if (is_utf16_surrogate_first(unicode))
+					pair_first = unicode;
+				else
+				{
+					pg_unicode_to_server(unicode, (unsigned char *) cbuf);
+					appendStringInfoString(str, cbuf);
+				}
+				instr += 5;
+				len -= 5;
+			}
+			else if (len >= 8 &&
+					 instr[1] == '+' &&
+					 isxdigit((unsigned char) instr[2]) &&
+					 isxdigit((unsigned char) instr[3]) &&
+					 isxdigit((unsigned char) instr[4]) &&
+					 isxdigit((unsigned char) instr[5]) &&
+					 isxdigit((unsigned char) instr[6]) &&
+					 isxdigit((unsigned char) instr[7]))
+			{
+				pg_wchar	unicode;
+
+				unicode = (hexval(instr[2]) << 20) +
+					(hexval(instr[3]) << 16) +
+					(hexval(instr[4]) << 12) +
+					(hexval(instr[5]) << 8) +
+					(hexval(instr[6]) << 4) +
+					hexval(instr[7]);
+				check_unicode_value(unicode);
+				if (pair_first)
+				{
+					if (is_utf16_surrogate_second(unicode))
+					{
+						unicode = surrogate_pair_to_codepoint(pair_first, unicode);
+						pair_first = 0;
+					}
+					else
+						goto invalid_pair;
+				}
+				else if (is_utf16_surrogate_second(unicode))
+					goto invalid_pair;
+
+				if (is_utf16_surrogate_first(unicode))
+					pair_first = unicode;
+				else
+				{
+					pg_unicode_to_server(unicode, (unsigned char *) cbuf);
+					appendStringInfoString(str, cbuf);
+				}
+				instr += 8;
+				len -= 8;
+			}
+			else
+				ereport(ERROR,
+						(errcode(ERRCODE_SYNTAX_ERROR),
+						 errmsg("invalid Unicode escape"),
+						 errhint("Unicode escapes must be \\XXXX or \\+XXXXXX.")));
+		}
+		else
+		{
+			if (pair_first)
+				goto invalid_pair;
+
+			appendStringInfoChar(str, *instr++);
+			len--;
+		}
+	}
+
+	/* unfinished surrogate pair? */
+	if (pair_first)
+		goto invalid_pair;
+
+	return;
+
+invalid_pair:
+	ereport(ERROR,
+			(errcode(ERRCODE_SYNTAX_ERROR),
+			 errmsg("invalid Unicode surrogate pair")));
+}
+
+/*
+ * Unescape unicode strings
+ */
+Datum
+unicode_uescape_with_escape_char(PG_FUNCTION_ARGS)
+{
+	StringInfoData		str;
+	text	   *input_text;
+	text	   *escchr_text;
+	text	   *result;
+	const char *escchr_ptr;
+
+	/* when input string is NULL, then result is NULL too */
+	if (PG_ARGISNULL(0))
+		PG_RETURN_NULL();
+
+	input_text = PG_GETARG_TEXT_PP(0);
+
+	if (PG_ARGISNULL(1))
+		ereport(ERROR,
+				(errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
+				 errmsg("null value not allowed for escape char")));
+
+	escchr_text = PG_GETARG_TEXT_PP(1);
+	escchr_ptr = VARDATA_ANY(escchr_text);
+
+	if (VARSIZE_ANY_EXHDR(escchr_text) == 1 && !check_uescapechar(*escchr_ptr))
+		ereport(ERROR,
+				(errcode(ERRCODE_SYNTAX_ERROR),
+				 errmsg("invalid Unicode escape character")));
+
+	initStringInfo(&str);
+
+	udeescape(&str,
+			  VARDATA_ANY(input_text),
+			  VARSIZE_ANY_EXHDR(input_text),
+			  *escchr_ptr);
+
+	result = cstring_to_text_with_len(str.data, str.len);
+	pfree(str.data);
+
+	PG_RETURN_TEXT_P(result);
+}
+
+Datum
+unicode_uescape(PG_FUNCTION_ARGS)
+{
+	StringInfoData		str;
+	text	   *input_text;
+	text	   *result;
+
+	/* when input string is NULL, then result is NULL too */
+	if (PG_ARGISNULL(0))
+		PG_RETURN_NULL();
+
+	input_text = PG_GETARG_TEXT_PP(0);
+
+	initStringInfo(&str);
+
+	udeescape(&str,
+			  VARDATA_ANY(input_text),
+			  VARSIZE_ANY_EXHDR(input_text),
+			  '\\');
+
+	result = cstring_to_text_with_len(str.data, str.len);
+	pfree(str.data);
+
+	PG_RETURN_TEXT_P(result);
+}
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index 38295aca48..fd3f84fb49 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -10948,4 +10948,11 @@
   proname => 'is_normalized', prorettype => 'bool', proargtypes => 'text text',
   prosrc => 'unicode_is_normalized' },
 
+{ oid => '1136', descr => 'unescape Unicode chars in strings',
+  proname => 'unicode_unescape', prorettype => 'text', proargtypes => 'text text',
+  proisstrict => 'f', prosrc => 'unicode_uescape_with_escape_char' },
+
+{ oid => '1137', descr => 'unescape Unicode chars in strings',
+  proname => 'unicode_unescape', prorettype => 'text', proargtypes => 'text',
+  proisstrict => 't', prosrc => 'unicode_uescape' }
 ]
diff --git a/src/include/parser/scansup.h b/src/include/parser/scansup.h
index 7a6ee529ae..ca2b6d249e 100644
--- a/src/include/parser/scansup.h
+++ b/src/include/parser/scansup.h
@@ -15,6 +15,8 @@
 #ifndef SCANSUP_H
 #define SCANSUP_H
 
+#include "mb/pg_wchar.h"
+
 extern char *scanstr(const char *s);
 
 extern char *downcase_truncate_identifier(const char *ident, int len,
@@ -27,4 +29,10 @@ extern void truncate_identifier(char *ident, int len, bool warn);
 
 extern bool scanner_isspace(char ch);
 
+extern unsigned int hexval(unsigned char c);
+
+extern void check_unicode_value(pg_wchar c);
+
+extern bool check_uescapechar(unsigned char escape);
+
 #endif							/* SCANSUP_H */
diff --git a/src/test/regress/expected/unicode.out b/src/test/regress/expected/unicode.out
index 2a1e903696..deb67b566b 100644
--- a/src/test/regress/expected/unicode.out
+++ b/src/test/regress/expected/unicode.out
@@ -79,3 +79,15 @@ ORDER BY num;
 
 SELECT is_normalized('abc', 'def');  -- run-time error
 ERROR:  invalid normalization form: def
+SELECT unicode_unescape('\0441\043B\043E\043D');
+ unicode_unescape 
+------------------
+ слон
+(1 row)
+
+SELECT unicode_unescape('d!0061t!+000061', '!');
+ unicode_unescape 
+------------------
+ data
+(1 row)
+
diff --git a/src/test/regress/sql/unicode.sql b/src/test/regress/sql/unicode.sql
index ccfc6fa77a..fd99031a1a 100644
--- a/src/test/regress/sql/unicode.sql
+++ b/src/test/regress/sql/unicode.sql
@@ -30,3 +30,6 @@ FROM
 ORDER BY num;
 
 SELECT is_normalized('abc', 'def');  -- run-time error
+
+SELECT unicode_unescape('\0441\043B\043E\043D');
+SELECT unicode_unescape('d!0061t!+000061', '!');

Asif Rehman

asifr.rehman@gmail.com

over 5 years ago

In reply to: Pavel Stehule (#4)

Re: proposal: unescape_text function

On Thu, Jul 2, 2020 at 10:10 PM Pavel Stehule <pavel.stehule@gmail.com>
wrote:

čt 2. 7. 2020 v 17:27 odesílatel Daniel Gustafsson <daniel@yesql.se>
napsal:
On 23 Jun 2020, at 11:51, Pavel Stehule <pavel.stehule@gmail.com>

wrote:

I changed the name to more accurately "unicode_unescape". Patch is

assigned

You've made this function return Oid, where it used to be void. Was that
a
copy-paste mistake? Else the code needs fixing as it doesn't return an
Oid.
+Oid
+check_unicode_value(pg_wchar c)
+{
+   if (!is_valid_unicode_codepoint(c))
+       ereport(ERROR,
+               (errcode(ERRCODE_SYNTAX_ERROR),
+                errmsg("invalid Unicode escape value")));
+}
yes, it is my error

I am sending fixed patch

Thank you for check

Pavel

cheers ./daniel

Hi Pavel,

Since the idea originated from unescaping unicode string literals i.e.
select unescape('Odpov\u011Bdn\u00E1 osoba');

Shouldn't the built-in function support the above syntax as well?

--
Asif Rehman
Highgo Software (Canada/China/Pakistan)
URL : www.highgo.ca

Pavel Stehule

pavel.stehule@gmail.com

over 5 years ago

In reply to: Asif Rehman (#5)

1 attachment(s)

Re: proposal: unescape_text function

Hi Pavel,

Since the idea originated from unescaping unicode string literals i.e.
select unescape('Odpov\u011Bdn\u00E1 osoba');

Shouldn't the built-in function support the above syntax as well?

good idea. The prefixes u (4 digits) and U (8 digits) are supported

Regards

Pavel

Show quoted text

--
Asif Rehman
Highgo Software (Canada/China/Pakistan)
URL : www.highgo.ca

Attachments:

unicode_unescape-20200729.patchtext/x-patch; charset=UTF-8; name=unicode_unescape-20200729.patchDownload

diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index 959f6a1c2f..126d3483e6 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -3539,6 +3539,38 @@ repeat('Pg', 4) <returnvalue>PgPgPgPg</returnvalue>
        </para></entry>
       </row>
 
+      <row>
+       <entry role="func_table_entry"><para role="func_signature">
+        <indexterm>
+         <primary>unicode_unescape</primary>
+        </indexterm>
+        <function>unicode_unescape</function> ( <parameter>string</parameter> <type>text</type>
+        <optional>, <parameter>escape_char</parameter> <type>text</type> </optional> )
+        <returnvalue>text</returnvalue>
+       </para>
+       <para>
+        Evaluate escaped unicode chars (4 or 6 digits), with prefix
+        <literal>u</literal> (4 digits) or with prefix
+        <literal>U</literal> (8 digits) to chars.
+       </para>
+       <para>
+        <literal>unicode_unescape('\0441\043B\043E\043D')</literal>
+        <returnvalue>слон</returnvalue>
+       </para>
+       <para>
+        <literal>unicode_unescape('d\0061t\+000061')</literal>
+        <returnvalue>data</returnvalue>
+       </para>
+       <para>
+        <literal>unicode_unescape('d!0061t!+000061', '!')</literal>
+        <returnvalue>data</returnvalue>
+       </para>
+       <para>
+        <literal>unicode_unescape('d\u0061t\U00000061')</literal>
+        <returnvalue>data</returnvalue>
+       </para></entry>
+      </row>
+
      </tbody>
     </tgroup>
    </table>
diff --git a/src/backend/parser/parser.c b/src/backend/parser/parser.c
index be86eb37fe..c7f94298c1 100644
--- a/src/backend/parser/parser.c
+++ b/src/backend/parser/parser.c
@@ -26,7 +26,6 @@
 #include "parser/parser.h"
 #include "parser/scansup.h"
 
-static bool check_uescapechar(unsigned char escape);
 static char *str_udeescape(const char *str, char escape,
 						   int position, core_yyscan_t yyscanner);
 
@@ -278,44 +277,6 @@ base_yylex(YYSTYPE *lvalp, YYLTYPE *llocp, core_yyscan_t yyscanner)
 	return cur_token;
 }
 
-/* convert hex digit (caller should have verified that) to value */
-static unsigned int
-hexval(unsigned char c)
-{
-	if (c >= '0' && c <= '9')
-		return c - '0';
-	if (c >= 'a' && c <= 'f')
-		return c - 'a' + 0xA;
-	if (c >= 'A' && c <= 'F')
-		return c - 'A' + 0xA;
-	elog(ERROR, "invalid hexadecimal digit");
-	return 0;					/* not reached */
-}
-
-/* is Unicode code point acceptable? */
-static void
-check_unicode_value(pg_wchar c)
-{
-	if (!is_valid_unicode_codepoint(c))
-		ereport(ERROR,
-				(errcode(ERRCODE_SYNTAX_ERROR),
-				 errmsg("invalid Unicode escape value")));
-}
-
-/* is 'escape' acceptable as Unicode escape character (UESCAPE syntax) ? */
-static bool
-check_uescapechar(unsigned char escape)
-{
-	if (isxdigit(escape)
-		|| escape == '+'
-		|| escape == '\''
-		|| escape == '"'
-		|| scanner_isspace(escape))
-		return false;
-	else
-		return true;
-}
-
 /*
  * Process Unicode escapes in "str", producing a palloc'd plain string
  *
diff --git a/src/backend/parser/scansup.c b/src/backend/parser/scansup.c
index cac70d5df7..9d3173bc6d 100644
--- a/src/backend/parser/scansup.c
+++ b/src/backend/parser/scansup.c
@@ -218,3 +218,41 @@ scanner_isspace(char ch)
 		return true;
 	return false;
 }
+
+/* convert hex digit (caller should have verified that) to value */
+unsigned int
+hexval(unsigned char c)
+{
+	if (c >= '0' && c <= '9')
+		return c - '0';
+	if (c >= 'a' && c <= 'f')
+		return c - 'a' + 0xA;
+	if (c >= 'A' && c <= 'F')
+		return c - 'A' + 0xA;
+	elog(ERROR, "invalid hexadecimal digit");
+	return 0;					/* not reached */
+}
+
+/* is Unicode code point acceptable? */
+void
+check_unicode_value(pg_wchar c)
+{
+	if (!is_valid_unicode_codepoint(c))
+		ereport(ERROR,
+				(errcode(ERRCODE_SYNTAX_ERROR),
+				 errmsg("invalid Unicode escape value")));
+}
+
+/* is 'escape' acceptable as Unicode escape character (UESCAPE syntax) ? */
+bool
+check_uescapechar(unsigned char escape)
+{
+	if (isxdigit(escape)
+		|| escape == '+'
+		|| escape == '\''
+		|| escape == '"'
+		|| scanner_isspace(escape))
+		return false;
+	else
+		return true;
+}
diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c
index df10bfb906..5ca9817708 100644
--- a/src/backend/utils/adt/varlena.c
+++ b/src/backend/utils/adt/varlena.c
@@ -6139,3 +6139,256 @@ unicode_is_normalized(PG_FUNCTION_ARGS)
 
 	PG_RETURN_BOOL(result);
 }
+
+/*
+ * First four chars should be hexnum digits
+ */
+static bool
+isxdigit_four(const char *instr)
+{
+	return isxdigit((unsigned char)  instr[0]) &&
+			isxdigit((unsigned char) instr[1]) &&
+			isxdigit((unsigned char) instr[2]) &&
+			isxdigit((unsigned char) instr[3]);
+}
+
+/*
+ * Translate string with hexadecimal digits to number
+ */
+static long int
+hexval_four(const char *instr)
+{
+	return (hexval(instr[0]) << 12) +
+			(hexval(instr[1]) << 8) +
+			(hexval(instr[2]) << 4) +
+			 hexval(instr[3]);
+}
+
+/*
+ * Process Unicode escapes in "str"
+ *
+ * escape: the escape character to use
+ */
+static void
+udeescape(StringInfo str, const char *instr, size_t len, char escape)
+{
+	pg_wchar	pair_first = 0;
+	char		cbuf[MAX_UNICODE_EQUIVALENT_STRING + 1];
+
+	while (len > 0)
+	{
+		if (instr[0] == escape)
+		{
+			if (len >= 2 &&
+				instr[1] == escape)
+			{
+				if (pair_first)
+					goto invalid_pair;
+				appendStringInfoChar(str, escape);
+				instr += 2;
+				len -= 2;
+			}
+			else if ((len >= 5 && isxdigit_four(&instr[1])) ||
+					 (len >= 6 && instr[1] == 'u' && isxdigit_four(&instr[2])))
+			{
+				pg_wchar	unicode;
+				int			offset = instr[1] == 'u' ? 2 : 1;
+
+				unicode = hexval_four(instr + offset);
+
+				check_unicode_value(unicode);
+
+				if (pair_first)
+				{
+					if (is_utf16_surrogate_second(unicode))
+					{
+						unicode = surrogate_pair_to_codepoint(pair_first, unicode);
+						pair_first = 0;
+					}
+					else
+						goto invalid_pair;
+				}
+				else if (is_utf16_surrogate_second(unicode))
+					goto invalid_pair;
+
+				if (is_utf16_surrogate_first(unicode))
+					pair_first = unicode;
+				else
+				{
+					pg_unicode_to_server(unicode, (unsigned char *) cbuf);
+					appendStringInfoString(str, cbuf);
+				}
+
+				instr += 4 + offset;
+				len -= 4 + offset;
+			}
+			else if (len >= 8 &&
+					 instr[1] == '+' &&
+					 isxdigit_four(&instr[2]) &&
+					 isxdigit((unsigned char) instr[6]) &&
+					 isxdigit((unsigned char) instr[7]))
+			{
+				pg_wchar	unicode;
+
+				unicode = (hexval_four(&instr[2]) << 8) +
+								(hexval(instr[6]) << 4) +
+								 hexval(instr[7]);
+
+				check_unicode_value(unicode);
+
+				if (pair_first)
+				{
+					if (is_utf16_surrogate_second(unicode))
+					{
+						unicode = surrogate_pair_to_codepoint(pair_first, unicode);
+						pair_first = 0;
+					}
+					else
+						goto invalid_pair;
+				}
+				else if (is_utf16_surrogate_second(unicode))
+					goto invalid_pair;
+
+				if (is_utf16_surrogate_first(unicode))
+					pair_first = unicode;
+				else
+				{
+					pg_unicode_to_server(unicode, (unsigned char *) cbuf);
+					appendStringInfoString(str, cbuf);
+				}
+
+				instr += 8;
+				len -= 8;
+			}
+			else if (len >= 10 &&
+					 instr[1] == 'U' &&
+					 isxdigit_four(&instr[2]) &&
+					 isxdigit_four(&instr[6]))
+			{
+				pg_wchar	unicode;
+
+				unicode = (hexval_four(&instr[2]) << 16) + hexval_four(&instr[6]);
+
+				check_unicode_value(unicode);
+
+				if (pair_first)
+				{
+					if (is_utf16_surrogate_second(unicode))
+					{
+						unicode = surrogate_pair_to_codepoint(pair_first, unicode);
+						pair_first = 0;
+					}
+					else
+						goto invalid_pair;
+				}
+				else if (is_utf16_surrogate_second(unicode))
+					goto invalid_pair;
+
+				if (is_utf16_surrogate_first(unicode))
+					pair_first = unicode;
+				else
+				{
+					pg_unicode_to_server(unicode, (unsigned char *) cbuf);
+					appendStringInfoString(str, cbuf);
+				}
+
+				instr += 10;
+				len -= 10;
+			}
+			else
+				ereport(ERROR,
+						(errcode(ERRCODE_SYNTAX_ERROR),
+						 errmsg("invalid Unicode escape"),
+						 errhint("Unicode escapes must be \\XXXX, \\+XXXXXX, \\uXXXX or \\UXXXXXXXX.")));
+		}
+		else
+		{
+			if (pair_first)
+				goto invalid_pair;
+
+			appendStringInfoChar(str, *instr++);
+			len--;
+		}
+	}
+
+	/* unfinished surrogate pair? */
+	if (pair_first)
+		goto invalid_pair;
+
+	return;
+
+invalid_pair:
+	ereport(ERROR,
+			(errcode(ERRCODE_SYNTAX_ERROR),
+			 errmsg("invalid Unicode surrogate pair")));
+}
+
+/*
+ * Unescape unicode strings
+ */
+Datum
+unicode_uescape_with_escape_char(PG_FUNCTION_ARGS)
+{
+	StringInfoData		str;
+	text	   *input_text;
+	text	   *escchr_text;
+	text	   *result;
+	const char *escchr_ptr;
+
+	/* when input string is NULL, then result is NULL too */
+	if (PG_ARGISNULL(0))
+		PG_RETURN_NULL();
+
+	input_text = PG_GETARG_TEXT_PP(0);
+
+	if (PG_ARGISNULL(1))
+		ereport(ERROR,
+				(errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
+				 errmsg("null value not allowed for escape char")));
+
+	escchr_text = PG_GETARG_TEXT_PP(1);
+	escchr_ptr = VARDATA_ANY(escchr_text);
+
+	if (VARSIZE_ANY_EXHDR(escchr_text) == 1 && !check_uescapechar(*escchr_ptr))
+		ereport(ERROR,
+				(errcode(ERRCODE_SYNTAX_ERROR),
+				 errmsg("invalid Unicode escape character")));
+
+	initStringInfo(&str);
+
+	udeescape(&str,
+			  VARDATA_ANY(input_text),
+			  VARSIZE_ANY_EXHDR(input_text),
+			  *escchr_ptr);
+
+	result = cstring_to_text_with_len(str.data, str.len);
+	pfree(str.data);
+
+	PG_RETURN_TEXT_P(result);
+}
+
+Datum
+unicode_uescape(PG_FUNCTION_ARGS)
+{
+	StringInfoData		str;
+	text	   *input_text;
+	text	   *result;
+
+	/* when input string is NULL, then result is NULL too */
+	if (PG_ARGISNULL(0))
+		PG_RETURN_NULL();
+
+	input_text = PG_GETARG_TEXT_PP(0);
+
+	initStringInfo(&str);
+
+	udeescape(&str,
+			  VARDATA_ANY(input_text),
+			  VARSIZE_ANY_EXHDR(input_text),
+			  '\\');
+
+	result = cstring_to_text_with_len(str.data, str.len);
+	pfree(str.data);
+
+	PG_RETURN_TEXT_P(result);
+}
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index 082a11f270..75359c113a 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -10954,4 +10954,11 @@
   proname => 'is_normalized', prorettype => 'bool', proargtypes => 'text text',
   prosrc => 'unicode_is_normalized' },
 
+{ oid => '1136', descr => 'unescape Unicode chars in strings',
+  proname => 'unicode_unescape', prorettype => 'text', proargtypes => 'text text',
+  proisstrict => 'f', prosrc => 'unicode_uescape_with_escape_char' },
+
+{ oid => '1137', descr => 'unescape Unicode chars in strings',
+  proname => 'unicode_unescape', prorettype => 'text', proargtypes => 'text',
+  proisstrict => 't', prosrc => 'unicode_uescape' }
 ]
diff --git a/src/include/parser/scansup.h b/src/include/parser/scansup.h
index 7a6ee529ae..ca2b6d249e 100644
--- a/src/include/parser/scansup.h
+++ b/src/include/parser/scansup.h
@@ -15,6 +15,8 @@
 #ifndef SCANSUP_H
 #define SCANSUP_H
 
+#include "mb/pg_wchar.h"
+
 extern char *scanstr(const char *s);
 
 extern char *downcase_truncate_identifier(const char *ident, int len,
@@ -27,4 +29,10 @@ extern void truncate_identifier(char *ident, int len, bool warn);
 
 extern bool scanner_isspace(char ch);
 
+extern unsigned int hexval(unsigned char c);
+
+extern void check_unicode_value(pg_wchar c);
+
+extern bool check_uescapechar(unsigned char escape);
+
 #endif							/* SCANSUP_H */
diff --git a/src/test/regress/expected/unicode.out b/src/test/regress/expected/unicode.out
index 2a1e903696..03d43358e5 100644
--- a/src/test/regress/expected/unicode.out
+++ b/src/test/regress/expected/unicode.out
@@ -79,3 +79,36 @@ ORDER BY num;
 
 SELECT is_normalized('abc', 'def');  -- run-time error
 ERROR:  invalid normalization form: def
+SELECT unicode_unescape('\0441\043B\043E\043D');
+ unicode_unescape 
+------------------
+ слон
+(1 row)
+
+SELECT unicode_unescape('d!0061t!+000061', '!');
+ unicode_unescape 
+------------------
+ data
+(1 row)
+
+SELECT unicode_unescape('d\u0061t\U00000061');
+ unicode_unescape 
+------------------
+ data
+(1 row)
+
+-- run-time error
+SELECT unicode_unescape('wrong: \db99');
+ERROR:  invalid Unicode surrogate pair
+SELECT unicode_unescape('wrong: \db99\0061');
+ERROR:  invalid Unicode surrogate pair
+SELECT unicode_unescape('wrong: \+00db99\+000061');
+ERROR:  invalid Unicode surrogate pair
+SELECT unicode_unescape('wrong: \+2FFFFF');
+ERROR:  invalid Unicode escape value
+SELECT unicode_unescape('wrong: \udb99\u0061');
+ERROR:  invalid Unicode surrogate pair
+SELECT unicode_unescape('wrong: \U0000db99\U00000061');
+ERROR:  invalid Unicode surrogate pair
+SELECT unicode_unescape('wrong: \U002FFFFF');
+ERROR:  invalid Unicode escape value
diff --git a/src/test/regress/sql/unicode.sql b/src/test/regress/sql/unicode.sql
index ccfc6fa77a..ba7d61e73a 100644
--- a/src/test/regress/sql/unicode.sql
+++ b/src/test/regress/sql/unicode.sql
@@ -30,3 +30,16 @@ FROM
 ORDER BY num;
 
 SELECT is_normalized('abc', 'def');  -- run-time error
+
+SELECT unicode_unescape('\0441\043B\043E\043D');
+SELECT unicode_unescape('d!0061t!+000061', '!');
+SELECT unicode_unescape('d\u0061t\U00000061');
+
+-- run-time error
+SELECT unicode_unescape('wrong: \db99');
+SELECT unicode_unescape('wrong: \db99\0061');
+SELECT unicode_unescape('wrong: \+00db99\+000061');
+SELECT unicode_unescape('wrong: \+2FFFFF');
+SELECT unicode_unescape('wrong: \udb99\u0061');
+SELECT unicode_unescape('wrong: \U0000db99\U00000061');
+SELECT unicode_unescape('wrong: \U002FFFFF');

Asif Rehman

asifr.rehman@gmail.com

over 5 years ago

In reply to: Pavel Stehule (#6)

Re: proposal: unescape_text function

The following review has been posted through the commitfest application:
make installcheck-world: tested, passed
Implements feature: tested, passed
Spec compliant: tested, passed
Documentation: tested, passed

Hi,

The patch looks good to me.

The new status of this patch is: Ready for Committer

Pavel Stehule

pavel.stehule@gmail.com

over 5 years ago

In reply to: Pavel Stehule (#6)

1 attachment(s)

Re: proposal: unescape_text function

st 29. 7. 2020 v 8:18 odesílatel Pavel Stehule <pavel.stehule@gmail.com>
napsal:

Hi

Hi Pavel,

Since the idea originated from unescaping unicode string literals i.e.
select unescape('Odpov\u011Bdn\u00E1 osoba');

Shouldn't the built-in function support the above syntax as well?

good idea. The prefixes u (4 digits) and U (8 digits) are supported

Regards

rebase

Regards

Pavel

Show quoted text

Pavel

--
Asif Rehman
Highgo Software (Canada/China/Pakistan)
URL : www.highgo.ca

Attachments:

unicode_unescape-20201007.patchtext/x-patch; charset=UTF-8; name=unicode_unescape-20201007.patchDownload

diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index e7cff980dd..93fa238280 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -3570,6 +3570,38 @@ repeat('Pg', 4) <returnvalue>PgPgPgPg</returnvalue>
        </para></entry>
       </row>
 
+      <row>
+       <entry role="func_table_entry"><para role="func_signature">
+        <indexterm>
+         <primary>unicode_unescape</primary>
+        </indexterm>
+        <function>unicode_unescape</function> ( <parameter>string</parameter> <type>text</type>
+        <optional>, <parameter>escape_char</parameter> <type>text</type> </optional> )
+        <returnvalue>text</returnvalue>
+       </para>
+       <para>
+        Evaluate escaped unicode chars (4 or 6 digits), with prefix
+        <literal>u</literal> (4 digits) or with prefix
+        <literal>U</literal> (8 digits) to chars.
+       </para>
+       <para>
+        <literal>unicode_unescape('\0441\043B\043E\043D')</literal>
+        <returnvalue>слон</returnvalue>
+       </para>
+       <para>
+        <literal>unicode_unescape('d\0061t\+000061')</literal>
+        <returnvalue>data</returnvalue>
+       </para>
+       <para>
+        <literal>unicode_unescape('d!0061t!+000061', '!')</literal>
+        <returnvalue>data</returnvalue>
+       </para>
+       <para>
+        <literal>unicode_unescape('d\u0061t\U00000061')</literal>
+        <returnvalue>data</returnvalue>
+       </para></entry>
+      </row>
+
      </tbody>
     </tgroup>
    </table>
diff --git a/src/backend/parser/parser.c b/src/backend/parser/parser.c
index be86eb37fe..c7f94298c1 100644
--- a/src/backend/parser/parser.c
+++ b/src/backend/parser/parser.c
@@ -26,7 +26,6 @@
 #include "parser/parser.h"
 #include "parser/scansup.h"
 
-static bool check_uescapechar(unsigned char escape);
 static char *str_udeescape(const char *str, char escape,
 						   int position, core_yyscan_t yyscanner);
 
@@ -278,44 +277,6 @@ base_yylex(YYSTYPE *lvalp, YYLTYPE *llocp, core_yyscan_t yyscanner)
 	return cur_token;
 }
 
-/* convert hex digit (caller should have verified that) to value */
-static unsigned int
-hexval(unsigned char c)
-{
-	if (c >= '0' && c <= '9')
-		return c - '0';
-	if (c >= 'a' && c <= 'f')
-		return c - 'a' + 0xA;
-	if (c >= 'A' && c <= 'F')
-		return c - 'A' + 0xA;
-	elog(ERROR, "invalid hexadecimal digit");
-	return 0;					/* not reached */
-}
-
-/* is Unicode code point acceptable? */
-static void
-check_unicode_value(pg_wchar c)
-{
-	if (!is_valid_unicode_codepoint(c))
-		ereport(ERROR,
-				(errcode(ERRCODE_SYNTAX_ERROR),
-				 errmsg("invalid Unicode escape value")));
-}
-
-/* is 'escape' acceptable as Unicode escape character (UESCAPE syntax) ? */
-static bool
-check_uescapechar(unsigned char escape)
-{
-	if (isxdigit(escape)
-		|| escape == '+'
-		|| escape == '\''
-		|| escape == '"'
-		|| scanner_isspace(escape))
-		return false;
-	else
-		return true;
-}
-
 /*
  * Process Unicode escapes in "str", producing a palloc'd plain string
  *
diff --git a/src/backend/parser/scansup.c b/src/backend/parser/scansup.c
index d07cbafcee..a2d3f629e9 100644
--- a/src/backend/parser/scansup.c
+++ b/src/backend/parser/scansup.c
@@ -125,3 +125,41 @@ scanner_isspace(char ch)
 		return true;
 	return false;
 }
+
+/* convert hex digit (caller should have verified that) to value */
+unsigned int
+hexval(unsigned char c)
+{
+	if (c >= '0' && c <= '9')
+		return c - '0';
+	if (c >= 'a' && c <= 'f')
+		return c - 'a' + 0xA;
+	if (c >= 'A' && c <= 'F')
+		return c - 'A' + 0xA;
+	elog(ERROR, "invalid hexadecimal digit");
+	return 0;					/* not reached */
+}
+
+/* is Unicode code point acceptable? */
+void
+check_unicode_value(pg_wchar c)
+{
+	if (!is_valid_unicode_codepoint(c))
+		ereport(ERROR,
+				(errcode(ERRCODE_SYNTAX_ERROR),
+				 errmsg("invalid Unicode escape value")));
+}
+
+/* is 'escape' acceptable as Unicode escape character (UESCAPE syntax) ? */
+bool
+check_uescapechar(unsigned char escape)
+{
+	if (isxdigit(escape)
+		|| escape == '+'
+		|| escape == '\''
+		|| escape == '"'
+		|| scanner_isspace(escape))
+		return false;
+	else
+		return true;
+}
diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c
index d7bc330541..add6ac3d18 100644
--- a/src/backend/utils/adt/varlena.c
+++ b/src/backend/utils/adt/varlena.c
@@ -6237,3 +6237,256 @@ unicode_is_normalized(PG_FUNCTION_ARGS)
 
 	PG_RETURN_BOOL(result);
 }
+
+/*
+ * First four chars should be hexnum digits
+ */
+static bool
+isxdigit_four(const char *instr)
+{
+	return isxdigit((unsigned char)  instr[0]) &&
+			isxdigit((unsigned char) instr[1]) &&
+			isxdigit((unsigned char) instr[2]) &&
+			isxdigit((unsigned char) instr[3]);
+}
+
+/*
+ * Translate string with hexadecimal digits to number
+ */
+static long int
+hexval_four(const char *instr)
+{
+	return (hexval(instr[0]) << 12) +
+			(hexval(instr[1]) << 8) +
+			(hexval(instr[2]) << 4) +
+			 hexval(instr[3]);
+}
+
+/*
+ * Process Unicode escapes in "str"
+ *
+ * escape: the escape character to use
+ */
+static void
+udeescape(StringInfo str, const char *instr, size_t len, char escape)
+{
+	pg_wchar	pair_first = 0;
+	char		cbuf[MAX_UNICODE_EQUIVALENT_STRING + 1];
+
+	while (len > 0)
+	{
+		if (instr[0] == escape)
+		{
+			if (len >= 2 &&
+				instr[1] == escape)
+			{
+				if (pair_first)
+					goto invalid_pair;
+				appendStringInfoChar(str, escape);
+				instr += 2;
+				len -= 2;
+			}
+			else if ((len >= 5 && isxdigit_four(&instr[1])) ||
+					 (len >= 6 && instr[1] == 'u' && isxdigit_four(&instr[2])))
+			{
+				pg_wchar	unicode;
+				int			offset = instr[1] == 'u' ? 2 : 1;
+
+				unicode = hexval_four(instr + offset);
+
+				check_unicode_value(unicode);
+
+				if (pair_first)
+				{
+					if (is_utf16_surrogate_second(unicode))
+					{
+						unicode = surrogate_pair_to_codepoint(pair_first, unicode);
+						pair_first = 0;
+					}
+					else
+						goto invalid_pair;
+				}
+				else if (is_utf16_surrogate_second(unicode))
+					goto invalid_pair;
+
+				if (is_utf16_surrogate_first(unicode))
+					pair_first = unicode;
+				else
+				{
+					pg_unicode_to_server(unicode, (unsigned char *) cbuf);
+					appendStringInfoString(str, cbuf);
+				}
+
+				instr += 4 + offset;
+				len -= 4 + offset;
+			}
+			else if (len >= 8 &&
+					 instr[1] == '+' &&
+					 isxdigit_four(&instr[2]) &&
+					 isxdigit((unsigned char) instr[6]) &&
+					 isxdigit((unsigned char) instr[7]))
+			{
+				pg_wchar	unicode;
+
+				unicode = (hexval_four(&instr[2]) << 8) +
+								(hexval(instr[6]) << 4) +
+								 hexval(instr[7]);
+
+				check_unicode_value(unicode);
+
+				if (pair_first)
+				{
+					if (is_utf16_surrogate_second(unicode))
+					{
+						unicode = surrogate_pair_to_codepoint(pair_first, unicode);
+						pair_first = 0;
+					}
+					else
+						goto invalid_pair;
+				}
+				else if (is_utf16_surrogate_second(unicode))
+					goto invalid_pair;
+
+				if (is_utf16_surrogate_first(unicode))
+					pair_first = unicode;
+				else
+				{
+					pg_unicode_to_server(unicode, (unsigned char *) cbuf);
+					appendStringInfoString(str, cbuf);
+				}
+
+				instr += 8;
+				len -= 8;
+			}
+			else if (len >= 10 &&
+					 instr[1] == 'U' &&
+					 isxdigit_four(&instr[2]) &&
+					 isxdigit_four(&instr[6]))
+			{
+				pg_wchar	unicode;
+
+				unicode = (hexval_four(&instr[2]) << 16) + hexval_four(&instr[6]);
+
+				check_unicode_value(unicode);
+
+				if (pair_first)
+				{
+					if (is_utf16_surrogate_second(unicode))
+					{
+						unicode = surrogate_pair_to_codepoint(pair_first, unicode);
+						pair_first = 0;
+					}
+					else
+						goto invalid_pair;
+				}
+				else if (is_utf16_surrogate_second(unicode))
+					goto invalid_pair;
+
+				if (is_utf16_surrogate_first(unicode))
+					pair_first = unicode;
+				else
+				{
+					pg_unicode_to_server(unicode, (unsigned char *) cbuf);
+					appendStringInfoString(str, cbuf);
+				}
+
+				instr += 10;
+				len -= 10;
+			}
+			else
+				ereport(ERROR,
+						(errcode(ERRCODE_SYNTAX_ERROR),
+						 errmsg("invalid Unicode escape"),
+						 errhint("Unicode escapes must be \\XXXX, \\+XXXXXX, \\uXXXX or \\UXXXXXXXX.")));
+		}
+		else
+		{
+			if (pair_first)
+				goto invalid_pair;
+
+			appendStringInfoChar(str, *instr++);
+			len--;
+		}
+	}
+
+	/* unfinished surrogate pair? */
+	if (pair_first)
+		goto invalid_pair;
+
+	return;
+
+invalid_pair:
+	ereport(ERROR,
+			(errcode(ERRCODE_SYNTAX_ERROR),
+			 errmsg("invalid Unicode surrogate pair")));
+}
+
+/*
+ * Unescape unicode strings
+ */
+Datum
+unicode_uescape_with_escape_char(PG_FUNCTION_ARGS)
+{
+	StringInfoData		str;
+	text	   *input_text;
+	text	   *escchr_text;
+	text	   *result;
+	const char *escchr_ptr;
+
+	/* when input string is NULL, then result is NULL too */
+	if (PG_ARGISNULL(0))
+		PG_RETURN_NULL();
+
+	input_text = PG_GETARG_TEXT_PP(0);
+
+	if (PG_ARGISNULL(1))
+		ereport(ERROR,
+				(errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
+				 errmsg("null value not allowed for escape char")));
+
+	escchr_text = PG_GETARG_TEXT_PP(1);
+	escchr_ptr = VARDATA_ANY(escchr_text);
+
+	if (VARSIZE_ANY_EXHDR(escchr_text) == 1 && !check_uescapechar(*escchr_ptr))
+		ereport(ERROR,
+				(errcode(ERRCODE_SYNTAX_ERROR),
+				 errmsg("invalid Unicode escape character")));
+
+	initStringInfo(&str);
+
+	udeescape(&str,
+			  VARDATA_ANY(input_text),
+			  VARSIZE_ANY_EXHDR(input_text),
+			  *escchr_ptr);
+
+	result = cstring_to_text_with_len(str.data, str.len);
+	pfree(str.data);
+
+	PG_RETURN_TEXT_P(result);
+}
+
+Datum
+unicode_uescape(PG_FUNCTION_ARGS)
+{
+	StringInfoData		str;
+	text	   *input_text;
+	text	   *result;
+
+	/* when input string is NULL, then result is NULL too */
+	if (PG_ARGISNULL(0))
+		PG_RETURN_NULL();
+
+	input_text = PG_GETARG_TEXT_PP(0);
+
+	initStringInfo(&str);
+
+	udeescape(&str,
+			  VARDATA_ANY(input_text),
+			  VARSIZE_ANY_EXHDR(input_text),
+			  '\\');
+
+	result = cstring_to_text_with_len(str.data, str.len);
+	pfree(str.data);
+
+	PG_RETURN_TEXT_P(result);
+}
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index d6f3e2d286..9b88af11b7 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -10980,4 +10980,11 @@
   proname => 'is_normalized', prorettype => 'bool', proargtypes => 'text text',
   prosrc => 'unicode_is_normalized' },
 
+{ oid => '9822', descr => 'unescape Unicode chars in strings',
+  proname => 'unicode_unescape', prorettype => 'text', proargtypes => 'text text',
+  proisstrict => 'f', prosrc => 'unicode_uescape_with_escape_char' },
+
+{ oid => '9823', descr => 'unescape Unicode chars in strings',
+  proname => 'unicode_unescape', prorettype => 'text', proargtypes => 'text',
+  proisstrict => 't', prosrc => 'unicode_uescape' }
 ]
diff --git a/src/include/parser/scansup.h b/src/include/parser/scansup.h
index 5bc426660d..11de185b56 100644
--- a/src/include/parser/scansup.h
+++ b/src/include/parser/scansup.h
@@ -14,6 +14,8 @@
 #ifndef SCANSUP_H
 #define SCANSUP_H
 
+#include "mb/pg_wchar.h"
+
 extern char *downcase_truncate_identifier(const char *ident, int len,
 										  bool warn);
 
@@ -24,4 +26,10 @@ extern void truncate_identifier(char *ident, int len, bool warn);
 
 extern bool scanner_isspace(char ch);
 
+extern unsigned int hexval(unsigned char c);
+
+extern void check_unicode_value(pg_wchar c);
+
+extern bool check_uescapechar(unsigned char escape);
+
 #endif							/* SCANSUP_H */
diff --git a/src/test/regress/expected/unicode.out b/src/test/regress/expected/unicode.out
index 2a1e903696..03d43358e5 100644
--- a/src/test/regress/expected/unicode.out
+++ b/src/test/regress/expected/unicode.out
@@ -79,3 +79,36 @@ ORDER BY num;
 
 SELECT is_normalized('abc', 'def');  -- run-time error
 ERROR:  invalid normalization form: def
+SELECT unicode_unescape('\0441\043B\043E\043D');
+ unicode_unescape 
+------------------
+ слон
+(1 row)
+
+SELECT unicode_unescape('d!0061t!+000061', '!');
+ unicode_unescape 
+------------------
+ data
+(1 row)
+
+SELECT unicode_unescape('d\u0061t\U00000061');
+ unicode_unescape 
+------------------
+ data
+(1 row)
+
+-- run-time error
+SELECT unicode_unescape('wrong: \db99');
+ERROR:  invalid Unicode surrogate pair
+SELECT unicode_unescape('wrong: \db99\0061');
+ERROR:  invalid Unicode surrogate pair
+SELECT unicode_unescape('wrong: \+00db99\+000061');
+ERROR:  invalid Unicode surrogate pair
+SELECT unicode_unescape('wrong: \+2FFFFF');
+ERROR:  invalid Unicode escape value
+SELECT unicode_unescape('wrong: \udb99\u0061');
+ERROR:  invalid Unicode surrogate pair
+SELECT unicode_unescape('wrong: \U0000db99\U00000061');
+ERROR:  invalid Unicode surrogate pair
+SELECT unicode_unescape('wrong: \U002FFFFF');
+ERROR:  invalid Unicode escape value
diff --git a/src/test/regress/sql/unicode.sql b/src/test/regress/sql/unicode.sql
index ccfc6fa77a..ba7d61e73a 100644
--- a/src/test/regress/sql/unicode.sql
+++ b/src/test/regress/sql/unicode.sql
@@ -30,3 +30,16 @@ FROM
 ORDER BY num;
 
 SELECT is_normalized('abc', 'def');  -- run-time error
+
+SELECT unicode_unescape('\0441\043B\043E\043D');
+SELECT unicode_unescape('d!0061t!+000061', '!');
+SELECT unicode_unescape('d\u0061t\U00000061');
+
+-- run-time error
+SELECT unicode_unescape('wrong: \db99');
+SELECT unicode_unescape('wrong: \db99\0061');
+SELECT unicode_unescape('wrong: \+00db99\+000061');
+SELECT unicode_unescape('wrong: \+2FFFFF');
+SELECT unicode_unescape('wrong: \udb99\u0061');
+SELECT unicode_unescape('wrong: \U0000db99\U00000061');
+SELECT unicode_unescape('wrong: \U002FFFFF');

Peter Eisentraut

peter.eisentraut@enterprisedb.com

about 5 years ago

In reply to: Pavel Stehule (#8)

Re: proposal: unescape_text function

On 2020-10-07 11:00, Pavel Stehule wrote:

Since the idea originated from unescaping unicode string
literals i.e.
select unescape('Odpov\u011Bdn\u00E1 osoba');

Shouldn't the built-in function support the above syntax as well?

good idea. The prefixes u (4 digits) and U (8 digits) are supported

I don't really get the point of this function. There is AFAICT no
function to produce this escaped format, and it's not a recognized
interchange format. So under what circumstances would one need to use this?

#10

Pavel Stehule

pavel.stehule@gmail.com

about 5 years ago

In reply to: Peter Eisentraut (#9)

Re: proposal: unescape_text function

pá 27. 11. 2020 v 15:37 odesílatel Peter Eisentraut <
peter.eisentraut@enterprisedb.com> napsal:

On 2020-10-07 11:00, Pavel Stehule wrote:

Since the idea originated from unescaping unicode string
literals i.e.
select unescape('Odpov\u011Bdn\u00E1 osoba');

Shouldn't the built-in function support the above syntax as well?

good idea. The prefixes u (4 digits) and U (8 digits) are supported

I don't really get the point of this function. There is AFAICT no
function to produce this escaped format, and it's not a recognized
interchange format. So under what circumstances would one need to use
this?

Some corporate data can be in CSV format with escaped unicode characters.
Without this function it is not possible to decode these files without
external application.

Postgres has support for this conversion, but only for string literals.

Because unicode is major encoding, I think this conversion should be
supported. There is another question about implementation like in this
patch implemented unicode_unescape function, or with some new conversion.
Using conversion
https://www.postgresql.org/docs/current/sql-createconversion.html is
probably better, but I am not sure how intuitive it is, and it is hard to
use it (without not nice workarounds) in plpgsql.

I don't expect so Postgres should produce data in unicode escaped format,
but can be useful, if Postgres can do some work with data in special format
of major encoding.

postgres=# select convert_from(E'Odpov\u011Bdn\u00E1 osoba', 'UTF8');
┌─────────────────┐
│ convert_from │
╞═════════════════╡
│ Odpovědná osoba │
└─────────────────┘
(1 row)

I can do this with bytea, but it is hard to use it with text fields.

I didn't find any way how to do it without ugly steps.

Regards

Pavel

#11

Peter Eisentraut

peter.eisentraut@enterprisedb.com

about 5 years ago

In reply to: Pavel Stehule (#10)

Re: proposal: unescape_text function

On 2020-11-29 18:36, Pavel Stehule wrote:

I don't really get the point of this function. There is AFAICT no
function to produce this escaped format, and it's not a recognized
interchange format. So under what circumstances would one need to
use this?

Some corporate data can be in CSV format with escaped unicode
characters. Without this function it is not possible to decode these
files without external application.

I would like some supporting documentation on this. So far we only have
one stackoverflow question, and then this implementation, and they are
not even the same format. My worry is that if there is not precise
specification, then people are going to want to add things in the
future, and there will be no way to analyze such requests in a
principled way.

#12

Pavel Stehule

pavel.stehule@gmail.com

about 5 years ago

In reply to: Peter Eisentraut (#11)

Re: proposal: unescape_text function

po 30. 11. 2020 v 14:14 odesílatel Peter Eisentraut <
peter.eisentraut@enterprisedb.com> napsal:

On 2020-11-29 18:36, Pavel Stehule wrote:

I don't really get the point of this function. There is AFAICT no
function to produce this escaped format, and it's not a recognized
interchange format. So under what circumstances would one need to
use this?

Some corporate data can be in CSV format with escaped unicode
characters. Without this function it is not possible to decode these
files without external application.

I would like some supporting documentation on this. So far we only have
one stackoverflow question, and then this implementation, and they are
not even the same format. My worry is that if there is not precise
specification, then people are going to want to add things in the
future, and there will be no way to analyze such requests in a
principled way.

I checked this and it is "prefix backslash-u hex" used by Java, JavaScript
or RTF - https://billposer.org/Software/ListOfRepresentations.html

In some languages (Python), there is decoder "unicode-escape". Java has a
method escapeJava, for conversion from unicode to ascii. I can imagine so
these data are from Java systems exported to 8bit strings - so this
implementation can be accepted as referential. This format is used by
https://docs.oracle.com/javase/8/docs/technotes/tools/unix/native2ascii.html
tool too.

Postgres can decode this format too, and the patch is based on Postgres
implementation. I just implemented a different interface.

Currently decode function does only text->bytea transformation. Maybe a
more generic function "decode_text" and "encode_text" for similar cases can
be better (here we need text->text transformation). But it looks like
overengineering now.

Maybe we introduce new encoding "ascii" and we can implement new
conversions "ascii_to_utf8" and "utf8_to_ascii". It looks like the most
clean solution. What do you think about it?

Regards

Pavel

#13

Pavel Stehule

pavel.stehule@gmail.com

about 5 years ago

In reply to: Pavel Stehule (#12)

Re: proposal: unescape_text function

po 30. 11. 2020 v 22:15 odesílatel Pavel Stehule <pavel.stehule@gmail.com>
napsal:

po 30. 11. 2020 v 14:14 odesílatel Peter Eisentraut <
peter.eisentraut@enterprisedb.com> napsal:

On 2020-11-29 18:36, Pavel Stehule wrote:

I don't really get the point of this function. There is AFAICT no
function to produce this escaped format, and it's not a recognized
interchange format. So under what circumstances would one need to
use this?

Some corporate data can be in CSV format with escaped unicode
characters. Without this function it is not possible to decode these
files without external application.

I would like some supporting documentation on this. So far we only have
one stackoverflow question, and then this implementation, and they are
not even the same format. My worry is that if there is not precise
specification, then people are going to want to add things in the
future, and there will be no way to analyze such requests in a
principled way.

I checked this and it is "prefix backslash-u hex" used by Java,
JavaScript or RTF -
https://billposer.org/Software/ListOfRepresentations.html

In some languages (Python), there is decoder "unicode-escape". Java has a
method escapeJava, for conversion from unicode to ascii. I can imagine so
these data are from Java systems exported to 8bit strings - so this
implementation can be accepted as referential. This format is used by
https://docs.oracle.com/javase/8/docs/technotes/tools/unix/native2ascii.html
tool too.

Postgres can decode this format too, and the patch is based on Postgres
implementation. I just implemented a different interface.

Currently decode function does only text->bytea transformation. Maybe a
more generic function "decode_text" and "encode_text" for similar cases can
be better (here we need text->text transformation). But it looks like
overengineering now.

Maybe we introduce new encoding "ascii" and we can implement new
conversions "ascii_to_utf8" and "utf8_to_ascii". It looks like the most
clean solution. What do you think about it?

a better name of new encoding can be "unicode-escape" than "ascii". We use
"to_ascii" function for different use case.

set client_encoding to unicode-escape;
copy tab from xxx;
...

but it doesn't help when only a few columns from the table are in
unicode-escape format.

Show quoted text

Regards

Pavel

#14

Pavel Stehule

pavel.stehule@gmail.com

about 5 years ago

In reply to: Pavel Stehule (#13)

Re: proposal: unescape_text function

po 30. 11. 2020 v 22:56 odesílatel Pavel Stehule <pavel.stehule@gmail.com>
napsal:

po 30. 11. 2020 v 22:15 odesílatel Pavel Stehule <pavel.stehule@gmail.com>
napsal:

po 30. 11. 2020 v 14:14 odesílatel Peter Eisentraut <
peter.eisentraut@enterprisedb.com> napsal:

On 2020-11-29 18:36, Pavel Stehule wrote:

I don't really get the point of this function. There is AFAICT no
function to produce this escaped format, and it's not a recognized
interchange format. So under what circumstances would one need to
use this?

Some corporate data can be in CSV format with escaped unicode
characters. Without this function it is not possible to decode these
files without external application.

I would like some supporting documentation on this. So far we only have
one stackoverflow question, and then this implementation, and they are
not even the same format. My worry is that if there is not precise
specification, then people are going to want to add things in the
future, and there will be no way to analyze such requests in a
principled way.

I checked this and it is "prefix backslash-u hex" used by Java,
JavaScript or RTF -
https://billposer.org/Software/ListOfRepresentations.html

In some languages (Python), there is decoder "unicode-escape". Java has
a method escapeJava, for conversion from unicode to ascii. I can imagine so
these data are from Java systems exported to 8bit strings - so this
implementation can be accepted as referential. This format is used by
https://docs.oracle.com/javase/8/docs/technotes/tools/unix/native2ascii.html
tool too.

Postgres can decode this format too, and the patch is based on Postgres
implementation. I just implemented a different interface.

Currently decode function does only text->bytea transformation. Maybe a
more generic function "decode_text" and "encode_text" for similar cases can
be better (here we need text->text transformation). But it looks like
overengineering now.

Maybe we introduce new encoding "ascii" and we can implement new
conversions "ascii_to_utf8" and "utf8_to_ascii". It looks like the most
clean solution. What do you think about it?

a better name of new encoding can be "unicode-escape" than "ascii". We use
"to_ascii" function for different use case.

set client_encoding to unicode-escape;
copy tab from xxx;
...

but it doesn't help when only a few columns from the table are in
unicode-escape format.

probably the most complete solution can be from two steps:

1. introducing new encoding - "ascii_unicode_escape" with related
conversions
2. introducing two new functions - text_escape and text_unescape with two
parameters - source text and conversion name

select text_convert_to('Тимати', 'ascii_unicode_escape')
\u0422\u0438\u043c\u0430\u0442\u0438 .. result is text

select text_convert_from('\u0422\u0438\u043c\u0430\u0442\u0438',
'ascii_unicode_escape')
┌──────────┐
│ ?column? │
╞══════════╡
│ Тимати │
└──────────┘
(1 row)

Show quoted text

Regards

Pavel

#15

Chapman Flack

chap@anastigmatix.net

about 5 years ago

In reply to: Pavel Stehule (#14)

Re: proposal: unescape_text function

po 30. 11. 2020 v 22:15 odesílatel Pavel Stehule <pavel.stehule@gmail.com>
napsal:

I checked this and it is "prefix backslash-u hex" used by Java,
JavaScript or RTF -
https://billposer.org/Software/ListOfRepresentations.html

If I look on that page, it appears that RTF is using a similar-looking
escape but in decimal rather than hex.

It would be important to define what is done with non-BMP characters?
Will there be another escape for a six- or eight-hexdigit format for
the codepoint, or will it be represented by two four-hexdigit escapes
for consecutive UTF-16 surrogates?

Regards,
-Chap

#16

Pavel Stehule

pavel.stehule@gmail.com

about 5 years ago

In reply to: Chapman Flack (#15)

Re: proposal: unescape_text function

út 1. 12. 2020 v 20:20 odesílatel Chapman Flack <chap@anastigmatix.net>
napsal:

po 30. 11. 2020 v 22:15 odesílatel Pavel Stehule <

pavel.stehule@gmail.com>

napsal:

I checked this and it is "prefix backslash-u hex" used by Java,
JavaScript or RTF -
https://billposer.org/Software/ListOfRepresentations.html

If I look on that page, it appears that RTF is using a similar-looking
escape but in decimal rather than hex.

It would be important to define what is done with non-BMP characters?
Will there be another escape for a six- or eight-hexdigit format for
the codepoint, or will it be represented by two four-hexdigit escapes
for consecutive UTF-16 surrogates?

the detection of decimal or hexadecimal codes can be a hard problem -
string "12" is valid in both systems, but the numbers are different. So
there should be external specification as an argument.

Regards

Pavel

Show quoted text

Regards,
-Chap

#17

Andrew Dunstan

andrew@dunslane.net

about 5 years ago

In reply to: Peter Eisentraut (#11)

Re: proposal: unescape_text function

On 11/30/20 8:14 AM, Peter Eisentraut wrote:

On 2020-11-29 18:36, Pavel Stehule wrote:

    I don't really get the point of this function. There is AFAICT no
    function to produce this escaped format, and it's not a recognized
    interchange format. So under what circumstances would one need to
    use this?

Some corporate data can be in CSV format with escaped unicode
characters. Without this function it is not possible to decode these
files without external application.

I would like some supporting documentation on this. So far we only
have one stackoverflow question, and then this implementation, and
they are not even the same format. My worry is that if there is not
precise specification, then people are going to want to add things in
the future, and there will be no way to analyze such requests in a
principled way.

Also, should this be an extension? I'm dubious about including such
marginal uses in the core code unless there's a really good case for it.

cheers

andrew

#18

Pavel Stehule

pavel.stehule@gmail.com

about 5 years ago

In reply to: Andrew Dunstan (#17)

Re: proposal: unescape_text function

st 2. 12. 2020 v 0:05 odesílatel Andrew Dunstan <andrew@dunslane.net>
napsal:

On 11/30/20 8:14 AM, Peter Eisentraut wrote:

On 2020-11-29 18:36, Pavel Stehule wrote:

I don't really get the point of this function. There is AFAICT no
function to produce this escaped format, and it's not a recognized
interchange format. So under what circumstances would one need to
use this?

Some corporate data can be in CSV format with escaped unicode
characters. Without this function it is not possible to decode these
files without external application.

I would like some supporting documentation on this. So far we only
have one stackoverflow question, and then this implementation, and
they are not even the same format. My worry is that if there is not
precise specification, then people are going to want to add things in
the future, and there will be no way to analyze such requests in a
principled way.

Also, should this be an extension? I'm dubious about including such
marginal uses in the core code unless there's a really good case for it.

I am not sure, and I am inclined so it should be core functionality.

1. Although this use case is marginal, this is related to most used
encodings - ascii and unicode. 8 bit encodings enhanced about escaped
multibyte chars will be used for a very long time. Unfortunately - this
will be worse, because Postgres will be used more in the corporate
environment, where there is a bigger press to conserve very legacy
technologies without correct multibyte support. The core problem so this
issue is out of concept bytea -> text or text -> bytea transformations
supported by Postgres. This is text -> text transformation (for almost all
encoding based on ascii), that is not supported by Postgres now.

2. Postgres already has this functionality - but unfortunately there is a
limit just only literal constants.

create or replace function uunescape(text)
returns text as $$
declare r text;
begin
-- don't use this code!!!
execute 'select e''' || $1 || '''' into r;
return r;
end;
$$ language plpgsql immutable;

But one way how anybody can use it is SQL injection vulnerable and slow. So
some simple buildin solution can be protection against some future security
issues. Personally I am happy with just this limited function that will be
safe (although the design based on introducing new encoding and conversions
can be more complete and accurate). I agree so this case is marginal, but
it is a fully valid use case, and supporting unicode escaped codes just by
parser is a needless limit.

3. there are new disadvantages of extensions in current DBaaS times. Until
the extension is not directly accepted by a cloud provider, then the
extension is not available for users. The acceptance of extensions is not
too agile - so moving this code to extension doesn't solve this problem.
Without DBaaS the implementation of this feature as the extensions can be
good enough.

Regards

Pavel

Show quoted text

cheers

andrew

#19

Peter Eisentraut

peter.eisentraut@enterprisedb.com

about 5 years ago

In reply to: Pavel Stehule (#12)

Re: proposal: unescape_text function

On 2020-11-30 22:15, Pavel Stehule wrote:

I would like some supporting documentation on this. So far we only
have
one stackoverflow question, and then this implementation, and they are
not even the same format. My worry is that if there is not precise
specification, then people are going to want to add things in the
future, and there will be no way to analyze such requests in a
principled way.

I checked this and it is "prefix backslash-u hex" used by Java,
JavaScript or RTF -
https://billposer.org/Software/ListOfRepresentations.html

Heh. The fact that there is a table of two dozen possible
representations kind of proves my point that we should be deliberate in
picking one.

I do see Oracle unistr() on that list, which appears to be very similar
to what you are trying to do here. Maybe look into aligning with that.

#20

Pavel Stehule

pavel.stehule@gmail.com

about 5 years ago

In reply to: Peter Eisentraut (#19)

Re: proposal: unescape_text function

st 2. 12. 2020 v 9:23 odesílatel Peter Eisentraut <
peter.eisentraut@enterprisedb.com> napsal:

On 2020-11-30 22:15, Pavel Stehule wrote:

I would like some supporting documentation on this. So far we only
have
one stackoverflow question, and then this implementation, and they

are

not even the same format. My worry is that if there is not precise
specification, then people are going to want to add things in the
future, and there will be no way to analyze such requests in a
principled way.

I checked this and it is "prefix backslash-u hex" used by Java,
JavaScript or RTF -
https://billposer.org/Software/ListOfRepresentations.html

Heh. The fact that there is a table of two dozen possible
representations kind of proves my point that we should be deliberate in
picking one.

I do see Oracle unistr() on that list, which appears to be very similar
to what you are trying to do here. Maybe look into aligning with that.

unistr is a primitive form of proposed function. But it can be used as a
base. The format is compatible with our "4.1.2.3. String Constants with
Unicode Escapes".

What do you think about the following proposal?

1. unistr(text) .. compatible with Postgres unicode escapes - it is
enhanced against Oracle, because Oracle's unistr doesn't support 6 digits
unicodes.

2. there can be optional parameter "prefix" with default "\". But with "\u"
it can be compatible with Java or Python.

What do you think about it?

Pavel

#21

Andrew Dunstan

andrew@dunslane.net

about 5 years ago

In reply to: Pavel Stehule (#18)

Re: proposal: unescape_text function

On 12/2/20 12:48 AM, Pavel Stehule wrote:

st 2. 12. 2020 v 0:05 odesílatel Andrew Dunstan <andrew@dunslane.net
<mailto:andrew@dunslane.net>> napsal:

On 11/30/20 8:14 AM, Peter Eisentraut wrote:

On 2020-11-29 18:36, Pavel Stehule wrote:

    I don't really get the point of this function. There is

AFAICT no

    function to produce this escaped format, and it's not a

recognized

    interchange format. So under what circumstances would one

need to

    use this?

Some corporate data can be in CSV format with escaped unicode
characters. Without this function it is not possible to decode

these

files without external application.

I would like some supporting documentation on this. So far we only
have one stackoverflow question, and then this implementation, and
they are not even the same format. My worry is that if there is not
precise specification, then people are going to want to add

things in

the future, and there will be no way to analyze such requests in a
principled way.

Also, should this be an extension? I'm dubious about including such
marginal uses in the core code unless there's a really good case
for it.

[...]

3. there are new disadvantages of extensions in current DBaaS times.
Until the extension is not directly accepted by a cloud provider, then
the extension is not available for users. The acceptance of extensions
is not too agile - so moving this code to extension doesn't solve this
problem. Without DBaaS the implementation of this feature as the
extensions can be good enough.

That argument can apply to any extension someone wants to use. If your
DBaaS provider doesn't support some extension you need to lobby them or
find another that does support it, rather than try to put it in core
code. Some extensions, such as untrusted PLs, will naturally almost
never be supported by DBaaS providers because they are inherently
unsafe. That's not the case here.

cheers

andrew

#22

Chapman Flack

chap@anastigmatix.net

about 5 years ago

In reply to: Pavel Stehule (#20)

Re: proposal: unescape_text function

On 12/02/20 05:37, Pavel Stehule wrote:

2. there can be optional parameter "prefix" with default "\". But with "\u"
it can be compatible with Java or Python.

Java's unicode escape form is one of those early ones that lack
a six-digit form, and where any character outside of the basic multilingual
plane has to be represented by two four-digit escapes in a row, encoding
the two surrogates that would make up the character's representation
in UTF-16.

Obviously that's an existing form that's out there, so it's not a bad
thing to have some kind of support for it, but it's not a great
representation to encourage people to use.

Python, by contrast, has both \uxxxx and \Uxxxxxxxx where you would use
the latter to represent a non-BMP character directly. So the Java and
Python schemes should be considered distinct.

In Perl, there is a useful extension to regexp substitution where
you specify the replacement not as a string or even a string with &
and \1 \2 ... magic, but as essentially a lambda that is passed the
match and returns a computed replacement. That makes conversions of
the sort discussed here generally trivial to implement. Would it be
worth considering to add something of general utility like that, and
then there could be a small library of pure SQL functions (or a wiki
page or GitHub gist) covering a bunch of the two dozen representations
on that page linked above?

Regards,
-Chap

#23

Chapman Flack

chap@anastigmatix.net

about 5 years ago

In reply to: Chapman Flack (#22)

Re: proposal: unescape_text function

On 12/02/20 09:55, Chapman Flack wrote:

In Perl, there is a useful extension to regexp substitution where
you specify the replacement not as a string or even a string with &
and \1 \2 ... magic, but as essentially a lambda that is passed the
match and returns a computed replacement. That makes conversions of
the sort discussed here generally trivial to implement.

Python, I should have added, allows that also. Java too, since release 9.

Regards,
-Chap

#24

Pavel Stehule

pavel.stehule@gmail.com

about 5 years ago

In reply to: Pavel Stehule (#20)

1 attachment(s)

Re: proposal: unescape_text function

st 2. 12. 2020 v 11:37 odesílatel Pavel Stehule <pavel.stehule@gmail.com>
napsal:

st 2. 12. 2020 v 9:23 odesílatel Peter Eisentraut <
peter.eisentraut@enterprisedb.com> napsal:

On 2020-11-30 22:15, Pavel Stehule wrote:

I would like some supporting documentation on this. So far we only
have
one stackoverflow question, and then this implementation, and they

are

not even the same format. My worry is that if there is not precise
specification, then people are going to want to add things in the
future, and there will be no way to analyze such requests in a
principled way.

I checked this and it is "prefix backslash-u hex" used by Java,
JavaScript or RTF -
https://billposer.org/Software/ListOfRepresentations.html

Heh. The fact that there is a table of two dozen possible
representations kind of proves my point that we should be deliberate in
picking one.

I do see Oracle unistr() on that list, which appears to be very similar
to what you are trying to do here. Maybe look into aligning with that.

unistr is a primitive form of proposed function. But it can be used as a
base. The format is compatible with our "4.1.2.3. String Constants with
Unicode Escapes".

What do you think about the following proposal?

1. unistr(text) .. compatible with Postgres unicode escapes - it is
enhanced against Oracle, because Oracle's unistr doesn't support 6 digits
unicodes.

2. there can be optional parameter "prefix" with default "\". But with
"\u" it can be compatible with Java or Python.

What do you think about it?

I thought about it a little bit more, and the prefix specification has not
too much sense (more if we implement this functionality as function
"unistr"). I removed the optional argument and renamed the function to
"unistr". The functionality is the same. Now it supports Oracle convention,
Java and Python (for Python UXXXXXXXX) and \+XXXXXX. These formats was
already supported. The compatibility witth Oracle is nice.

postgres=# select
'Arabic : ' || unistr( '\0627\0644\0639\0631\0628\064A\0629' ) ||
'
Chinese : ' || unistr( '\4E2D\6587' ) ||
'
English : ' || unistr( 'English' ) ||
'
French : ' || unistr( 'Fran\00E7ais' ) ||
'
German : ' || unistr( 'Deutsch' ) ||
'
Greek : ' || unistr( '\0395\03BB\03BB\03B7\03BD\03B9\03BA\03AC' ) ||
'
Hebrew : ' || unistr( '\05E2\05D1\05E8\05D9\05EA' ) ||
'
Japanese : ' || unistr( '\65E5\672C\8A9E' ) ||
'
Korean : ' || unistr( '\D55C\AD6D\C5B4' ) ||
'
Portuguese : ' || unistr( 'Portugu\00EAs' ) ||
'
Russian : ' || unistr( '\0420\0443\0441\0441\043A\0438\0439' ) ||
'
Spanish : ' || unistr( 'Espa\00F1ol' ) ||
'
Thai : ' || unistr( '\0E44\0E17\0E22' )
as unicode_test_string;
┌──────────────────────────┐
│ unicode_test_string │
╞══════════════════════════╡
│ Arabic : العربية ↵│
│ Chinese : 中文 ↵│
│ English : English ↵│
│ French : Français ↵│
│ German : Deutsch ↵│
│ Greek : Ελληνικά ↵│
│ Hebrew : עברית ↵│
│ Japanese : 日本語 ↵│
│ Korean : 한국어 ↵│
│ Portuguese : Português↵│
│ Russian : Русский ↵│
│ Spanish : Español ↵│
│ Thai : ไทย │
└──────────────────────────┘
(1 row)

postgres=# SELECT UNISTR('Odpov\u011Bdn\u00E1 osoba');
┌─────────────────┐
│ unistr │
╞═════════════════╡
│ Odpovědná osoba │
└─────────────────┘
(1 row)

New patch attached

Regards

Pavel

Show quoted text

Pavel

Attachments:

unistr.patchtext/x-patch; charset=UTF-8; name=unistr.patchDownload

diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index df29af6371..6ad8136523 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -3553,6 +3553,34 @@ repeat('Pg', 4) <returnvalue>PgPgPgPg</returnvalue>
        </para></entry>
       </row>
 
+      <row>
+       <entry role="func_table_entry"><para role="func_signature">
+        <indexterm>
+         <primary>unistr</primary>
+        </indexterm>
+        <function>unistr</function> ( <parameter>string</parameter> <type>text</type> )
+        <returnvalue>text</returnvalue>
+       </para>
+       <para>
+        Evaluate escaped unicode chars (4 or 6 digits) without prefix or
+         with prefix <literal>u</literal> (4 digits) or with prefix
+        <literal>U</literal> (8 digits) to chars or with prefix
+        <literal>+</literal> (6 digits).
+       </para>
+       <para>
+        <literal>unistr('\0441\043B\043E\043D')</literal>
+        <returnvalue>слон</returnvalue>
+       </para>
+       <para>
+        <literal>unistr('d\0061t\+000061')</literal>
+        <returnvalue>data</returnvalue>
+       </para>
+       <para>
+        <literal>unistr('d\u0061t\U00000061')</literal>
+        <returnvalue>data</returnvalue>
+       </para></entry>
+      </row>
+
      </tbody>
     </tgroup>
    </table>
diff --git a/src/backend/parser/parser.c b/src/backend/parser/parser.c
index be86eb37fe..cbddb61396 100644
--- a/src/backend/parser/parser.c
+++ b/src/backend/parser/parser.c
@@ -278,30 +278,6 @@ base_yylex(YYSTYPE *lvalp, YYLTYPE *llocp, core_yyscan_t yyscanner)
 	return cur_token;
 }
 
-/* convert hex digit (caller should have verified that) to value */
-static unsigned int
-hexval(unsigned char c)
-{
-	if (c >= '0' && c <= '9')
-		return c - '0';
-	if (c >= 'a' && c <= 'f')
-		return c - 'a' + 0xA;
-	if (c >= 'A' && c <= 'F')
-		return c - 'A' + 0xA;
-	elog(ERROR, "invalid hexadecimal digit");
-	return 0;					/* not reached */
-}
-
-/* is Unicode code point acceptable? */
-static void
-check_unicode_value(pg_wchar c)
-{
-	if (!is_valid_unicode_codepoint(c))
-		ereport(ERROR,
-				(errcode(ERRCODE_SYNTAX_ERROR),
-				 errmsg("invalid Unicode escape value")));
-}
-
 /* is 'escape' acceptable as Unicode escape character (UESCAPE syntax) ? */
 static bool
 check_uescapechar(unsigned char escape)
diff --git a/src/backend/parser/scansup.c b/src/backend/parser/scansup.c
index d07cbafcee..b39dde12bd 100644
--- a/src/backend/parser/scansup.c
+++ b/src/backend/parser/scansup.c
@@ -125,3 +125,27 @@ scanner_isspace(char ch)
 		return true;
 	return false;
 }
+
+/* convert hex digit (caller should have verified that) to value */
+unsigned int
+hexval(unsigned char c)
+{
+	if (c >= '0' && c <= '9')
+		return c - '0';
+	if (c >= 'a' && c <= 'f')
+		return c - 'a' + 0xA;
+	if (c >= 'A' && c <= 'F')
+		return c - 'A' + 0xA;
+	elog(ERROR, "invalid hexadecimal digit");
+	return 0;					/* not reached */
+}
+
+/* is Unicode code point acceptable? */
+void
+check_unicode_value(pg_wchar c)
+{
+	if (!is_valid_unicode_codepoint(c))
+		ereport(ERROR,
+				(errcode(ERRCODE_SYNTAX_ERROR),
+				 errmsg("invalid Unicode escape value")));
+}
diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c
index ff9bf238f3..e16f0875d6 100644
--- a/src/backend/utils/adt/varlena.c
+++ b/src/backend/utils/adt/varlena.c
@@ -6290,3 +6290,202 @@ unicode_is_normalized(PG_FUNCTION_ARGS)
 
 	PG_RETURN_BOOL(result);
 }
+
+/*
+ * First four chars should be hexnum digits
+ */
+static bool
+isxdigit_four(const char *instr)
+{
+	return isxdigit((unsigned char)  instr[0]) &&
+			isxdigit((unsigned char) instr[1]) &&
+			isxdigit((unsigned char) instr[2]) &&
+			isxdigit((unsigned char) instr[3]);
+}
+
+/*
+ * Translate string with hexadecimal digits to number
+ */
+static long int
+hexval_four(const char *instr)
+{
+	return (hexval(instr[0]) << 12) +
+			(hexval(instr[1]) << 8) +
+			(hexval(instr[2]) << 4) +
+			 hexval(instr[3]);
+}
+
+/*
+ * Replaces unicode escape sequences by unicode chars
+ */
+Datum
+unistr(PG_FUNCTION_ARGS)
+{
+	StringInfoData		str;
+	text	   *input_text;
+	text	   *result;
+	pg_wchar	pair_first = 0;
+	char		cbuf[MAX_UNICODE_EQUIVALENT_STRING + 1];
+	char	   *instr;
+	int			len;
+
+	/* when input string is NULL, then result is NULL too */
+	if (PG_ARGISNULL(0))
+		PG_RETURN_NULL();
+
+	input_text = PG_GETARG_TEXT_PP(0);
+	instr = VARDATA_ANY(input_text);
+	len = VARSIZE_ANY_EXHDR(input_text);
+
+	initStringInfo(&str);
+
+	while (len > 0)
+	{
+		if (instr[0] == '\\')
+		{
+			if (len >= 2 &&
+				instr[1] == '\\')
+			{
+				if (pair_first)
+					goto invalid_pair;
+				appendStringInfoChar(&str, '\\');
+				instr += 2;
+				len -= 2;
+			}
+			else if ((len >= 5 && isxdigit_four(&instr[1])) ||
+					 (len >= 6 && instr[1] == 'u' && isxdigit_four(&instr[2])))
+			{
+				pg_wchar	unicode;
+				int			offset = instr[1] == 'u' ? 2 : 1;
+
+				unicode = hexval_four(instr + offset);
+
+				check_unicode_value(unicode);
+
+				if (pair_first)
+				{
+					if (is_utf16_surrogate_second(unicode))
+					{
+						unicode = surrogate_pair_to_codepoint(pair_first, unicode);
+						pair_first = 0;
+					}
+					else
+						goto invalid_pair;
+				}
+				else if (is_utf16_surrogate_second(unicode))
+					goto invalid_pair;
+
+				if (is_utf16_surrogate_first(unicode))
+					pair_first = unicode;
+				else
+				{
+					pg_unicode_to_server(unicode, (unsigned char *) cbuf);
+					appendStringInfoString(&str, cbuf);
+				}
+
+				instr += 4 + offset;
+				len -= 4 + offset;
+			}
+			else if (len >= 8 &&
+					 instr[1] == '+' &&
+					 isxdigit_four(&instr[2]) &&
+					 isxdigit((unsigned char) instr[6]) &&
+					 isxdigit((unsigned char) instr[7]))
+			{
+				pg_wchar	unicode;
+
+				unicode = (hexval_four(&instr[2]) << 8) +
+								(hexval(instr[6]) << 4) +
+								 hexval(instr[7]);
+
+				check_unicode_value(unicode);
+
+				if (pair_first)
+				{
+					if (is_utf16_surrogate_second(unicode))
+					{
+						unicode = surrogate_pair_to_codepoint(pair_first, unicode);
+						pair_first = 0;
+					}
+					else
+						goto invalid_pair;
+				}
+				else if (is_utf16_surrogate_second(unicode))
+					goto invalid_pair;
+
+				if (is_utf16_surrogate_first(unicode))
+					pair_first = unicode;
+				else
+				{
+					pg_unicode_to_server(unicode, (unsigned char *) cbuf);
+					appendStringInfoString(&str, cbuf);
+				}
+
+				instr += 8;
+				len -= 8;
+			}
+			else if (len >= 10 &&
+					 instr[1] == 'U' &&
+					 isxdigit_four(&instr[2]) &&
+					 isxdigit_four(&instr[6]))
+			{
+				pg_wchar	unicode;
+
+				unicode = (hexval_four(&instr[2]) << 16) + hexval_four(&instr[6]);
+
+				check_unicode_value(unicode);
+
+				if (pair_first)
+				{
+					if (is_utf16_surrogate_second(unicode))
+					{
+						unicode = surrogate_pair_to_codepoint(pair_first, unicode);
+						pair_first = 0;
+					}
+					else
+						goto invalid_pair;
+				}
+				else if (is_utf16_surrogate_second(unicode))
+					goto invalid_pair;
+
+				if (is_utf16_surrogate_first(unicode))
+					pair_first = unicode;
+				else
+				{
+					pg_unicode_to_server(unicode, (unsigned char *) cbuf);
+					appendStringInfoString(&str, cbuf);
+				}
+
+				instr += 10;
+				len -= 10;
+			}
+			else
+				ereport(ERROR,
+						(errcode(ERRCODE_SYNTAX_ERROR),
+						 errmsg("invalid Unicode escape"),
+						 errhint("Unicode escapes must be \\XXXX, \\+XXXXXX, \\uXXXX or \\UXXXXXXXX.")));
+		}
+		else
+		{
+			if (pair_first)
+				goto invalid_pair;
+
+			appendStringInfoChar(&str, *instr++);
+			len--;
+		}
+	}
+
+	/* unfinished surrogate pair? */
+	if (pair_first)
+		goto invalid_pair;
+
+	result = cstring_to_text_with_len(str.data, str.len);
+	pfree(str.data);
+
+	PG_RETURN_TEXT_P(result);
+
+invalid_pair:
+	ereport(ERROR,
+			(errcode(ERRCODE_SYNTAX_ERROR),
+			 errmsg("invalid Unicode surrogate pair")));
+}
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index fc2202b843..92149b9cc2 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -11010,4 +11010,7 @@
   proname => 'is_normalized', prorettype => 'bool', proargtypes => 'text text',
   prosrc => 'unicode_is_normalized' },
 
+{ oid => '9822', descr => 'unescape Unicode chars in strings',
+  proname => 'unistr', prorettype => 'text', proargtypes => 'text',
+  proisstrict => 't', prosrc => 'unistr' }
 ]
diff --git a/src/include/parser/scansup.h b/src/include/parser/scansup.h
index 5bc426660d..1481c1da01 100644
--- a/src/include/parser/scansup.h
+++ b/src/include/parser/scansup.h
@@ -14,6 +14,8 @@
 #ifndef SCANSUP_H
 #define SCANSUP_H
 
+#include "mb/pg_wchar.h"
+
 extern char *downcase_truncate_identifier(const char *ident, int len,
 										  bool warn);
 
@@ -24,4 +26,8 @@ extern void truncate_identifier(char *ident, int len, bool warn);
 
 extern bool scanner_isspace(char ch);
 
+extern unsigned int hexval(unsigned char c);
+
+extern void check_unicode_value(pg_wchar c);
+
 #endif							/* SCANSUP_H */
diff --git a/src/test/regress/expected/unicode.out b/src/test/regress/expected/unicode.out
index 2a1e903696..778ef6e696 100644
--- a/src/test/regress/expected/unicode.out
+++ b/src/test/regress/expected/unicode.out
@@ -79,3 +79,30 @@ ORDER BY num;
 
 SELECT is_normalized('abc', 'def');  -- run-time error
 ERROR:  invalid normalization form: def
+SELECT unistr('\0441\043B\043E\043D');
+ unistr 
+--------
+ слон
+(1 row)
+
+SELECT unistr('d\u0061t\U00000061');
+ unistr 
+--------
+ data
+(1 row)
+
+-- run-time error
+SELECT unistr('wrong: \db99');
+ERROR:  invalid Unicode surrogate pair
+SELECT unistr('wrong: \db99\0061');
+ERROR:  invalid Unicode surrogate pair
+SELECT unistr('wrong: \+00db99\+000061');
+ERROR:  invalid Unicode surrogate pair
+SELECT unistr('wrong: \+2FFFFF');
+ERROR:  invalid Unicode escape value
+SELECT unistr('wrong: \udb99\u0061');
+ERROR:  invalid Unicode surrogate pair
+SELECT unistr('wrong: \U0000db99\U00000061');
+ERROR:  invalid Unicode surrogate pair
+SELECT unistr('wrong: \U002FFFFF');
+ERROR:  invalid Unicode escape value
diff --git a/src/test/regress/sql/unicode.sql b/src/test/regress/sql/unicode.sql
index ccfc6fa77a..546e85f8cd 100644
--- a/src/test/regress/sql/unicode.sql
+++ b/src/test/regress/sql/unicode.sql
@@ -30,3 +30,15 @@ FROM
 ORDER BY num;
 
 SELECT is_normalized('abc', 'def');  -- run-time error
+
+SELECT unistr('\0441\043B\043E\043D');
+SELECT unistr('d\u0061t\U00000061');
+
+-- run-time error
+SELECT unistr('wrong: \db99');
+SELECT unistr('wrong: \db99\0061');
+SELECT unistr('wrong: \+00db99\+000061');
+SELECT unistr('wrong: \+2FFFFF');
+SELECT unistr('wrong: \udb99\u0061');
+SELECT unistr('wrong: \U0000db99\U00000061');
+SELECT unistr('wrong: \U002FFFFF');

#25

Bruce Momjian

bruce@momjian.us

about 5 years ago

In reply to: Pavel Stehule (#24)

Re: proposal: unescape_text function

On Wed, Dec 2, 2020 at 07:30:39PM +0100, Pavel Stehule wrote:

postgres=# select
'Arabic : ' || unistr( '\0627\0644\0639\0631\0628\064A\0629' ) || '
Chinese : ' || unistr( '\4E2D\6587' ) || '
English : ' || unistr( 'English' ) || '
French : ' || unistr( 'Fran\00E7ais' ) || '
German : ' || unistr( 'Deutsch' ) || '
Greek : ' || unistr( '\0395\03BB\03BB\03B7\03BD\03B9\03BA\03AC' ) || '
Hebrew : ' || unistr( '\05E2\05D1\05E8\05D9\05EA' ) || '
Japanese : ' || unistr( '\65E5\672C\8A9E' ) || '
Korean : ' || unistr( '\D55C\AD6D\C5B4' ) || '
Portuguese : ' || unistr( 'Portugu\00EAs' ) || '
Russian : ' || unistr( '\0420\0443\0441\0441\043A\0438\0439' ) || '
Spanish : ' || unistr( 'Espa\00F1ol' ) || '
Thai : ' || unistr( '\0E44\0E17\0E22' )
as unicode_test_string;
┌──────────────────────────┐
│ unicode_test_string │
╞══════════════════════════╡
│ Arabic : العربية ↵│
│ Chinese : 中文 ↵│
│ English : English ↵│
│ French : Français ↵│
│ German : Deutsch ↵│
│ Greek : Ελληνικά ↵│
│ Hebrew : עברית ↵│
│ Japanese : 日本語 ↵│
│ Korean : 한국어 ↵│
│ Portuguese : Português↵│
│ Russian : Русский ↵│
│ Spanish : Español ↵│
│ Thai : ไทย │
└──────────────────────────┘

Offlist, this table output is super-cool!

--
Bruce Momjian <bruce@momjian.us> https://momjian.us
EnterpriseDB https://enterprisedb.com

The usefulness of a cup is in its emptiness, Bruce Lee

#26

David Steele

david@pgmasters.net

almost 5 years ago

In reply to: Pavel Stehule (#24)

Re: proposal: unescape_text function

On 12/2/20 1:30 PM, Pavel Stehule wrote:

st 2. 12. 2020 v 11:37 odesílatel Pavel Stehule <pavel.stehule@gmail.com
st 2. 12. 2020 v 9:23 odesílatel Peter Eisentraut

Heh. The fact that there is a table of two dozen possible
representations kind of proves my point that we should be
deliberate in
picking one.

I do see Oracle unistr() on that list, which appears to be very
similar
to what you are trying to do here. Maybe look into aligning
with that.

unistr is a primitive form of proposed function. But it can be used
as a base. The format is compatible with our "4.1.2.3. String
Constants with Unicode Escapes".

What do you think about the following proposal?

1. unistr(text) .. compatible with Postgres unicode escapes - it is
enhanced against Oracle, because Oracle's unistr doesn't support 6
digits unicodes.

2. there can be optional parameter "prefix" with default "\". But
with "\u" it can be compatible with Java or Python.

What do you think about it?

I thought about it a little bit more, and the prefix specification has
not too much sense (more if we implement this functionality as function
"unistr"). I removed the optional argument and renamed the function to
"unistr". The functionality is the same. Now it supports Oracle
convention, Java and Python (for Python UXXXXXXXX) and \+XXXXXX. These
formats was already supported.The compatibility witth Oracle is nice.

Peter, it looks like Pavel has aligned this function with unistr() as
you suggested. Thoughts?

Regards,
--
-David
david@pgmasters.net

#27

Peter Eisentraut

peter.eisentraut@enterprisedb.com

almost 5 years ago

In reply to: David Steele (#26)

Re: proposal: unescape_text function

On 10.03.21 14:52, David Steele wrote:

I thought about it a little bit more, and the prefix specification
has not too much sense (more if we implement this functionality as
function "unistr"). I removed the optional argument and renamed the
function to "unistr". The functionality is the same. Now it supports
Oracle convention, Java and Python (for Python UXXXXXXXX) and
\+XXXXXX. These formats was already supported.The compatibility witth
Oracle is nice.

Peter, it looks like Pavel has aligned this function with unistr() as
you suggested. Thoughts?

I haven't read through the patch in detail yet, but I support the
proposed details of the functionality.

#28

Peter Eisentraut

peter.eisentraut@enterprisedb.com

almost 5 years ago

In reply to: Peter Eisentraut (#27)

Re: proposal: unescape_text function

On 25.03.21 10:44, Peter Eisentraut wrote:

On 10.03.21 14:52, David Steele wrote:

I thought about it a little bit more, and the prefix specification
has not too much sense (more if we implement this functionality as
function "unistr"). I removed the optional argument and renamed the
function to "unistr". The functionality is the same. Now it supports
Oracle convention, Java and Python (for Python UXXXXXXXX) and
\+XXXXXX. These formats was already supported.The compatibility witth
Oracle is nice.

Peter, it looks like Pavel has aligned this function with unistr() as
you suggested. Thoughts?

I haven't read through the patch in detail yet, but I support the
proposed details of the functionality.

Committed.

I made two major changes: I moved the tests from unicode.sql to
strings.sql. The first file is for tests that only work in UTF8
encoding, which is not the case here. Also, I wasn't comfortable with
exposing little utility functions from the parser in an ad hoc way. So
I made local copies, which also allows us to make more
locally-appropriate error messages. I think there is some potential for
refactoring here (see also src/common/hex.c), but that's perhaps better
done separately and more comprehensively.

#29

Pavel Stehule

pavel.stehule@gmail.com

almost 5 years ago

In reply to: Peter Eisentraut (#28)

Re: proposal: unescape_text function

po 29. 3. 2021 v 12:19 odesílatel Peter Eisentraut <
peter.eisentraut@enterprisedb.com> napsal:

On 25.03.21 10:44, Peter Eisentraut wrote:

On 10.03.21 14:52, David Steele wrote:

I thought about it a little bit more, and the prefix specification
has not too much sense (more if we implement this functionality as
function "unistr"). I removed the optional argument and renamed the
function to "unistr". The functionality is the same. Now it supports
Oracle convention, Java and Python (for Python UXXXXXXXX) and
\+XXXXXX. These formats was already supported.The compatibility witth
Oracle is nice.

Peter, it looks like Pavel has aligned this function with unistr() as
you suggested. Thoughts?

I haven't read through the patch in detail yet, but I support the
proposed details of the functionality.

Committed.

I made two major changes: I moved the tests from unicode.sql to
strings.sql. The first file is for tests that only work in UTF8
encoding, which is not the case here. Also, I wasn't comfortable with
exposing little utility functions from the parser in an ad hoc way. So
I made local copies, which also allows us to make more
locally-appropriate error messages. I think there is some potential for
refactoring here (see also src/common/hex.c), but that's perhaps better
done separately and more comprehensively.

Thank you very much

Pavel