PATCH: Add uri percent-encoding for binary data

Started by Anders Åstrandover 6 years ago13 messages

anders@449.se

over 6 years ago

1 attachment(s)

Hello

Attached is a patch for adding uri as an encoding option for
encode/decode. It uses what's called "percent-encoding" in rfc3986
(https://tools.ietf.org/html/rfc3986#section-2.1).

The background for this patch is that I could easily build urls in
plpgsql, but doing the actual encoding of the url parts is painfully
slow. The list of available encodings for encode/decode looks quite
arbitrary to me, so I can't see any reason this one couldn't be in
there.

In modern web scenarios one would probably most likely want to encode
the utf8 representation of a text string for inclusion in a url, in
which case correct invocation would be ENCODE(CONVERT_TO('some text in
database encoding goes here', 'UTF8'), 'uri'), but uri
percent-encoding can of course also be used for other text encodings
and arbitrary binary data.

Regards,
Anders

Attachments:

uri-encoding-v1.patchtext/x-patch; charset=US-ASCII; name=uri-encoding-v1.patchDownload

diff --git a/src/backend/utils/adt/encode.c b/src/backend/utils/adt/encode.c
index 7293d66de5..33cf7bb57c 100644
--- a/src/backend/utils/adt/encode.c
+++ b/src/backend/utils/adt/encode.c
@@ -512,6 +512,131 @@ esc_dec_len(const char *src, unsigned srclen)
 	return len;
 }
 
+/*
+ * URI percent encoding
+ *
+ * Percent encodes all byte values except the unreserved ASCII characters as per RFC3986.
+ */
+
+static const char upper_hex_digits[] = "0123456789ABCDEF";
+
+static unsigned
+uri_encode(const char *src, unsigned srclen, char *dst)
+{
+	char		*d = dst;
+
+	for (const char *s = src; s < src + srclen; s++)
+	{
+		if ((*s >= 'A' && *s <= 'Z') ||
+			(*s >= 'a' && *s <= 'z') ||
+			(*s >= '0' && *s <= '9') ||
+			*s == '-' ||
+			*s == '_' ||
+			*s == '.' ||
+			*s == '~')
+		{
+			*d++ = *s;
+		}
+		else
+		{
+			*d++ = '%';
+			*d++ = upper_hex_digits[(*s >> 4) & 0xF];
+			*d++ = upper_hex_digits[*s & 0xF];
+		}
+	}
+	return d - dst;
+}
+
+static unsigned
+uri_decode(const char *src, unsigned srclen, char *dst)
+{
+	const char *s = src;
+	const char *srcend = src + srclen;
+	char		*d = dst;
+	char		val;
+
+	while (s < srcend)
+	{
+		if (*s == '%')
+		{
+			if (s > srcend - 3) {
+				/* This will never get triggered since uri_dec_len already takes care of validation
+				 */
+				ereport(ERROR,
+						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+						 errmsg("invalid uri percent encoding"),
+						 errhint("Input data ends prematurely.")));
+			}
+
+			/* Skip '%' */
+			s++;
+
+			val = get_hex(*s++) << 4;
+			val += get_hex(*s++);
+			*d++ = val;
+		}
+		else
+		{
+			*d++ = *s++;
+		}
+	}
+	return d - dst;
+}
+
+static unsigned
+uri_enc_len(const char *src, unsigned srclen)
+{
+	int			len = 0;
+
+	for (const char *s = src; s < src + srclen; s++)
+	{
+		if ((*s >= 'A' && *s <= 'Z') ||
+			(*s >= 'a' && *s <= 'z') ||
+			(*s >= '0' && *s <= '9') ||
+			*s == '-' ||
+			*s == '_' ||
+			*s == '.' ||
+			*s == '~')
+		{
+			len++;
+		}
+		else
+		{
+			len += 3;
+		}
+	}
+	return len;
+}
+
+static unsigned
+uri_dec_len(const char *src, unsigned srclen)
+{
+	const char *s = src;
+	const char *srcend = src + srclen;
+	int			len = 0;
+
+	while (s < srcend)
+	{
+		if (*s == '%')
+		{
+			if (s > srcend - 3) {
+				ereport(ERROR,
+						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+						 errmsg("invalid uri percent encoding"),
+						 errhint("Input data ends prematurely.")));
+			}
+			s++;
+			get_hex(*s++);
+			get_hex(*s++);
+		}
+		else {
+			s++;
+		}
+		len++;
+	}
+	return len;
+}
+
 /*
  * Common
  */
@@ -541,6 +666,12 @@ static const struct
 			esc_enc_len, esc_dec_len, esc_encode, esc_decode
 		}
 	},
+	{
+		"uri",
+		{
+			uri_enc_len, uri_dec_len, uri_encode, uri_decode
+		}
+	},
 	{
 		NULL,
 		{
diff --git a/src/test/regress/expected/strings.out b/src/test/regress/expected/strings.out
index 2483966576..f89c5ec1c3 100644
--- a/src/test/regress/expected/strings.out
+++ b/src/test/regress/expected/strings.out
@@ -1870,3 +1870,24 @@ SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 5
  Th\000o\x02\x03
 (1 row)
 
+SET bytea_output TO hex;
+SELECT encode(E'en\\300\\336d'::bytea, 'uri');
+  encode   
+-----------
+ en%C0%DEd
+(1 row)
+
+SELECT decode('%De%c0%DEd', 'uri');
+   decode   
+------------
+ \xdec0de64
+(1 row)
+
+SELECT decode('error%Ex', 'uri');
+ERROR:  invalid hexadecimal digit: "x"
+SELECT decode('error%E', 'uri');
+ERROR:  invalid uri percent encoding
+HINT:  Input data ends prematurely.
+SELECT decode('error%', 'uri');
+ERROR:  invalid uri percent encoding
+HINT:  Input data ends prematurely.
diff --git a/src/test/regress/sql/strings.sql b/src/test/regress/sql/strings.sql
index b5e75c344f..1d03836b6e 100644
--- a/src/test/regress/sql/strings.sql
+++ b/src/test/regress/sql/strings.sql
@@ -641,3 +641,10 @@ SELECT btrim(E'\\000trim\\000'::bytea, ''::bytea);
 SELECT encode(overlay(E'Th\\000omas'::bytea placing E'Th\\001omas'::bytea from 2),'escape');
 SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 8),'escape');
 SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 5 for 3),'escape');
+
+SET bytea_output TO hex;
+SELECT encode(E'en\\300\\336d'::bytea, 'uri');
+SELECT decode('%De%c0%DEd', 'uri');
+SELECT decode('error%Ex', 'uri');
+SELECT decode('error%E', 'uri');
+SELECT decode('error%', 'uri');

Bruce Momjian

bruce@momjian.us

over 6 years ago

In reply to: Anders Åstrand (#1)

Re: PATCH: Add uri percent-encoding for binary data

On Mon, Oct 7, 2019 at 09:14:38AM +0200, Anders ï¿½strand wrote:

Hello

Attached is a patch for adding uri as an encoding option for
encode/decode. It uses what's called "percent-encoding" in rfc3986
(https://tools.ietf.org/html/rfc3986#section-2.1).

Oh, that's a cool idea. Can you add it to the commit-fest?

https://commitfest.postgresql.org/25/

---------------------------------------------------------------------------

The background for this patch is that I could easily build urls in
plpgsql, but doing the actual encoding of the url parts is painfully
slow. The list of available encodings for encode/decode looks quite
arbitrary to me, so I can't see any reason this one couldn't be in
there.

In modern web scenarios one would probably most likely want to encode
the utf8 representation of a text string for inclusion in a url, in
which case correct invocation would be ENCODE(CONVERT_TO('some text in
database encoding goes here', 'UTF8'), 'uri'), but uri
percent-encoding can of course also be used for other text encodings
and arbitrary binary data.

Regards,
Anders

diff --git a/src/backend/utils/adt/encode.c b/src/backend/utils/adt/encode.c
index 7293d66de5..33cf7bb57c 100644
--- a/src/backend/utils/adt/encode.c
+++ b/src/backend/utils/adt/encode.c
@@ -512,6 +512,131 @@ esc_dec_len(const char *src, unsigned srclen)
return len;
}

+/*
+ * URI percent encoding
+ *
+ * Percent encodes all byte values except the unreserved ASCII characters as per RFC3986.
+ */
+
+static const char upper_hex_digits[] = "0123456789ABCDEF";
+
+static unsigned
+uri_encode(const char *src, unsigned srclen, char *dst)
+{
+	char		*d = dst;
+
+	for (const char *s = src; s < src + srclen; s++)
+	{
+		if ((*s >= 'A' && *s <= 'Z') ||
+			(*s >= 'a' && *s <= 'z') ||
+			(*s >= '0' && *s <= '9') ||
+			*s == '-' ||
+			*s == '_' ||
+			*s == '.' ||
+			*s == '~')
+		{
+			*d++ = *s;
+		}
+		else
+		{
+			*d++ = '%';
+			*d++ = upper_hex_digits[(*s >> 4) & 0xF];
+			*d++ = upper_hex_digits[*s & 0xF];
+		}
+	}
+	return d - dst;
+}
+
+static unsigned
+uri_decode(const char *src, unsigned srclen, char *dst)
+{
+	const char *s = src;
+	const char *srcend = src + srclen;
+	char		*d = dst;
+	char		val;
+
+	while (s < srcend)
+	{
+		if (*s == '%')
+		{
+			if (s > srcend - 3) {
+				/* This will never get triggered since uri_dec_len already takes care of validation
+				 */
+				ereport(ERROR,
+						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+						 errmsg("invalid uri percent encoding"),
+						 errhint("Input data ends prematurely.")));
+			}
+
+			/* Skip '%' */
+			s++;
+
+			val = get_hex(*s++) << 4;
+			val += get_hex(*s++);
+			*d++ = val;
+		}
+		else
+		{
+			*d++ = *s++;
+		}
+	}
+	return d - dst;
+}
+
+static unsigned
+uri_enc_len(const char *src, unsigned srclen)
+{
+	int			len = 0;
+
+	for (const char *s = src; s < src + srclen; s++)
+	{
+		if ((*s >= 'A' && *s <= 'Z') ||
+			(*s >= 'a' && *s <= 'z') ||
+			(*s >= '0' && *s <= '9') ||
+			*s == '-' ||
+			*s == '_' ||
+			*s == '.' ||
+			*s == '~')
+		{
+			len++;
+		}
+		else
+		{
+			len += 3;
+		}
+	}
+	return len;
+}
+
+static unsigned
+uri_dec_len(const char *src, unsigned srclen)
+{
+	const char *s = src;
+	const char *srcend = src + srclen;
+	int			len = 0;
+
+	while (s < srcend)
+	{
+		if (*s == '%')
+		{
+			if (s > srcend - 3) {
+				ereport(ERROR,
+						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+						 errmsg("invalid uri percent encoding"),
+						 errhint("Input data ends prematurely.")));
+			}
+			s++;
+			get_hex(*s++);
+			get_hex(*s++);
+		}
+		else {
+			s++;
+		}
+		len++;
+	}
+	return len;
+}
+
/*
* Common
*/
@@ -541,6 +666,12 @@ static const struct
esc_enc_len, esc_dec_len, esc_encode, esc_decode
}
},
+	{
+		"uri",
+		{
+			uri_enc_len, uri_dec_len, uri_encode, uri_decode
+		}
+	},
{
NULL,
{
diff --git a/src/test/regress/expected/strings.out b/src/test/regress/expected/strings.out
index 2483966576..f89c5ec1c3 100644
--- a/src/test/regress/expected/strings.out
+++ b/src/test/regress/expected/strings.out
@@ -1870,3 +1870,24 @@ SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 5
Th\000o\x02\x03
(1 row)

+SET bytea_output TO hex;
+SELECT encode(E'en\\300\\336d'::bytea, 'uri');
+  encode   
+-----------
+ en%C0%DEd
+(1 row)
+
+SELECT decode('%De%c0%DEd', 'uri');
+   decode   
+------------
+ \xdec0de64
+(1 row)
+
+SELECT decode('error%Ex', 'uri');
+ERROR:  invalid hexadecimal digit: "x"
+SELECT decode('error%E', 'uri');
+ERROR:  invalid uri percent encoding
+HINT:  Input data ends prematurely.
+SELECT decode('error%', 'uri');
+ERROR:  invalid uri percent encoding
+HINT:  Input data ends prematurely.
diff --git a/src/test/regress/sql/strings.sql b/src/test/regress/sql/strings.sql
index b5e75c344f..1d03836b6e 100644
--- a/src/test/regress/sql/strings.sql
+++ b/src/test/regress/sql/strings.sql
@@ -641,3 +641,10 @@ SELECT btrim(E'\\000trim\\000'::bytea, ''::bytea);
SELECT encode(overlay(E'Th\\000omas'::bytea placing E'Th\\001omas'::bytea from 2),'escape');
SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 8),'escape');
SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 5 for 3),'escape');
+
+SET bytea_output TO hex;
+SELECT encode(E'en\\300\\336d'::bytea, 'uri');
+SELECT decode('%De%c0%DEd', 'uri');
+SELECT decode('error%Ex', 'uri');
+SELECT decode('error%E', 'uri');
+SELECT decode('error%', 'uri');

--
Bruce Momjian <bruce@momjian.us> http://momjian.us
EnterpriseDB http://enterprisedb.com

+ As you are, so once was I.  As I am, so you will be. +
+                      Ancient Roman grave inscription +

Isaac Morland

isaac.morland@gmail.com

over 6 years ago

In reply to: Anders Åstrand (#1)

Re: PATCH: Add uri percent-encoding for binary data

On Mon, 7 Oct 2019 at 03:15, Anders Åstrand <anders@449.se> wrote:

Hello

Attached is a patch for adding uri as an encoding option for
encode/decode. It uses what's called "percent-encoding" in rfc3986
(https://tools.ietf.org/html/rfc3986#section-2.1).

The background for this patch is that I could easily build urls in
plpgsql, but doing the actual encoding of the url parts is painfully
slow. The list of available encodings for encode/decode looks quite
arbitrary to me, so I can't see any reason this one couldn't be in
there.

In modern web scenarios one would probably most likely want to encode
the utf8 representation of a text string for inclusion in a url, in
which case correct invocation would be ENCODE(CONVERT_TO('some text in
database encoding goes here', 'UTF8'), 'uri'), but uri
percent-encoding can of course also be used for other text encodings
and arbitrary binary data.

This seems like a useful idea to me. I've used the equivalent in Python and
it provides more options:

https://docs.python.org/3/library/urllib.parse.html#url-quoting

I suggest reviewing that documentation there, because there are a few
details that need to be checked carefully. Whether or not space should be
encoded as plus and whether certain byte values should be exempt from
%-encoding is something that depends on the application. Unfortunately, as
far as I can tell there isn't a single version of URL encoding that
satisfies all situations (thus explaining the complexity of the Python
implementation). It might be feasible to suppress some of the Python
options (I'm wondering about the safe= parameter) but I'm pretty sure you
at least need the equivalent of quote and quote_plus.

Anders Åstrand

anders@449.se

over 6 years ago

In reply to: Bruce Momjian (#2)

Re: PATCH: Add uri percent-encoding for binary data

On Mon, Oct 7, 2019 at 9:52 PM Bruce Momjian <bruce@momjian.us> wrote:

On Mon, Oct 7, 2019 at 09:14:38AM +0200, Anders Åstrand wrote:

Hello

Attached is a patch for adding uri as an encoding option for
encode/decode. It uses what's called "percent-encoding" in rfc3986
(https://tools.ietf.org/html/rfc3986#section-2.1).

Oh, that's a cool idea. Can you add it to the commit-fest?

https://commitfest.postgresql.org/25/

Thanks for your reply! I added it but was unsure of what topic was
appropriate and couldn't find a description of them anywhere. I went
with Miscellaneous for now.

Anders Åstrand

anders@449.se

over 6 years ago

In reply to: Isaac Morland (#3)

Re: PATCH: Add uri percent-encoding for binary data

On Mon, Oct 7, 2019 at 11:38 PM Isaac Morland <isaac.morland@gmail.com> wrote:

On Mon, 7 Oct 2019 at 03:15, Anders Åstrand <anders@449.se> wrote:

Hello

Attached is a patch for adding uri as an encoding option for
encode/decode. It uses what's called "percent-encoding" in rfc3986
(https://tools.ietf.org/html/rfc3986#section-2.1).

The background for this patch is that I could easily build urls in
plpgsql, but doing the actual encoding of the url parts is painfully
slow. The list of available encodings for encode/decode looks quite
arbitrary to me, so I can't see any reason this one couldn't be in
there.

In modern web scenarios one would probably most likely want to encode
the utf8 representation of a text string for inclusion in a url, in
which case correct invocation would be ENCODE(CONVERT_TO('some text in
database encoding goes here', 'UTF8'), 'uri'), but uri
percent-encoding can of course also be used for other text encodings
and arbitrary binary data.

This seems like a useful idea to me. I've used the equivalent in Python and it provides more options:

https://docs.python.org/3/library/urllib.parse.html#url-quoting

I suggest reviewing that documentation there, because there are a few details that need to be checked carefully. Whether or not space should be encoded as plus and whether certain byte values should be exempt from %-encoding is something that depends on the application. Unfortunately, as far as I can tell there isn't a single version of URL encoding that satisfies all situations (thus explaining the complexity of the Python implementation). It might be feasible to suppress some of the Python options (I'm wondering about the safe= parameter) but I'm pretty sure you at least need the equivalent of quote and quote_plus.

Thanks a lot for your reply!

I agree that some (but not all) of the options available to that
python lib could be helpful for developers wanting to build urls
without having to encode the separate parts of it and stitching it
together, but not necessary for this patch to be useful. For generic
uri encoding the slash (/) must be percent encoded, because it has
special meaning in the standard. Some other extra characters may
appear unencoded though depending on context, but it's generally safer
to just encode them all and not hope that the encoder will know about
the context and skip over certain characters.

This does bring up an interesting point however. Maybe decode should
validate that only characters that are allowed unencoded appear in the
input?

Luckily, the plus-encoding of spaces are not part of the uri standard
at all but instead part of the format referred to as
application/x-www-form-urlencoded data. Fortunately that format is
close to dying now that forms more often post json.

Regards,
Anders

Arthur Zakirov

zaartur@gmail.com

about 6 years ago

In reply to: Anders Åstrand (#1)

Re: PATCH: Add uri percent-encoding for binary data

Hello,

On 2019/10/07 16:14, Anders Åstrand wrote:

Hello

Attached is a patch for adding uri as an encoding option for
encode/decode. It uses what's called "percent-encoding" in rfc3986
(https://tools.ietf.org/html/rfc3986#section-2.1).

Thank you for the patch. I'm not very familiar with rfc3986. Is it
insist that an output should have upper case characters? If not maybe it
is good to reuse hextbl[] (which is in encode.c) instead of adding new
upper_hex_digits[].

Also can you correct the documentation. encode() is mentioned here:
https://www.postgresql.org/docs/current/functions-binarystring.html

--
Arthur

Alvaro Herrera

alvherre@2ndquadrant.com

almost 6 years ago

In reply to: Anders Åstrand (#1)

1 attachment(s)

Re: PATCH: Add uri percent-encoding for binary data

On 2019-Oct-07, Anders ï¿½strand wrote:

Attached is a patch for adding uri as an encoding option for
encode/decode. It uses what's called "percent-encoding" in rfc3986
(https://tools.ietf.org/html/rfc3986#section-2.1).

Thanks. Seems useful. I made a few cosmetic tweaks and it looks almost
ready to me; however, documentation is missing. I added a stub; can you
please complete that?

To answer Arthur Zakirov's question: yes, the standard recommends
("should") to use uppercase characters:

: For consistency, URI producers and
: normalizers should use uppercase hexadecimal digits for all percent-
: encodings.

Thanks,

--
ï¿½lvaro Herrera https://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services

Attachments:

v2-0001-URI-encode.patchtext/x-diff; charset=iso-8859-1Download

From 44475f709762ba1a2a881d20345cc6a4cb086f01 Mon Sep 17 00:00:00 2001
From: Alvaro Herrera <alvherre@alvh.no-ip.org>
Date: Thu, 20 Feb 2020 18:46:15 -0300
Subject: [PATCH v2] URI encode
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Author: Anders Ãstrand
Discussion: https://postgr.es/m/APwPebtwJnjjt=eUUsmL1zZ6w3jVnA1cVjEzhBoucCYTJc9D2w@mail.gmail.com
---
 doc/src/sgml/func.sgml                |  16 +++-
 src/backend/utils/adt/encode.c        | 129 ++++++++++++++++++++++++++
 src/test/regress/expected/strings.out |  21 +++++
 src/test/regress/sql/strings.sql      |   7 ++
 4 files changed, 172 insertions(+), 1 deletion(-)

diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index ceda48e0fc..c60ad4f4e2 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -3180,7 +3180,8 @@ SELECT format('Testing %3$s, %2$s, %s', 'one', 'two', 'three');
        <parameter>format</parameter> values are:
        <link linkend="encode-format-base64"><literal>base64</literal></link>,
        <link linkend="encode-format-escape"><literal>escape</literal></link>,
-       <link linkend="encode-format-hex"><literal>hex</literal></link>
+       <link linkend="encode-format-hex"><literal>hex</literal></link>,
+       <link linkend="encode-format-uri"><literal>uri</literal></link>
       </entry>
       <entry><literal>encode('123\000\001', 'base64')</literal></entry>
       <entry><literal>MTIzAAE=</literal></entry>
@@ -3274,6 +3275,19 @@ SELECT format('Testing %3$s, %2$s, %s', 'one', 'two', 'three');
       </para>
      </listitem>
     </varlistentry>
+
+    <varlistentry id="encode-format-uri">
+     <term>uri
+     <indexterm>
+      <primary>uri format</primary>
+     </indexterm></term>
+     <listitem>
+      <para>
+       The <literal>uri</literal> format represents ...
+      </para>
+     </listitem>
+    </varlistentry>
+
    </variablelist>
   </para>
 
diff --git a/src/backend/utils/adt/encode.c b/src/backend/utils/adt/encode.c
index b8d9ec7e00..81d4ea8400 100644
--- a/src/backend/utils/adt/encode.c
+++ b/src/backend/utils/adt/encode.c
@@ -110,6 +110,7 @@ binary_decode(PG_FUNCTION_ARGS)
  */
 
 static const char hextbl[] = "0123456789abcdef";
+static const char hextbl_upper[] = "0123456789ABCDEF";
 
 static const int8 hexlookup[128] = {
 	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
@@ -512,6 +513,128 @@ esc_dec_len(const char *src, unsigned srclen)
 	return len;
 }
 
+/*
+ * URI percent encoding
+ *
+ * Percent encodes all byte values except the unreserved ASCII characters as
+ * per RFC3986.
+ */
+
+static unsigned
+uri_encode(const char *src, unsigned srclen, char *dst)
+{
+	char	   *d = dst;
+
+	for (const char *s = src; s < src + srclen; s++)
+	{
+		/*
+		 * RFC3986:
+		 *
+		 * unreserved  = ALPHA / DIGIT / "-" / "." / "_" / "~"
+		 */
+		if ((*s >= 'A' && *s <= 'Z') ||
+			(*s >= 'a' && *s <= 'z') ||
+			(*s >= '0' && *s <= '9') ||
+			*s == '-' ||
+			*s == '.' ||
+			*s == '_' ||
+			*s == '~')
+		{
+			*d++ = *s;
+		}
+		else
+		{
+			*d++ = '%';
+			*d++ = hextbl_upper[(*s >> 4) & 0xF];
+			*d++ = hextbl_upper[*s & 0xF];
+		}
+	}
+	return d - dst;
+}
+
+static unsigned
+uri_decode(const char *src, unsigned srclen, char *dst)
+{
+	const char *s = src;
+	const char *srcend = src + srclen;
+	char	   *d = dst;
+	char		val;
+
+	while (s < srcend)
+	{
+		if (*s == '%')
+		{
+			/*
+			 * Verify we have the needed bytes.  This doesn't happen, since
+			 * uri_dec_len already takes care of validation.
+			 */
+			if (s > srcend - 3)
+				elog(ERROR, "invalid uri percent encoding");
+
+			/* Skip '%' */
+			s++;
+
+			val = get_hex(*s++) << 4;
+			val += get_hex(*s++);
+			*d++ = val;
+		}
+		else
+			*d++ = *s++;
+	}
+	return d - dst;
+}
+
+static unsigned
+uri_enc_len(const char *src, unsigned srclen)
+{
+	int			len = 0;
+
+	for (const char *s = src; s < src + srclen; s++)
+	{
+		if ((*s >= 'A' && *s <= 'Z') ||
+			(*s >= 'a' && *s <= 'z') ||
+			(*s >= '0' && *s <= '9') ||
+			*s == '-' ||
+			*s == '_' ||
+			*s == '.' ||
+			*s == '~')
+		{
+			len++;
+		}
+		else
+			len += 3;
+	}
+	return len;
+}
+
+static unsigned
+uri_dec_len(const char *src, unsigned srclen)
+{
+	const char *s = src;
+	const char *srcend = src + srclen;
+	int			len = 0;
+
+	while (s < srcend)
+	{
+		if (*s == '%')
+		{
+			if (s > srcend - 3)
+				ereport(ERROR,
+						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+						 errmsg("invalid uri percent encoding"),
+						 errhint("Input data ends prematurely.")));
+			s++;
+			get_hex(*s++);
+			get_hex(*s++);
+		}
+		else
+			s++;
+		len++;
+	}
+
+	return len;
+}
+
 /*
  * Common
  */
@@ -541,6 +664,12 @@ static const struct
 			esc_enc_len, esc_dec_len, esc_encode, esc_decode
 		}
 	},
+	{
+		"uri",
+		{
+			uri_enc_len, uri_dec_len, uri_encode, uri_decode
+		}
+	},
 	{
 		NULL,
 		{
diff --git a/src/test/regress/expected/strings.out b/src/test/regress/expected/strings.out
index 60cb86193c..a79ef6ac10 100644
--- a/src/test/regress/expected/strings.out
+++ b/src/test/regress/expected/strings.out
@@ -1892,3 +1892,24 @@ SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 5
  Th\000o\x02\x03
 (1 row)
 
+SET bytea_output TO hex;
+SELECT encode(E'en\\300\\336d'::bytea, 'uri');
+  encode   
+-----------
+ en%C0%DEd
+(1 row)
+
+SELECT decode('%De%c0%DEd', 'uri');
+   decode   
+------------
+ \xdec0de64
+(1 row)
+
+SELECT decode('error%Ex', 'uri');
+ERROR:  invalid hexadecimal digit: "x"
+SELECT decode('error%E', 'uri');
+ERROR:  invalid uri percent encoding
+HINT:  Input data ends prematurely.
+SELECT decode('error%', 'uri');
+ERROR:  invalid uri percent encoding
+HINT:  Input data ends prematurely.
diff --git a/src/test/regress/sql/strings.sql b/src/test/regress/sql/strings.sql
index c5cd15142a..8a7b103681 100644
--- a/src/test/regress/sql/strings.sql
+++ b/src/test/regress/sql/strings.sql
@@ -648,3 +648,10 @@ SELECT btrim(E'\\000trim\\000'::bytea, ''::bytea);
 SELECT encode(overlay(E'Th\\000omas'::bytea placing E'Th\\001omas'::bytea from 2),'escape');
 SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 8),'escape');
 SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 5 for 3),'escape');
+
+SET bytea_output TO hex;
+SELECT encode(E'en\\300\\336d'::bytea, 'uri');
+SELECT decode('%De%c0%DEd', 'uri');
+SELECT decode('error%Ex', 'uri');
+SELECT decode('error%E', 'uri');
+SELECT decode('error%', 'uri');
-- 
2.20.1

Anders Åstrand

anders@449.se

almost 6 years ago

In reply to: Alvaro Herrera (#7)

Re: PATCH: Add uri percent-encoding for binary data

Thanks for keeping this alive even though I disappeared after submitting it!

I can write documentation this weekend.

Thanks again.
//Anders

On Thu, 20 Feb 2020, 23:28 Alvaro Herrera, <alvherre@2ndquadrant.com> wrote:

Show quoted text

On 2019-Oct-07, Anders Åstrand wrote:

Attached is a patch for adding uri as an encoding option for
encode/decode. It uses what's called "percent-encoding" in rfc3986
(https://tools.ietf.org/html/rfc3986#section-2.1).

Thanks. Seems useful. I made a few cosmetic tweaks and it looks almost
ready to me; however, documentation is missing. I added a stub; can you
please complete that?

To answer Arthur Zakirov's question: yes, the standard recommends
("should") to use uppercase characters:

: For consistency, URI producers and
: normalizers should use uppercase hexadecimal digits for all percent-
: encodings.

Thanks,

--
Álvaro Herrera https://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services

Daniel Gustafsson

daniel@yesql.se

almost 6 years ago

In reply to: Alvaro Herrera (#7)

Re: PATCH: Add uri percent-encoding for binary data

On 20 Feb 2020, at 23:27, Alvaro Herrera <alvherre@2ndquadrant.com> wrote:

On 2019-Oct-07, Anders Åstrand wrote:

Attached is a patch for adding uri as an encoding option for
encode/decode. It uses what's called "percent-encoding" in rfc3986
(https://tools.ietf.org/html/rfc3986#section-2.1).

Thanks. Seems useful. I made a few cosmetic tweaks and it looks almost
ready to me;

I agree that uri decoding/encoding would be useful, but I'm not convinced that
this patch does the functionality justice enough to be useful. What is the
usecase we envision to solve when not taking scheme into consideration?

Reserved characters have different meaning based on context and scheme, and
should not be encoded when used as a delimiter. This does make the patch a lot
more complicated, but if we provide a uri encoding which percent-encode the
delimiters in https:// I would expect that to be reported to pgsql-bugs@
repeatedly. Adding URIs with userinfo makes it even more problematic, as
encoding the @ delimiter will break it.

Further, RFC6874 specifies that ipv6 URIs with zone identifiers are written as:
IPv6address "%25" ZoneID. With this patch it would be encoded %2525 ZoneID
which is incorrect.

That being said, if we do look at the scheme then we'll need to decide which
URI standard we want to stick to as RFC3986 and WHATWG URL-spec aren't
compatible.

Perhaps not calling it 'uri' and instead renaming it to 'percent-encoding' can
make it clearer, while sticking to the proposed feature?

cheers ./daniel

#10

Daniel Gustafsson

daniel@yesql.se

almost 6 years ago

In reply to: Daniel Gustafsson (#9)

Re: PATCH: Add uri percent-encoding for binary data

On 4 Mar 2020, at 12:25, Daniel Gustafsson <daniel@yesql.se> wrote:

On 20 Feb 2020, at 23:27, Alvaro Herrera <alvherre@2ndquadrant.com> wrote:

On 2019-Oct-07, Anders Åstrand wrote:

Attached is a patch for adding uri as an encoding option for
encode/decode. It uses what's called "percent-encoding" in rfc3986
(https://tools.ietf.org/html/rfc3986#section-2.1).

Thanks. Seems useful. I made a few cosmetic tweaks and it looks almost
ready to me;

I agree that uri decoding/encoding would be useful, but I'm not convinced that
this patch does the functionality justice enough to be useful. What is the
usecase we envision to solve when not taking scheme into consideration?

Reserved characters have different meaning based on context and scheme, and
should not be encoded when used as a delimiter. This does make the patch a lot
more complicated, but if we provide a uri encoding which percent-encode the
delimiters in https:// I would expect that to be reported to pgsql-bugs@
repeatedly. Adding URIs with userinfo makes it even more problematic, as
encoding the @ delimiter will break it.

Further, RFC6874 specifies that ipv6 URIs with zone identifiers are written as:
IPv6address "%25" ZoneID. With this patch it would be encoded %2525 ZoneID
which is incorrect.

That being said, if we do look at the scheme then we'll need to decide which
URI standard we want to stick to as RFC3986 and WHATWG URL-spec aren't
compatible.

Perhaps not calling it 'uri' and instead renaming it to 'percent-encoding' can
make it clearer, while sticking to the proposed feature?

With no response for 2 weeks during the commitfest, I propose to move this to
the next CF to allow time for discussions.

cheers ./daniel

#11

Daniel Gustafsson

daniel@yesql.se

over 5 years ago

In reply to: Daniel Gustafsson (#10)

Re: PATCH: Add uri percent-encoding for binary data

On 19 Mar 2020, at 08:55, Daniel Gustafsson <daniel@yesql.se> wrote:

With no response for 2 weeks during the commitfest, I propose to move this to
the next CF to allow time for discussions.

This patch no longer applies, the failing hunk being in the docs part. As
stated in my review earlier in the thread I don't think this feature is
complete enough in its current form; having hacked on it a bit, what are your
thoughts Alvaro?

Marking as Waiting on Author for now.

cheers ./daniel

#12

Alvaro Herrera

alvherre@2ndquadrant.com

over 5 years ago

In reply to: Daniel Gustafsson (#11)

Re: PATCH: Add uri percent-encoding for binary data

On 2020-Jul-01, Daniel Gustafsson wrote:

On 19 Mar 2020, at 08:55, Daniel Gustafsson <daniel@yesql.se> wrote:

With no response for 2 weeks during the commitfest, I propose to move this to
the next CF to allow time for discussions.

This patch no longer applies, the failing hunk being in the docs part. As
stated in my review earlier in the thread I don't think this feature is
complete enough in its current form; having hacked on it a bit, what are your
thoughts Alvaro?

If the author (or some other person interested in the feature) submits a
version addressing the feedback, by all means let's consider it further;
but if nothing happens during this commitfest, I'd say we close as RwF
at end of July.

--
ï¿½lvaro Herrera https://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services

#13

Daniel Gustafsson

daniel@yesql.se

over 5 years ago

In reply to: Alvaro Herrera (#12)

Re: PATCH: Add uri percent-encoding for binary data

On 1 Jul 2020, at 16:58, Alvaro Herrera <alvherre@2ndquadrant.com> wrote:

On 2020-Jul-01, Daniel Gustafsson wrote:

On 19 Mar 2020, at 08:55, Daniel Gustafsson <daniel@yesql.se> wrote:

With no response for 2 weeks during the commitfest, I propose to move this to
the next CF to allow time for discussions.

This patch no longer applies, the failing hunk being in the docs part. As
stated in my review earlier in the thread I don't think this feature is
complete enough in its current form; having hacked on it a bit, what are your
thoughts Alvaro?

If the author (or some other person interested in the feature) submits a
version addressing the feedback, by all means let's consider it further;
but if nothing happens during this commitfest, I'd say we close as RwF
at end of July.

As per discussion, this entry is closed as "Returned with Feedback".

cheers ./daniel