[PATCH] Add crc32(text) & crc32(bytea)
Hi,
While answering one of the recent questions [1]/messages/by-id/CAJ7c6TOurV4uA5Yz=aJ-ae4czL_zdFNqxbu47eyVrYFefrWoog@mail.gmail.com I wanted to use
crc32(text) and discovered that it's missing out-of-the box. Of
course, one can use `substr(md5(x), 1, 8)` with almost the same effect
but it's less convenient and could be slower (I didn't do actual
benchmarks though). Also it's incompatible with third-party software
that may calculate crc32's and store the results in PostgreSQL.
I vaguely recall that I faced this problem before. Supporting crc32
was requested on the mailing list [2]/messages/by-id/auto-000557707157@umail.ru and a number of workarounds
exist in PL/pgSQL [3]https://stackoverflow.com/questions/28179335/crc32-function-with-pl-pgsql[4]https://gist.github.com/cuber/bcf0a3a96fc9a790d96d. Since there seems to be a demand and it
costs us nothing to maintain crc32() I suggest adding it.
The proposed patch exposes our internal crc32 implementation to the
user. I chose to return a hex string similarly to md5(). In my humble
experience this is most convenient in practical use. However if the
majority believes that the function should return a bigint (in order
to fit an unsigned int32) or a bytea (as SHA* functions do), I'm fine
with whatever consensus the community reaches.
[1]: /messages/by-id/CAJ7c6TOurV4uA5Yz=aJ-ae4czL_zdFNqxbu47eyVrYFefrWoog@mail.gmail.com
[2]: /messages/by-id/auto-000557707157@umail.ru
[3]: https://stackoverflow.com/questions/28179335/crc32-function-with-pl-pgsql
[4]: https://gist.github.com/cuber/bcf0a3a96fc9a790d96d
--
Best regards,
Aleksander Alekseev
Attachments:
v1-0001-Add-crc32-text-crc32-bytea.patchapplication/octet-stream; name=v1-0001-Add-crc32-text-crc32-bytea.patchDownload
From 185072d5c64f6d252db1d3440f22dcca7ba2b424 Mon Sep 17 00:00:00 2001
From: Aleksander Alekseev <aleksander@timescale.com>
Date: Thu, 18 Jul 2024 12:59:40 +0300
Subject: [PATCH v1] Add crc32(text) & crc32(bytea)
Per several user requests.
Aleksander Alekseev, reviewed by TODO FIXME
Discussion: TODO FIXME
---
doc/src/sgml/func.sgml | 36 ++++++++++
src/backend/utils/adt/Makefile | 1 +
src/backend/utils/adt/hashfuncs.c | 90 ++++++++++++++++++++++++
src/backend/utils/adt/meson.build | 1 +
src/include/catalog/pg_proc.dat | 8 +++
src/test/regress/expected/opr_sanity.out | 2 +
src/test/regress/expected/strings.out | 27 +++++++
src/test/regress/sql/strings.sql | 10 +++
8 files changed, 175 insertions(+)
create mode 100644 src/backend/utils/adt/hashfuncs.c
diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index 3f93c61aa3..7943b1ee2a 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -3030,6 +3030,24 @@ SELECT NOT(ROW(table.*) IS NOT NULL) FROM TABLE; -- detect at least one null in
</para></entry>
</row>
+ <row>
+ <entry role="func_table_entry"><para role="func_signature">
+ <indexterm>
+ <primary>crc32</primary>
+ </indexterm>
+ <function>crc32</function> ( <type>text</type> )
+ <returnvalue>text</returnvalue>
+ </para>
+ <para>
+ Computes the CRC32 <link linkend="functions-hash-note">hash</link> of
+ the argument, with the result written in hexadecimal.
+ </para>
+ <para>
+ <literal>crc32('PostgreSQL')</literal>
+ <returnvalue>cb97b83b</returnvalue>
+ </para></entry>
+ </row>
+
<row>
<entry role="func_table_entry"><para role="func_signature">
<indexterm>
@@ -4484,6 +4502,24 @@ SELECT format('Testing %3$s, %2$s, %s', 'one', 'two', 'three');
</para></entry>
</row>
+ <row>
+ <entry role="func_table_entry"><para role="func_signature">
+ <indexterm>
+ <primary>crc32</primary>
+ </indexterm>
+ <function>crc32</function> ( <type>bytea</type> )
+ <returnvalue>text</returnvalue>
+ </para>
+ <para>
+ Computes the CRC32 <link linkend="functions-hash-note">hash</link> of
+ the binary string, with the result written in hexadecimal.
+ </para>
+ <para>
+ <literal>crc32('PostgreSQL' :: bytea)</literal>
+ <returnvalue>cb97b83b</returnvalue>
+ </para></entry>
+ </row>
+
<row>
<entry role="func_table_entry"><para role="func_signature">
<indexterm>
diff --git a/src/backend/utils/adt/Makefile b/src/backend/utils/adt/Makefile
index edb09d4e35..be84d68856 100644
--- a/src/backend/utils/adt/Makefile
+++ b/src/backend/utils/adt/Makefile
@@ -42,6 +42,7 @@ OBJS = \
geo_ops.o \
geo_selfuncs.o \
geo_spgist.o \
+ hashfuncs.o \
hbafuncs.o \
inet_cidr_ntop.o \
inet_net_pton.o \
diff --git a/src/backend/utils/adt/hashfuncs.c b/src/backend/utils/adt/hashfuncs.c
new file mode 100644
index 0000000000..13a5fe55fc
--- /dev/null
+++ b/src/backend/utils/adt/hashfuncs.c
@@ -0,0 +1,90 @@
+/*-------------------------------------------------------------------------
+ *
+ * hashfuncs.c
+ * Non-cryptographic hash functions
+ *
+ * Portions Copyright (c) 2024, PostgreSQL Global Development Group
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/utils/adt/hashfuncs.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "utils/builtins.h"
+#include "utils/pg_crc.h"
+#include "varatt.h"
+
+/*
+ * Calculate CRC32 of the given data.
+ *
+ * Common code for crc32_text() and crc32_bytea().
+ */
+static unsigned int
+crc32_sz(const char *buf, int size)
+{
+ pg_crc32 crc;
+ const char *p = buf;
+
+ INIT_TRADITIONAL_CRC32(crc);
+ while (size > 0)
+ {
+ char c = (char) (*p);
+
+ COMP_TRADITIONAL_CRC32(crc, &c, 1);
+ size--;
+ p++;
+ }
+ FIN_TRADITIONAL_CRC32(crc);
+ return (unsigned int) crc;
+}
+
+/*
+ * Create a CRC32 hash of a text value and return it as hex string.
+ */
+Datum
+crc32_text(PG_FUNCTION_ARGS)
+{
+ text *in_text = PG_GETARG_TEXT_PP(0);
+ size_t len;
+ unsigned int hashsum;
+ char result[16];
+
+ /* calculate the length of the buffer using varlena metadata */
+ len = VARSIZE_ANY_EXHDR(in_text);
+
+ /* get the hash result */
+ hashsum = crc32_sz(VARDATA_ANY(in_text), len);
+
+ /* format the hex string */
+ snprintf(result, sizeof(result), "%08x", hashsum);
+
+ /* convert to text and return it */
+ PG_RETURN_TEXT_P(cstring_to_text(result));
+}
+
+/*
+ * Create a CRC32 hash of a bytea value and return it as a hex string.
+ */
+Datum
+crc32_bytea(PG_FUNCTION_ARGS)
+{
+ bytea *in = PG_GETARG_BYTEA_PP(0);
+ size_t len;
+ unsigned int hashsum;
+ char result[16];
+
+ /* calculate the length of the buffer using varlena metadata */
+ len = VARSIZE_ANY_EXHDR(in);
+
+ /* get the hash result */
+ hashsum = crc32_sz(VARDATA_ANY(in), len);
+
+ /* format the hex string */
+ snprintf(result, sizeof(result), "%08x", hashsum);
+
+ /* convert to text and return it */
+ PG_RETURN_TEXT_P(cstring_to_text(result));
+}
diff --git a/src/backend/utils/adt/meson.build b/src/backend/utils/adt/meson.build
index 8c6fc80c37..6239ea7b42 100644
--- a/src/backend/utils/adt/meson.build
+++ b/src/backend/utils/adt/meson.build
@@ -31,6 +31,7 @@ backend_sources += files(
'geo_ops.c',
'geo_selfuncs.c',
'geo_spgist.c',
+ 'hashfuncs.c',
'hbafuncs.c',
'inet_cidr_ntop.c',
'inet_net_pton.c',
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index 73d9cf8582..0f79998bba 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -7704,6 +7704,14 @@
proname => 'system', provolatile => 'v', prorettype => 'tsm_handler',
proargtypes => 'internal', prosrc => 'tsm_system_handler' },
+# non-cryptographic
+{ oid => '8571', descr => 'CRC32 hash',
+ proname => 'crc32', proleakproof => 't', prorettype => 'text',
+ proargtypes => 'text', prosrc => 'crc32_text' },
+{ oid => '8572', descr => 'CRC32 hash',
+ proname => 'crc32', proleakproof => 't', prorettype => 'text',
+ proargtypes => 'bytea', prosrc => 'crc32_bytea' },
+
# cryptographic
{ oid => '2311', descr => 'MD5 hash',
proname => 'md5', proleakproof => 't', prorettype => 'text',
diff --git a/src/test/regress/expected/opr_sanity.out b/src/test/regress/expected/opr_sanity.out
index 9d047b21b8..e170a9bb1b 100644
--- a/src/test/regress/expected/opr_sanity.out
+++ b/src/test/regress/expected/opr_sanity.out
@@ -874,6 +874,8 @@ xid8ne(xid8,xid8)
xid8cmp(xid8,xid8)
uuid_extract_timestamp(uuid)
uuid_extract_version(uuid)
+crc32(text)
+crc32(bytea)
-- restore normal output mode
\a\t
-- List of functions used by libpq's fe-lobj.c
diff --git a/src/test/regress/expected/strings.out b/src/test/regress/expected/strings.out
index 52b69a107f..de77070038 100644
--- a/src/test/regress/expected/strings.out
+++ b/src/test/regress/expected/strings.out
@@ -2203,6 +2203,33 @@ select to_hex(256::bigint*256::bigint*256::bigint*256::bigint - 1) AS "ffffffff"
ffffffff
(1 row)
+--
+-- CRC32
+--
+SELECT crc32('');
+ crc32
+----------
+ 00000000
+(1 row)
+
+SELECT crc32('The quick brown fox jumps over the lazy dog.');
+ crc32
+----------
+ 519025e9
+(1 row)
+
+SELECT crc32('' :: bytea);
+ crc32
+----------
+ 00000000
+(1 row)
+
+SELECT crc32('The quick brown fox jumps over the lazy dog.' :: bytea);
+ crc32
+----------
+ 519025e9
+(1 row)
+
--
-- SHA-2
--
diff --git a/src/test/regress/sql/strings.sql b/src/test/regress/sql/strings.sql
index 3959678992..93149d170a 100644
--- a/src/test/regress/sql/strings.sql
+++ b/src/test/regress/sql/strings.sql
@@ -702,6 +702,16 @@ select to_hex(-1234::bigint) AS "fffffffffffffb2e";
select to_hex(256*256*256 - 1) AS "ffffff";
select to_hex(256::bigint*256::bigint*256::bigint*256::bigint - 1) AS "ffffffff";
+--
+-- CRC32
+--
+
+SELECT crc32('');
+SELECT crc32('The quick brown fox jumps over the lazy dog.');
+
+SELECT crc32('' :: bytea);
+SELECT crc32('The quick brown fox jumps over the lazy dog.' :: bytea);
+
--
-- SHA-2
--
--
2.45.2
On Thu, Jul 18, 2024 at 02:24:23PM +0300, Aleksander Alekseev wrote:
I vaguely recall that I faced this problem before. Supporting crc32
was requested on the mailing list [2] and a number of workarounds
exist in PL/pgSQL [3][4]. Since there seems to be a demand and it
costs us nothing to maintain crc32() I suggest adding it.
This sounds generally reasonable to me, especially given the apparent
demand. Should we also introduce crc32c() while we're at it?
--
nathan
Hi,
This sounds generally reasonable to me, especially given the apparent
demand. Should we also introduce crc32c() while we're at it?
Might be a good idea. However I didn't see a demand for crc32c() SQL
function yet. Also I'm not sure whether the best interface for it
would be crc32c() or crc32(x, version='c') or perhaps crc32(x,
polinomial=...). I propose keeping the scope small this time.
--
Best regards,
Aleksander Alekseev
On Fri, Jul 26, 2024 at 12:01:40PM +0300, Aleksander Alekseev wrote:
This sounds generally reasonable to me, especially given the apparent
demand. Should we also introduce crc32c() while we're at it?Might be a good idea. However I didn't see a demand for crc32c() SQL
function yet. Also I'm not sure whether the best interface for it
would be crc32c() or crc32(x, version='c') or perhaps crc32(x,
polinomial=...). I propose keeping the scope small this time.
I don't think adding crc32c() would sufficiently increase the scope. We'd
use the existing implementations for both crc32() and crc32c(). And
besides, this could be useful for adding tests for that code.
+ <function>crc32</function> ( <type>text</type> )
Do we need a version of the function that takes a text input? It's easy
enough to cast to a bytea.
+ <returnvalue>text</returnvalue>
My first reaction is that we should just have this return bytea like the
SHA ones do, if for no other reason than commit 10cfce3 seems intended to
move us away from returning text for these kinds of functions. Upthread,
you mentioned the possibility of returning a bigint, too. I think I'd
still prefer bytea in case we want to add, say, crc64() or crc16() in the
future. That would allow us to keep all of these functions consistent
instead of returning different types for each. However, I understand that
returning the numeric types might be more convenient. I'm curious what
others think about this.
+ Computes the CRC32 <link linkend="functions-hash-note">hash</link> of
+ the binary string, with the result written in hexadecimal.
I'm not sure we should call the check values "hashes." Wikipedia does
include them in the "List of hash functions" page [0]https://en.wikipedia.org/wiki/List_of_hash_functions, but it seems to
deliberately avoid calling them hashes in the CRC page [1]https://en.wikipedia.org/wiki/Cyclic_redundancy_check. I'd suggest
calling them "CRC32 values" instead.
[0]: https://en.wikipedia.org/wiki/List_of_hash_functions
[1]: https://en.wikipedia.org/wiki/Cyclic_redundancy_check
--
nathan
Hi Nathan,
I don't think adding crc32c() would sufficiently increase the scope. We'd
use the existing implementations for both crc32() and crc32c(). And
besides, this could be useful for adding tests for that code.+ <function>crc32</function> ( <type>text</type> )
Do we need a version of the function that takes a text input? It's easy
enough to cast to a bytea.+ <returnvalue>text</returnvalue>
My first reaction is that we should just have this return bytea like the
SHA ones do, if for no other reason than commit 10cfce3 seems intended to
move us away from returning text for these kinds of functions. Upthread,
you mentioned the possibility of returning a bigint, too. I think I'd
still prefer bytea in case we want to add, say, crc64() or crc16() in the
future. That would allow us to keep all of these functions consistent
instead of returning different types for each. However, I understand that
returning the numeric types might be more convenient. I'm curious what
others think about this.+ Computes the CRC32 <link linkend="functions-hash-note">hash</link> of + the binary string, with the result written in hexadecimal.I'm not sure we should call the check values "hashes." Wikipedia does
include them in the "List of hash functions" page [0], but it seems to
deliberately avoid calling them hashes in the CRC page [1]. I'd suggest
calling them "CRC32 values" instead.
Thanks for the code review. Here is the updated patch.
--
Best regards,
Aleksander Alekseev
Attachments:
v2-0001-Add-crc32-bytea-crc32c-bytea.patchapplication/octet-stream; name=v2-0001-Add-crc32-bytea-crc32c-bytea.patchDownload
From ef0e928dbc408f96cc0d23c32095fb102222562e Mon Sep 17 00:00:00 2001
From: Aleksander Alekseev <aleksander@timescale.com>
Date: Thu, 18 Jul 2024 12:59:40 +0300
Subject: [PATCH v2] Add crc32(bytea) & crc32c(bytea)
Per user requests.
Aleksander Alekseev, reviewed by Nathan Bossart
Discussion: https://postgr.es/m/CAJ7c6TNMTGnqnG=yXXUQh9E88JDckmR45H2Q+=ucaCLMOW1QQw@mail.gmail.com
---
doc/src/sgml/func.sgml | 34 ++++++++
src/backend/utils/adt/Makefile | 1 +
src/backend/utils/adt/hashfuncs.c | 100 +++++++++++++++++++++++
src/backend/utils/adt/meson.build | 1 +
src/include/catalog/pg_proc.dat | 8 ++
src/test/regress/expected/opr_sanity.out | 2 +
src/test/regress/expected/strings.out | 27 ++++++
src/test/regress/sql/strings.sql | 9 ++
8 files changed, 182 insertions(+)
create mode 100644 src/backend/utils/adt/hashfuncs.c
diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index b39f97dc8d..a327674f65 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -4490,6 +4490,40 @@ SELECT format('Testing %3$s, %2$s, %s', 'one', 'two', 'three');
</para></entry>
</row>
+ <row>
+ <entry role="func_table_entry"><para role="func_signature">
+ <indexterm>
+ <primary>crc32</primary>
+ </indexterm>
+ <function>crc32</function> ( <type>bytea</type> )
+ <returnvalue>bytea</returnvalue>
+ </para>
+ <para>
+ Computes the CRC32 value of the binary string.
+ </para>
+ <para>
+ <literal>crc32('PostgreSQL'::bytea)</literal>
+ <returnvalue>\xcb97b83b</returnvalue>
+ </para></entry>
+ </row>
+
+ <row>
+ <entry role="func_table_entry"><para role="func_signature">
+ <indexterm>
+ <primary>crc32c</primary>
+ </indexterm>
+ <function>crc32c</function> ( <type>bytea</type> )
+ <returnvalue>bytea</returnvalue>
+ </para>
+ <para>
+ Computes the CRC32C value of the binary string.
+ </para>
+ <para>
+ <literal>crc32c('PostgreSQL'::bytea)</literal>
+ <returnvalue>\xa7c16abd</returnvalue>
+ </para></entry>
+ </row>
+
<row>
<entry role="func_table_entry"><para role="func_signature">
<indexterm>
diff --git a/src/backend/utils/adt/Makefile b/src/backend/utils/adt/Makefile
index edb09d4e35..be84d68856 100644
--- a/src/backend/utils/adt/Makefile
+++ b/src/backend/utils/adt/Makefile
@@ -42,6 +42,7 @@ OBJS = \
geo_ops.o \
geo_selfuncs.o \
geo_spgist.o \
+ hashfuncs.o \
hbafuncs.o \
inet_cidr_ntop.o \
inet_net_pton.o \
diff --git a/src/backend/utils/adt/hashfuncs.c b/src/backend/utils/adt/hashfuncs.c
new file mode 100644
index 0000000000..7554d04ca4
--- /dev/null
+++ b/src/backend/utils/adt/hashfuncs.c
@@ -0,0 +1,100 @@
+/*-------------------------------------------------------------------------
+ *
+ * hashfuncs.c
+ * Non-cryptographic hash functions
+ *
+ * Portions Copyright (c) 2024, PostgreSQL Global Development Group
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/utils/adt/hashfuncs.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "port/pg_crc32c.h"
+#include "utils/builtins.h"
+#include "utils/pg_crc.h"
+#include "varatt.h"
+
+/*
+ * Calculate CRC32 of the given data.
+ */
+static inline pg_crc32
+crc32_sz(const char *buf, int size)
+{
+ pg_crc32 crc;
+ const char *p = buf;
+
+ INIT_TRADITIONAL_CRC32(crc);
+ while (size > 0)
+ {
+ char c = (char) (*p);
+
+ COMP_TRADITIONAL_CRC32(crc, &c, 1);
+ size--;
+ p++;
+ }
+ FIN_TRADITIONAL_CRC32(crc);
+ return crc;
+}
+
+/*
+ * Convert crc value to bytea.
+ *
+ * Common code for crc32_bytea() and crc32c_bytea().
+ */
+static inline bytea *
+crc_to_bytea(uint32 crc)
+{
+ bytea *result;
+
+ result = palloc0(sizeof(uint32) + VARHDRSZ);
+ *(uint32 *) VARDATA(result) = pg_hton32(crc);
+ SET_VARSIZE(result, sizeof(uint32) + VARHDRSZ);
+
+ return result;
+}
+
+/*
+ * Calculate CRC32 of a bytea value and return it as bytea.
+ */
+Datum
+crc32_bytea(PG_FUNCTION_ARGS)
+{
+ bytea *in = PG_GETARG_BYTEA_PP(0);
+ size_t len;
+ pg_crc32 crc;
+
+ /* calculate the length of the buffer using varlena metadata */
+ len = VARSIZE_ANY_EXHDR(in);
+
+ /* get the crc value */
+ crc = crc32_sz(VARDATA_ANY(in), len);
+
+ /* return crc value as bytea */
+ PG_RETURN_BYTEA_P(crc_to_bytea((uint32) crc));
+}
+
+/*
+ * Calculate CRC32C of a bytea value and return it as bytea.
+ */
+Datum
+crc32c_bytea(PG_FUNCTION_ARGS)
+{
+ bytea *in = PG_GETARG_BYTEA_PP(0);
+ size_t len;
+ pg_crc32c crc;
+
+ /* calculate the length of the buffer using varlena metadata */
+ len = VARSIZE_ANY_EXHDR(in);
+
+ /* get the crc value */
+ INIT_CRC32C(crc);
+ COMP_CRC32C(crc, VARDATA_ANY(in), len);
+ FIN_CRC32C(crc);
+
+ /* return crc value as bytea */
+ PG_RETURN_BYTEA_P(crc_to_bytea((uint32) crc));
+}
diff --git a/src/backend/utils/adt/meson.build b/src/backend/utils/adt/meson.build
index 8c6fc80c37..6239ea7b42 100644
--- a/src/backend/utils/adt/meson.build
+++ b/src/backend/utils/adt/meson.build
@@ -31,6 +31,7 @@ backend_sources += files(
'geo_ops.c',
'geo_selfuncs.c',
'geo_spgist.c',
+ 'hashfuncs.c',
'hbafuncs.c',
'inet_cidr_ntop.c',
'inet_net_pton.c',
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index 06b2f4ba66..1349a696ce 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -7731,6 +7731,14 @@
proname => 'system', provolatile => 'v', prorettype => 'tsm_handler',
proargtypes => 'internal', prosrc => 'tsm_system_handler' },
+# non-cryptographic
+{ oid => '8571', descr => 'CRC32 value',
+ proname => 'crc32', proleakproof => 't', prorettype => 'bytea',
+ proargtypes => 'bytea', prosrc => 'crc32_bytea' },
+{ oid => '8572', descr => 'CRC32C value',
+ proname => 'crc32c', proleakproof => 't', prorettype => 'bytea',
+ proargtypes => 'bytea', prosrc => 'crc32c_bytea' },
+
# cryptographic
{ oid => '2311', descr => 'MD5 hash',
proname => 'md5', proleakproof => 't', prorettype => 'text',
diff --git a/src/test/regress/expected/opr_sanity.out b/src/test/regress/expected/opr_sanity.out
index 9d047b21b8..0d734169f1 100644
--- a/src/test/regress/expected/opr_sanity.out
+++ b/src/test/regress/expected/opr_sanity.out
@@ -874,6 +874,8 @@ xid8ne(xid8,xid8)
xid8cmp(xid8,xid8)
uuid_extract_timestamp(uuid)
uuid_extract_version(uuid)
+crc32(bytea)
+crc32c(bytea)
-- restore normal output mode
\a\t
-- List of functions used by libpq's fe-lobj.c
diff --git a/src/test/regress/expected/strings.out b/src/test/regress/expected/strings.out
index 52b69a107f..074b0e11e2 100644
--- a/src/test/regress/expected/strings.out
+++ b/src/test/regress/expected/strings.out
@@ -2255,6 +2255,33 @@ SELECT sha512('The quick brown fox jumps over the lazy dog.');
\x91ea1245f20d46ae9a037a989f54f1f790f0a47607eeb8a14d12890cea77a1bbc6c7ed9cf205e67b7f2b8fd4c7dfd3a7a8617e45f3c463d481c7e586c39ac1ed
(1 row)
+--
+-- CRC32
+--
+SELECT crc32(''::bytea);
+ crc32
+------------
+ \x00000000
+(1 row)
+
+SELECT crc32('The quick brown fox jumps over the lazy dog.'::bytea);
+ crc32
+------------
+ \x519025e9
+(1 row)
+
+SELECT crc32c(''::bytea);
+ crc32c
+------------
+ \x00000000
+(1 row)
+
+SELECT crc32c('The quick brown fox jumps over the lazy dog.'::bytea);
+ crc32c
+------------
+ \x190097b3
+(1 row)
+
--
-- encode/decode
--
diff --git a/src/test/regress/sql/strings.sql b/src/test/regress/sql/strings.sql
index 3959678992..f8cd621d9a 100644
--- a/src/test/regress/sql/strings.sql
+++ b/src/test/regress/sql/strings.sql
@@ -719,6 +719,15 @@ SELECT sha384('The quick brown fox jumps over the lazy dog.');
SELECT sha512('');
SELECT sha512('The quick brown fox jumps over the lazy dog.');
+--
+-- CRC32
+--
+SELECT crc32(''::bytea);
+SELECT crc32('The quick brown fox jumps over the lazy dog.'::bytea);
+
+SELECT crc32c(''::bytea);
+SELECT crc32c('The quick brown fox jumps over the lazy dog.'::bytea);
+
--
-- encode/decode
--
--
2.45.2
+/*
+ * Calculate CRC32 of the given data.
+ */
+static inline pg_crc32
+crc32_sz(const char *buf, int size)
+{
+ pg_crc32 crc;
+ const char *p = buf;
+
+ INIT_TRADITIONAL_CRC32(crc);
+ while (size > 0)
+ {
+ char c = (char) (*p);
+
+ COMP_TRADITIONAL_CRC32(crc, &c, 1);
+ size--;
+ p++;
+ }
+ FIN_TRADITIONAL_CRC32(crc);
+ return crc;
+}
I'm curious why we need to do this instead of only using the macros:
INIT_TRADITIONAL_CRC32(crc);
COMP_TRADITIONAL_CRC32(crc, VARDATA_ANY(in), len);
FIN_TRADITIONAL_CRC32(crc);
+ * IDENTIFICATION
+ * src/backend/utils/adt/hashfuncs.c
Perhaps these would fit into src/backend/utils/hash/pg_crc.c?
--
nathan
Hi,
I'm curious why we need to do this instead of only using the macros:
INIT_TRADITIONAL_CRC32(crc);
COMP_TRADITIONAL_CRC32(crc, VARDATA_ANY(in), len);
FIN_TRADITIONAL_CRC32(crc);+ * IDENTIFICATION
+ * src/backend/utils/adt/hashfuncs.cPerhaps these would fit into src/backend/utils/hash/pg_crc.c?
Thanks, PFA patch v3.
--
Best regards,
Aleksander Alekseev
Attachments:
v3-0001-Add-crc32-bytea-crc32c-bytea.patchapplication/octet-stream; name=v3-0001-Add-crc32-bytea-crc32c-bytea.patchDownload
From 1211f944ec2e32db2e078ee40e713ec3e85f6c91 Mon Sep 17 00:00:00 2001
From: Aleksander Alekseev <aleksander@timescale.com>
Date: Thu, 18 Jul 2024 12:59:40 +0300
Subject: [PATCH v3] Add crc32(bytea) & crc32c(bytea)
Per user requests.
Aleksander Alekseev, reviewed by Nathan Bossart
Discussion: https://postgr.es/m/CAJ7c6TNMTGnqnG=yXXUQh9E88JDckmR45H2Q+=ucaCLMOW1QQw@mail.gmail.com
---
doc/src/sgml/func.sgml | 34 +++++++++++++
src/backend/utils/hash/pg_crc.c | 65 ++++++++++++++++++++++++
src/include/catalog/pg_proc.dat | 8 +++
src/test/regress/expected/opr_sanity.out | 2 +
src/test/regress/expected/strings.out | 27 ++++++++++
src/test/regress/sql/strings.sql | 9 ++++
6 files changed, 145 insertions(+)
diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index 0f7154b76a..5bef270a9b 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -4490,6 +4490,40 @@ SELECT format('Testing %3$s, %2$s, %s', 'one', 'two', 'three');
</para></entry>
</row>
+ <row>
+ <entry role="func_table_entry"><para role="func_signature">
+ <indexterm>
+ <primary>crc32</primary>
+ </indexterm>
+ <function>crc32</function> ( <type>bytea</type> )
+ <returnvalue>bytea</returnvalue>
+ </para>
+ <para>
+ Computes the CRC32 value of the binary string.
+ </para>
+ <para>
+ <literal>crc32('PostgreSQL'::bytea)</literal>
+ <returnvalue>\xcb97b83b</returnvalue>
+ </para></entry>
+ </row>
+
+ <row>
+ <entry role="func_table_entry"><para role="func_signature">
+ <indexterm>
+ <primary>crc32c</primary>
+ </indexterm>
+ <function>crc32c</function> ( <type>bytea</type> )
+ <returnvalue>bytea</returnvalue>
+ </para>
+ <para>
+ Computes the CRC32C value of the binary string.
+ </para>
+ <para>
+ <literal>crc32c('PostgreSQL'::bytea)</literal>
+ <returnvalue>\xa7c16abd</returnvalue>
+ </para></entry>
+ </row>
+
<row>
<entry role="func_table_entry"><para role="func_signature">
<indexterm>
diff --git a/src/backend/utils/hash/pg_crc.c b/src/backend/utils/hash/pg_crc.c
index 3595938dc4..b6af91399c 100644
--- a/src/backend/utils/hash/pg_crc.c
+++ b/src/backend/utils/hash/pg_crc.c
@@ -18,8 +18,12 @@
*/
#include "c.h"
+#include "postgres.h"
+#include "port/pg_crc32c.h"
+#include "utils/builtins.h"
#include "utils/pg_crc.h"
+#include "varatt.h"
/*
* Lookup table for calculating CRC-32 using Sarwate's algorithm.
@@ -95,3 +99,64 @@ const uint32 pg_crc32_table[256] = {
0xB3667A2E, 0xC4614AB8, 0x5D681B02, 0x2A6F2B94,
0xB40BBE37, 0xC30C8EA1, 0x5A05DF1B, 0x2D02EF8D
};
+
+/*
+ * Convert crc value to bytea.
+ *
+ * Common code for crc32_bytea() and crc32c_bytea().
+ */
+static inline bytea *
+crc_to_bytea(uint32 crc)
+{
+ bytea *result;
+
+ result = palloc0(sizeof(uint32) + VARHDRSZ);
+ *(uint32 *) VARDATA(result) = pg_hton32(crc);
+ SET_VARSIZE(result, sizeof(uint32) + VARHDRSZ);
+
+ return result;
+}
+
+/*
+ * Calculate CRC32 of a bytea value and return it as bytea.
+ */
+Datum
+crc32_bytea(PG_FUNCTION_ARGS)
+{
+ bytea *in = PG_GETARG_BYTEA_PP(0);
+ size_t len;
+ pg_crc32 crc;
+
+ /* calculate the length of the buffer using varlena metadata */
+ len = VARSIZE_ANY_EXHDR(in);
+
+ /* get the crc value */
+ INIT_TRADITIONAL_CRC32(crc);
+ COMP_TRADITIONAL_CRC32(crc, VARDATA_ANY(in), len);
+ FIN_TRADITIONAL_CRC32(crc);
+
+ /* return crc value as bytea */
+ PG_RETURN_BYTEA_P(crc_to_bytea((uint32) crc));
+}
+
+/*
+ * Calculate CRC32C of a bytea value and return it as bytea.
+ */
+Datum
+crc32c_bytea(PG_FUNCTION_ARGS)
+{
+ bytea *in = PG_GETARG_BYTEA_PP(0);
+ size_t len;
+ pg_crc32c crc;
+
+ /* calculate the length of the buffer using varlena metadata */
+ len = VARSIZE_ANY_EXHDR(in);
+
+ /* get the crc value */
+ INIT_CRC32C(crc);
+ COMP_CRC32C(crc, VARDATA_ANY(in), len);
+ FIN_CRC32C(crc);
+
+ /* return crc value as bytea */
+ PG_RETURN_BYTEA_P(crc_to_bytea((uint32) crc));
+}
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index d36f6001bb..a2971d1551 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -7743,6 +7743,14 @@
proname => 'system', provolatile => 'v', prorettype => 'tsm_handler',
proargtypes => 'internal', prosrc => 'tsm_system_handler' },
+# non-cryptographic
+{ oid => '8571', descr => 'CRC32 value',
+ proname => 'crc32', proleakproof => 't', prorettype => 'bytea',
+ proargtypes => 'bytea', prosrc => 'crc32_bytea' },
+{ oid => '8572', descr => 'CRC32C value',
+ proname => 'crc32c', proleakproof => 't', prorettype => 'bytea',
+ proargtypes => 'bytea', prosrc => 'crc32c_bytea' },
+
# cryptographic
{ oid => '2311', descr => 'MD5 hash',
proname => 'md5', proleakproof => 't', prorettype => 'text',
diff --git a/src/test/regress/expected/opr_sanity.out b/src/test/regress/expected/opr_sanity.out
index 9d047b21b8..0d734169f1 100644
--- a/src/test/regress/expected/opr_sanity.out
+++ b/src/test/regress/expected/opr_sanity.out
@@ -874,6 +874,8 @@ xid8ne(xid8,xid8)
xid8cmp(xid8,xid8)
uuid_extract_timestamp(uuid)
uuid_extract_version(uuid)
+crc32(bytea)
+crc32c(bytea)
-- restore normal output mode
\a\t
-- List of functions used by libpq's fe-lobj.c
diff --git a/src/test/regress/expected/strings.out b/src/test/regress/expected/strings.out
index 52b69a107f..074b0e11e2 100644
--- a/src/test/regress/expected/strings.out
+++ b/src/test/regress/expected/strings.out
@@ -2255,6 +2255,33 @@ SELECT sha512('The quick brown fox jumps over the lazy dog.');
\x91ea1245f20d46ae9a037a989f54f1f790f0a47607eeb8a14d12890cea77a1bbc6c7ed9cf205e67b7f2b8fd4c7dfd3a7a8617e45f3c463d481c7e586c39ac1ed
(1 row)
+--
+-- CRC32
+--
+SELECT crc32(''::bytea);
+ crc32
+------------
+ \x00000000
+(1 row)
+
+SELECT crc32('The quick brown fox jumps over the lazy dog.'::bytea);
+ crc32
+------------
+ \x519025e9
+(1 row)
+
+SELECT crc32c(''::bytea);
+ crc32c
+------------
+ \x00000000
+(1 row)
+
+SELECT crc32c('The quick brown fox jumps over the lazy dog.'::bytea);
+ crc32c
+------------
+ \x190097b3
+(1 row)
+
--
-- encode/decode
--
diff --git a/src/test/regress/sql/strings.sql b/src/test/regress/sql/strings.sql
index 3959678992..f8cd621d9a 100644
--- a/src/test/regress/sql/strings.sql
+++ b/src/test/regress/sql/strings.sql
@@ -719,6 +719,15 @@ SELECT sha384('The quick brown fox jumps over the lazy dog.');
SELECT sha512('');
SELECT sha512('The quick brown fox jumps over the lazy dog.');
+--
+-- CRC32
+--
+SELECT crc32(''::bytea);
+SELECT crc32('The quick brown fox jumps over the lazy dog.'::bytea);
+
+SELECT crc32c(''::bytea);
+SELECT crc32c('The quick brown fox jumps over the lazy dog.'::bytea);
+
--
-- encode/decode
--
--
2.45.2
On Mon, Aug 05, 2024 at 04:19:45PM +0300, Aleksander Alekseev wrote:
Thanks, PFA patch v3.
This looks pretty good to me. The only point that I think deserves more
discussion is the return type. Does bytea make the most sense here? Or
should we consider int/bigint?
--
nathan
Hi,
This looks pretty good to me. The only point that I think deserves more
discussion is the return type. Does bytea make the most sense here? Or
should we consider int/bigint?
Personally I would choose BYTEA in order to be consistent with sha*() functions.
It can be casted to TEXT if user wants a result similar to the one
md5() returns:
```
SELECT encode(crc32('PostgreSQL'), 'hex');
```
... and somewhat less convenient to BIGINT:
```
SELECT ((get_byte(crc, 0) :: bigint << 24) | (get_byte(crc, 1) << 16)
| (get_byte(crc, 2) << 8) | get_byte(crc, 3))
FROM (SELECT crc32('PostgreSQL') AS crc);
```
I don't like the `integer` option because crc32 value is typically
considered as an unsigned one and `integer` is not large enough to
represent uint32.
Perhaps we need get_int4() / get_int8() / get_numeric() as there seems
to be a demand [1]https://stackoverflow.com/questions/32944267/postgresql-converting-bytea-to-bigint[2]/messages/by-id/AANLkTikip9xs8iXc8e+Mgz1T1701i8Xk6QtbVB3KJQzX@mail.gmail.com and it will allow us to easily cast a `bytea`
value to `integer` or `bigint`. This is probably another topic though.
[1]: https://stackoverflow.com/questions/32944267/postgresql-converting-bytea-to-bigint
[2]: /messages/by-id/AANLkTikip9xs8iXc8e+Mgz1T1701i8Xk6QtbVB3KJQzX@mail.gmail.com
--
Best regards,
Aleksander Alekseev
On Tue, Aug 06, 2024 at 11:04:41AM +0300, Aleksander Alekseev wrote:
Perhaps we need get_int4() / get_int8() / get_numeric() as there seems
to be a demand [1][2] and it will allow us to easily cast a `bytea`
value to `integer` or `bigint`. This is probably another topic though.
Yeah, I was surprised to learn there wasn't yet an easy way to do this.
I'm not sure how much of a factor this should play in deciding the return
value for the CRC functions, but IMHO it's a reason to reconsider returning
text as you originally proposed.
--
nathan
Hi,
Yeah, I was surprised to learn there wasn't yet an easy way to do this.
I'm not sure how much of a factor this should play in deciding the return
value for the CRC functions, but IMHO it's a reason to reconsider returning
text as you originally proposed.
OK, here is the corrected patch v4.
--
Best regards,
Aleksander Alekseev
Attachments:
v4-0001-Add-crc32-bytea-crc32c-bytea.patchapplication/octet-stream; name=v4-0001-Add-crc32-bytea-crc32c-bytea.patchDownload
From 250279b86de5e56d02afba2f004717d42790cc6b Mon Sep 17 00:00:00 2001
From: Aleksander Alekseev <aleksander@timescale.com>
Date: Thu, 18 Jul 2024 12:59:40 +0300
Subject: [PATCH v4] Add crc32(bytea) & crc32c(bytea)
Per user requests.
Aleksander Alekseev, reviewed by Nathan Bossart
Discussion: https://postgr.es/m/CAJ7c6TNMTGnqnG=yXXUQh9E88JDckmR45H2Q+=ucaCLMOW1QQw@mail.gmail.com
---
doc/src/sgml/func.sgml | 34 ++++++++++++++
src/backend/utils/hash/pg_crc.c | 56 ++++++++++++++++++++++++
src/include/catalog/pg_proc.dat | 8 ++++
src/test/regress/expected/opr_sanity.out | 2 +
src/test/regress/expected/strings.out | 27 ++++++++++++
src/test/regress/sql/strings.sql | 9 ++++
6 files changed, 136 insertions(+)
diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index 0f7154b76a..a0541d897f 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -4490,6 +4490,40 @@ SELECT format('Testing %3$s, %2$s, %s', 'one', 'two', 'three');
</para></entry>
</row>
+ <row>
+ <entry role="func_table_entry"><para role="func_signature">
+ <indexterm>
+ <primary>crc32</primary>
+ </indexterm>
+ <function>crc32</function> ( <type>bytea</type> )
+ <returnvalue>text</returnvalue>
+ </para>
+ <para>
+ Computes the CRC32 value of the binary string.
+ </para>
+ <para>
+ <literal>crc32('PostgreSQL'::bytea)</literal>
+ <returnvalue>cb97b83b</returnvalue>
+ </para></entry>
+ </row>
+
+ <row>
+ <entry role="func_table_entry"><para role="func_signature">
+ <indexterm>
+ <primary>crc32c</primary>
+ </indexterm>
+ <function>crc32c</function> ( <type>bytea</type> )
+ <returnvalue>text</returnvalue>
+ </para>
+ <para>
+ Computes the CRC32C value of the binary string.
+ </para>
+ <para>
+ <literal>crc32c('PostgreSQL'::bytea)</literal>
+ <returnvalue>a7c16abd</returnvalue>
+ </para></entry>
+ </row>
+
<row>
<entry role="func_table_entry"><para role="func_signature">
<indexterm>
diff --git a/src/backend/utils/hash/pg_crc.c b/src/backend/utils/hash/pg_crc.c
index 3595938dc4..6965a7f8d5 100644
--- a/src/backend/utils/hash/pg_crc.c
+++ b/src/backend/utils/hash/pg_crc.c
@@ -18,8 +18,12 @@
*/
#include "c.h"
+#include "postgres.h"
+#include "port/pg_crc32c.h"
+#include "utils/builtins.h"
#include "utils/pg_crc.h"
+#include "varatt.h"
/*
* Lookup table for calculating CRC-32 using Sarwate's algorithm.
@@ -95,3 +99,55 @@ const uint32 pg_crc32_table[256] = {
0xB3667A2E, 0xC4614AB8, 0x5D681B02, 0x2A6F2B94,
0xB40BBE37, 0xC30C8EA1, 0x5A05DF1B, 0x2D02EF8D
};
+
+/*
+ * Calculate CRC32 of a bytea value and return it as text.
+ */
+Datum
+crc32_bytea(PG_FUNCTION_ARGS)
+{
+ bytea *in = PG_GETARG_BYTEA_PP(0);
+ size_t len;
+ pg_crc32 crc;
+ char result[16];
+
+ /* calculate the length of the buffer using varlena metadata */
+ len = VARSIZE_ANY_EXHDR(in);
+
+ /* get the crc value */
+ INIT_TRADITIONAL_CRC32(crc);
+ COMP_TRADITIONAL_CRC32(crc, VARDATA_ANY(in), len);
+ FIN_TRADITIONAL_CRC32(crc);
+
+ /* format the hex string */
+ snprintf(result, sizeof(result), "%08x", (uint32)crc);
+
+ /* convert to text and return it */
+ PG_RETURN_TEXT_P(cstring_to_text(result));
+}
+
+/*
+ * Calculate CRC32C of a bytea value and return it as text.
+ */
+Datum
+crc32c_bytea(PG_FUNCTION_ARGS)
+{
+ bytea *in = PG_GETARG_BYTEA_PP(0);
+ size_t len;
+ pg_crc32c crc;
+ char result[16];
+
+ /* calculate the length of the buffer using varlena metadata */
+ len = VARSIZE_ANY_EXHDR(in);
+
+ /* get the crc value */
+ INIT_CRC32C(crc);
+ COMP_CRC32C(crc, VARDATA_ANY(in), len);
+ FIN_CRC32C(crc);
+
+ /* format the hex string */
+ snprintf(result, sizeof(result), "%08x", (uint32)crc);
+
+ /* convert to text and return it */
+ PG_RETURN_TEXT_P(cstring_to_text(result));
+}
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index d36f6001bb..c31b1c72ec 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -7743,6 +7743,14 @@
proname => 'system', provolatile => 'v', prorettype => 'tsm_handler',
proargtypes => 'internal', prosrc => 'tsm_system_handler' },
+# non-cryptographic
+{ oid => '8571', descr => 'CRC32 value',
+ proname => 'crc32', proleakproof => 't', prorettype => 'text',
+ proargtypes => 'bytea', prosrc => 'crc32_bytea' },
+{ oid => '8572', descr => 'CRC32C value',
+ proname => 'crc32c', proleakproof => 't', prorettype => 'text',
+ proargtypes => 'bytea', prosrc => 'crc32c_bytea' },
+
# cryptographic
{ oid => '2311', descr => 'MD5 hash',
proname => 'md5', proleakproof => 't', prorettype => 'text',
diff --git a/src/test/regress/expected/opr_sanity.out b/src/test/regress/expected/opr_sanity.out
index 9d047b21b8..0d734169f1 100644
--- a/src/test/regress/expected/opr_sanity.out
+++ b/src/test/regress/expected/opr_sanity.out
@@ -874,6 +874,8 @@ xid8ne(xid8,xid8)
xid8cmp(xid8,xid8)
uuid_extract_timestamp(uuid)
uuid_extract_version(uuid)
+crc32(bytea)
+crc32c(bytea)
-- restore normal output mode
\a\t
-- List of functions used by libpq's fe-lobj.c
diff --git a/src/test/regress/expected/strings.out b/src/test/regress/expected/strings.out
index 52b69a107f..c795df3056 100644
--- a/src/test/regress/expected/strings.out
+++ b/src/test/regress/expected/strings.out
@@ -2255,6 +2255,33 @@ SELECT sha512('The quick brown fox jumps over the lazy dog.');
\x91ea1245f20d46ae9a037a989f54f1f790f0a47607eeb8a14d12890cea77a1bbc6c7ed9cf205e67b7f2b8fd4c7dfd3a7a8617e45f3c463d481c7e586c39ac1ed
(1 row)
+--
+-- CRC32
+--
+SELECT crc32(''::bytea);
+ crc32
+----------
+ 00000000
+(1 row)
+
+SELECT crc32('The quick brown fox jumps over the lazy dog.'::bytea);
+ crc32
+----------
+ 519025e9
+(1 row)
+
+SELECT crc32c(''::bytea);
+ crc32c
+----------
+ 00000000
+(1 row)
+
+SELECT crc32c('The quick brown fox jumps over the lazy dog.'::bytea);
+ crc32c
+----------
+ 190097b3
+(1 row)
+
--
-- encode/decode
--
diff --git a/src/test/regress/sql/strings.sql b/src/test/regress/sql/strings.sql
index 3959678992..f8cd621d9a 100644
--- a/src/test/regress/sql/strings.sql
+++ b/src/test/regress/sql/strings.sql
@@ -719,6 +719,15 @@ SELECT sha384('The quick brown fox jumps over the lazy dog.');
SELECT sha512('');
SELECT sha512('The quick brown fox jumps over the lazy dog.');
+--
+-- CRC32
+--
+SELECT crc32(''::bytea);
+SELECT crc32('The quick brown fox jumps over the lazy dog.'::bytea);
+
+SELECT crc32c(''::bytea);
+SELECT crc32c('The quick brown fox jumps over the lazy dog.'::bytea);
+
--
-- encode/decode
--
--
2.45.2
On 05.08.24 17:28, Nathan Bossart wrote:
This looks pretty good to me. The only point that I think deserves more
discussion is the return type. Does bytea make the most sense here? Or
should we consider int/bigint?
The correct return type of a CRC operation in general is some kind of
exact numerical type. Just pick the best one that fits the result. I
don't think bytea is appropriate.
On Thu, Aug 08, 2024 at 04:27:20PM +0200, Peter Eisentraut wrote:
On 05.08.24 17:28, Nathan Bossart wrote:
This looks pretty good to me. The only point that I think deserves more
discussion is the return type. Does bytea make the most sense here? Or
should we consider int/bigint?The correct return type of a CRC operation in general is some kind of exact
numerical type. Just pick the best one that fits the result. I don't think
bytea is appropriate.
That would leave us either "integer" or "bigint". "integer" is more
correct from a size perspective, but will result in negative values because
it is signed. "bigint" uses twice as many bytes but won't display any CRC
values as negative.
I guess we could also choose "numeric", which would set a more sustainable
precedent if we added functions for CRC-64...
--
nathan
Nathan Bossart <nathandbossart@gmail.com> writes:
On Thu, Aug 08, 2024 at 04:27:20PM +0200, Peter Eisentraut wrote:
The correct return type of a CRC operation in general is some kind of exact
numerical type. Just pick the best one that fits the result. I don't think
bytea is appropriate.
That would leave us either "integer" or "bigint". "integer" is more
correct from a size perspective, but will result in negative values because
it is signed. "bigint" uses twice as many bytes but won't display any CRC
values as negative.
bigint seems fine to me; we have used that in other places as a
substitute for uint32, eg block numbers in contrib/pageinspect.
regards, tom lane
On Thu, Aug 08, 2024 at 10:49:42AM -0400, Tom Lane wrote:
Nathan Bossart <nathandbossart@gmail.com> writes:
On Thu, Aug 08, 2024 at 04:27:20PM +0200, Peter Eisentraut wrote:
The correct return type of a CRC operation in general is some kind of exact
numerical type. Just pick the best one that fits the result. I don't think
bytea is appropriate.That would leave us either "integer" or "bigint". "integer" is more
correct from a size perspective, but will result in negative values because
it is signed. "bigint" uses twice as many bytes but won't display any CRC
values as negative.bigint seems fine to me; we have used that in other places as a
substitute for uint32, eg block numbers in contrib/pageinspect.
WFM. Here is what I have staged for commit.
--
nathan
Attachments:
v5-0001-Add-user-callable-CRC-functions.patchtext/plain; charset=us-asciiDownload
From 9706e7f8fc373d8f0b4d1474c8adaa3a7230ce4e Mon Sep 17 00:00:00 2001
From: Nathan Bossart <nathan@postgresql.org>
Date: Thu, 8 Aug 2024 10:43:29 -0500
Subject: [PATCH v5 1/1] Add user-callable CRC functions.
We've had code for CRC-32 and CRC-32C for some time (for WAL
records, etc.), but there was no way for users to call it, despite
apparent popular demand. The new crc32() and crc32c() functions
both accept bytea and return bigint (to avoid returning negative
values).
XXX: NEEDS CATVERSION BUMP
Author: Aleksander Alekseev
Reviewed-by: Peter Eisentraut, Tom Lane
Discussion: https://postgr.es/m/CAJ7c6TNMTGnqnG%3DyXXUQh9E88JDckmR45H2Q%2B%3DucaCLMOW1QQw%40mail.gmail.com
---
doc/src/sgml/func.sgml | 34 +++++++++++++++++++++++
src/backend/utils/hash/pg_crc.c | 35 +++++++++++++++++++++++-
src/include/catalog/pg_proc.dat | 8 ++++++
src/test/regress/expected/opr_sanity.out | 2 ++
src/test/regress/expected/strings.out | 27 ++++++++++++++++++
src/test/regress/sql/strings.sql | 9 ++++++
6 files changed, 114 insertions(+), 1 deletion(-)
diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index 0f7154b76a..2a87668f4c 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -4490,6 +4490,40 @@ SELECT format('Testing %3$s, %2$s, %s', 'one', 'two', 'three');
</para></entry>
</row>
+ <row>
+ <entry role="func_table_entry"><para role="func_signature">
+ <indexterm>
+ <primary>crc32</primary>
+ </indexterm>
+ <function>crc32</function> ( <type>bytea</type> )
+ <returnvalue>bigint</returnvalue>
+ </para>
+ <para>
+ Computes the CRC-32 value of the binary string.
+ </para>
+ <para>
+ <literal>crc32('abc'::bytea)</literal>
+ <returnvalue>891568578</returnvalue>
+ </para></entry>
+ </row>
+
+ <row>
+ <entry role="func_table_entry"><para role="func_signature">
+ <indexterm>
+ <primary>crc32c</primary>
+ </indexterm>
+ <function>crc32c</function> ( <type>bytea</type> )
+ <returnvalue>bigint</returnvalue>
+ </para>
+ <para>
+ Computes the CRC-32C value of the binary string.
+ </para>
+ <para>
+ <literal>crc32c('abc'::bytea)</literal>
+ <returnvalue>910901175</returnvalue>
+ </para></entry>
+ </row>
+
<row>
<entry role="func_table_entry"><para role="func_signature">
<indexterm>
diff --git a/src/backend/utils/hash/pg_crc.c b/src/backend/utils/hash/pg_crc.c
index 3595938dc4..e4a976a080 100644
--- a/src/backend/utils/hash/pg_crc.c
+++ b/src/backend/utils/hash/pg_crc.c
@@ -17,9 +17,12 @@
*-------------------------------------------------------------------------
*/
-#include "c.h"
+#include "postgres.h"
+#include "port/pg_crc32c.h"
+#include "utils/builtins.h"
#include "utils/pg_crc.h"
+#include "varatt.h"
/*
* Lookup table for calculating CRC-32 using Sarwate's algorithm.
@@ -95,3 +98,33 @@ const uint32 pg_crc32_table[256] = {
0xB3667A2E, 0xC4614AB8, 0x5D681B02, 0x2A6F2B94,
0xB40BBE37, 0xC30C8EA1, 0x5A05DF1B, 0x2D02EF8D
};
+
+/*
+ * CRC variants
+ */
+
+Datum
+crc32_bytea(PG_FUNCTION_ARGS)
+{
+ bytea *in = PG_GETARG_BYTEA_PP(0);
+ pg_crc32 crc;
+
+ INIT_TRADITIONAL_CRC32(crc);
+ COMP_TRADITIONAL_CRC32(crc, VARDATA_ANY(in), VARSIZE_ANY_EXHDR(in));
+ FIN_TRADITIONAL_CRC32(crc);
+
+ PG_RETURN_INT64(crc);
+}
+
+Datum
+crc32c_bytea(PG_FUNCTION_ARGS)
+{
+ bytea *in = PG_GETARG_BYTEA_PP(0);
+ pg_crc32c crc;
+
+ INIT_CRC32C(crc);
+ COMP_CRC32C(crc, VARDATA_ANY(in), VARSIZE_ANY_EXHDR(in));
+ FIN_CRC32C(crc);
+
+ PG_RETURN_INT64(crc);
+}
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index d36f6001bb..ab29b581b0 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -7743,6 +7743,14 @@
proname => 'system', provolatile => 'v', prorettype => 'tsm_handler',
proargtypes => 'internal', prosrc => 'tsm_system_handler' },
+# CRC variants
+{ oid => '8571', descr => 'CRC-32 value',
+ proname => 'crc32', proleakproof => 't', prorettype => 'int8',
+ proargtypes => 'bytea', prosrc => 'crc32_bytea' },
+{ oid => '8572', descr => 'CRC-32C value',
+ proname => 'crc32c', proleakproof => 't', prorettype => 'int8',
+ proargtypes => 'bytea', prosrc => 'crc32c_bytea' },
+
# cryptographic
{ oid => '2311', descr => 'MD5 hash',
proname => 'md5', proleakproof => 't', prorettype => 'text',
diff --git a/src/test/regress/expected/opr_sanity.out b/src/test/regress/expected/opr_sanity.out
index 9d047b21b8..0d734169f1 100644
--- a/src/test/regress/expected/opr_sanity.out
+++ b/src/test/regress/expected/opr_sanity.out
@@ -874,6 +874,8 @@ xid8ne(xid8,xid8)
xid8cmp(xid8,xid8)
uuid_extract_timestamp(uuid)
uuid_extract_version(uuid)
+crc32(bytea)
+crc32c(bytea)
-- restore normal output mode
\a\t
-- List of functions used by libpq's fe-lobj.c
diff --git a/src/test/regress/expected/strings.out b/src/test/regress/expected/strings.out
index 52b69a107f..b65bb2d536 100644
--- a/src/test/regress/expected/strings.out
+++ b/src/test/regress/expected/strings.out
@@ -2255,6 +2255,33 @@ SELECT sha512('The quick brown fox jumps over the lazy dog.');
\x91ea1245f20d46ae9a037a989f54f1f790f0a47607eeb8a14d12890cea77a1bbc6c7ed9cf205e67b7f2b8fd4c7dfd3a7a8617e45f3c463d481c7e586c39ac1ed
(1 row)
+--
+-- CRC
+--
+SELECT crc32('');
+ crc32
+-------
+ 0
+(1 row)
+
+SELECT crc32('The quick brown fox jumps over the lazy dog.');
+ crc32
+------------
+ 1368401385
+(1 row)
+
+SELECT crc32c('');
+ crc32c
+--------
+ 0
+(1 row)
+
+SELECT crc32c('The quick brown fox jumps over the lazy dog.');
+ crc32c
+-----------
+ 419469235
+(1 row)
+
--
-- encode/decode
--
diff --git a/src/test/regress/sql/strings.sql b/src/test/regress/sql/strings.sql
index 3959678992..8e0f3a0e75 100644
--- a/src/test/regress/sql/strings.sql
+++ b/src/test/regress/sql/strings.sql
@@ -719,6 +719,15 @@ SELECT sha384('The quick brown fox jumps over the lazy dog.');
SELECT sha512('');
SELECT sha512('The quick brown fox jumps over the lazy dog.');
+--
+-- CRC
+--
+SELECT crc32('');
+SELECT crc32('The quick brown fox jumps over the lazy dog.');
+
+SELECT crc32c('');
+SELECT crc32c('The quick brown fox jumps over the lazy dog.');
+
--
-- encode/decode
--
--
2.39.3 (Apple Git-146)
Hi,
WFM. Here is what I have staged for commit.
Patch v5 LGTM.
--
Best regards,
Aleksander Alekseev