popcount
Hi,
Per request, I'd like to see about surfacing $Subject to SQL.
Best,
David.
--
David Fetter <david(at)fetter(dot)org> http://fetter.org/
Phone: +1 415 235 3778
Remember to vote!
Consider donating to Postgres: http://www.postgresql.org/about/donate
Attachments:
v1-0001-popcount.patchtext/x-diff; charset=us-asciiDownload
From 28c9b7acad605197a8a242ff929bcce6f3100c91 Mon Sep 17 00:00:00 2001
From: David Fetter <david@fetter.org>
Date: Wed, 30 Dec 2020 02:51:46 -0800
Subject: [PATCH v1] popcount
To: hackers
MIME-Version: 1.0
Content-Type: multipart/mixed; boundary="------------2.29.2"
This is a multi-part message in MIME format.
--------------2.29.2
Content-Type: text/plain; charset=UTF-8; format=fixed
Content-Transfer-Encoding: 8bit
Now it's accessible to SQL for the BIT VARYING and BYTEA types.
diff --git doc/src/sgml/func.sgml doc/src/sgml/func.sgml
index 5021ac1ca9..439a3a4a06 100644
--- doc/src/sgml/func.sgml
+++ doc/src/sgml/func.sgml
@@ -4069,6 +4069,23 @@ SELECT format('Testing %3$s, %2$s, %s', 'one', 'two', 'three');
</para></entry>
</row>
+ <row>
+ <entry role="func_table_entry"><para role="func_signature">
+ <indexterm>
+ <primary>popcount</primary>
+ </indexterm>
+ <function>popcount</function> ( <parameter>bytes</parameter> <type>bytea</type> )
+ <returnvalue>integer</returnvalue>
+ </para>
+ <para>
+ Counts the number of bits set in a binary string.
+ </para>
+ <para>
+ <literal>popcount('\xdeadbeef'::bytea)</literal>
+ <returnvalue>24</returnvalue>
+ </para></entry>
+ </row>
+
<row>
<entry role="func_table_entry"><para role="func_signature">
<indexterm>
@@ -4830,6 +4847,24 @@ SELECT format('Testing %3$s, %2$s, %s', 'one', 'two', 'three');
<returnvalue>101010001010101010</returnvalue>
</para></entry>
</row>
+
+ <row>
+ <entry role="func_table_entry"><para role="func_signature">
+ <indexterm>
+ <primary>popcount</primary>
+ </indexterm>
+ <function>popcount</function> ( <parameter>bits</parameter> <type>bit</type> )
+ <returnvalue>int4</returnvalue>
+ </para>
+ <para>
+ Counts the bits set in a bit string.
+ </para>
+ <para>
+ <literal>popcount(B'101010101010101010')</literal>
+ <returnvalue>9</returnvalue>
+ </para></entry>
+ </row>
+
</tbody>
</tgroup>
</table>
diff --git src/include/catalog/pg_proc.dat src/include/catalog/pg_proc.dat
index 139f4a08bd..0e6d0636fa 100644
--- src/include/catalog/pg_proc.dat
+++ src/include/catalog/pg_proc.dat
@@ -1446,6 +1446,9 @@
{ oid => '752', descr => 'substitute portion of string',
proname => 'overlay', prorettype => 'bytea',
proargtypes => 'bytea bytea int4', prosrc => 'byteaoverlay_no_len' },
+{ oid => '8436', descr => 'count set bits',
+ proname => 'popcount', prorettype => 'int4', proargtypes => 'bytea',
+ prosrc => 'byteapopcount'},
{ oid => '725',
proname => 'dist_pl', prorettype => 'float8', proargtypes => 'point line',
@@ -3865,6 +3868,9 @@
{ oid => '3033', descr => 'set bit',
proname => 'set_bit', prorettype => 'bit', proargtypes => 'bit int4 int4',
prosrc => 'bitsetbit' },
+{ oid => '8435', descr => 'count set bits',
+ proname => 'popcount', prorettype => 'int4', proargtypes => 'bit',
+ prosrc => 'bitpopcount'},
# for macaddr type support
{ oid => '436', descr => 'I/O',
diff --git src/backend/utils/adt/varbit.c src/backend/utils/adt/varbit.c
index 3c03459f51..e92d1a7f8f 100644
--- src/backend/utils/adt/varbit.c
+++ src/backend/utils/adt/varbit.c
@@ -29,6 +29,8 @@
*-------------------------------------------------------------------------
*/
+#include <math.h>
+
#include "postgres.h"
#include "access/htup_details.h"
@@ -36,6 +38,7 @@
#include "libpq/pqformat.h"
#include "nodes/nodeFuncs.h"
#include "nodes/supportnodes.h"
+#include "port/pg_bitutils.h"
#include "utils/array.h"
#include "utils/builtins.h"
#include "utils/varbit.h"
@@ -1872,3 +1875,24 @@ bitgetbit(PG_FUNCTION_ARGS)
else
PG_RETURN_INT32(0);
}
+
+/*
+ * bitpopcount
+ *
+ * Returns the number of bits set in a bit array.
+ *
+ */
+Datum
+bitpopcount(PG_FUNCTION_ARGS)
+{
+ VarBit *arg1 = PG_GETARG_VARBIT_P(0);
+ bits8 *p;
+ int len, popcount;
+
+ len = ceil(VARBITLEN(arg1) / (float4)BITS_PER_BYTE);
+ p = VARBITS(arg1);
+
+ popcount = (int)pg_popcount((const char *)p, len);
+
+ PG_RETURN_INT32(popcount);
+}
diff --git src/backend/utils/adt/varlena.c src/backend/utils/adt/varlena.c
index 9300d19e0c..9e788babd3 100644
--- src/backend/utils/adt/varlena.c
+++ src/backend/utils/adt/varlena.c
@@ -3415,6 +3415,21 @@ bytea_overlay(bytea *t1, bytea *t2, int sp, int sl)
return result;
}
+/*
+ * popcount
+ */
+Datum
+byteapopcount(PG_FUNCTION_ARGS)
+{
+ bytea *t1 = PG_GETARG_BYTEA_PP(0);
+ int len, result;
+
+ len = VARSIZE_ANY_EXHDR(t1);
+ result = pg_popcount(VARDATA_ANY(t1), len);
+
+ PG_RETURN_INT32(result);
+}
+
/*
* byteapos -
* Return the position of the specified substring.
diff --git src/test/regress/expected/bit.out src/test/regress/expected/bit.out
index a1fab7ebcb..c91ad3d0d5 100644
--- src/test/regress/expected/bit.out
+++ src/test/regress/expected/bit.out
@@ -681,6 +681,19 @@ SELECT overlay(B'0101011100' placing '001' from 20);
0101011100001
(1 row)
+-- Popcount
+SELECT popcount(B'0101011100'::bit(10));
+ popcount
+----------
+ 5
+(1 row)
+
+SELECT popcount(B'1111111111'::bit(10));
+ popcount
+----------
+ 10
+(1 row)
+
-- This table is intentionally left around to exercise pg_dump/pg_upgrade
CREATE TABLE bit_defaults(
b1 bit(4) DEFAULT '1001',
diff --git src/test/regress/expected/strings.out src/test/regress/expected/strings.out
index 298b6c48c2..9780639038 100644
--- src/test/regress/expected/strings.out
+++ src/test/regress/expected/strings.out
@@ -2126,3 +2126,10 @@ SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 5
Th\000o\x02\x03
(1 row)
+SET bytea_output TO hex;
+SELECT popcount(E'\\xdeadbeef'::bytea);
+ popcount
+----------
+ 24
+(1 row)
+
diff --git src/test/regress/sql/bit.sql src/test/regress/sql/bit.sql
index 7681d4ab4d..3cd9fb65d3 100644
--- src/test/regress/sql/bit.sql
+++ src/test/regress/sql/bit.sql
@@ -207,6 +207,10 @@ SELECT overlay(B'0101011100' placing '101' from 6);
SELECT overlay(B'0101011100' placing '001' from 11);
SELECT overlay(B'0101011100' placing '001' from 20);
+-- Popcount
+SELECT popcount(B'0101011100'::bit(10));
+SELECT popcount(B'1111111111'::bit(10));
+
-- This table is intentionally left around to exercise pg_dump/pg_upgrade
CREATE TABLE bit_defaults(
b1 bit(4) DEFAULT '1001',
diff --git src/test/regress/sql/strings.sql src/test/regress/sql/strings.sql
index ad5221ab6b..a35e993ab0 100644
--- src/test/regress/sql/strings.sql
+++ src/test/regress/sql/strings.sql
@@ -717,3 +717,6 @@ SELECT btrim(E'\\000trim\\000'::bytea, ''::bytea);
SELECT encode(overlay(E'Th\\000omas'::bytea placing E'Th\\001omas'::bytea from 2),'escape');
SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 8),'escape');
SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 5 for 3),'escape');
+
+SET bytea_output TO hex;
+SELECT popcount(E'\\xdeadbeef'::bytea);
--------------2.29.2--
David Fetter wrote:
+Datum
+byteapopcount(PG_FUNCTION_ARGS)
+{
+ bytea *t1 = PG_GETARG_BYTEA_PP(0);
+ int len, result;
+
+ len = VARSIZE_ANY_EXHDR(t1);
+ result = pg_popcount(VARDATA_ANY(t1), len);
+
+ PG_RETURN_INT32(result);
+}
The input may have more than 2 billion bits set to 1. The biggest possible
result should be 8 billion for bytea (1 GB with all bits set to 1).
So shouldn't this function return an int8?
There is a pre-existing function in core: bit_length(bytea) returning int4,
but it errors out for large values (in fact it's a SQL function that returns
octet_length($1)*8).
For the varbit case, it doesn't seem like it can hold so many bits, although
I don't see the limit documented in
https://www.postgresql.org/docs/current/datatype-bit.html
Best regards,
--
Daniel Vérité
PostgreSQL-powered mailer: https://www.manitou-mail.org
Twitter: @DanielVerite
On Wed, Dec 30, 2020 at 05:27:04PM +0100, Daniel Verite wrote:
David Fetter wrote:
+Datum +byteapopcount(PG_FUNCTION_ARGS) +{ + bytea *t1 = PG_GETARG_BYTEA_PP(0); + int len, result; + + len = VARSIZE_ANY_EXHDR(t1); + result = pg_popcount(VARDATA_ANY(t1), len); + + PG_RETURN_INT32(result); +}The input may have more than 2 billion bits set to 1. The biggest possible
result should be 8 billion for bytea (1 GB with all bits set to 1).
So shouldn't this function return an int8?
It does now, and thanks for looking at this.
There is a pre-existing function in core: bit_length(bytea) returning int4,
but it errors out for large values (in fact it's a SQL function that returns
octet_length($1)*8).For the varbit case, it doesn't seem like it can hold so many bits, although
I don't see the limit documented in
https://www.postgresql.org/docs/current/datatype-bit.html
Out of an abundance of caution, I changed that one, too.
I'd noticed earlier that ceil() doesn't need to be quite as wasteful
as the way I had it earlier, and fixed that with help from Andrew
Gierth.
Best,
David.
--
David Fetter <david(at)fetter(dot)org> http://fetter.org/
Phone: +1 415 235 3778
Remember to vote!
Consider donating to Postgres: http://www.postgresql.org/about/donate
Attachments:
v2-0001-popcount.patchtext/x-diff; charset=us-asciiDownload
From 6d35100b3bc16c1c7d3bf08c62589f3e039cee76 Mon Sep 17 00:00:00 2001
From: David Fetter <david@fetter.org>
Date: Wed, 30 Dec 2020 02:51:46 -0800
Subject: [PATCH v2] popcount
To: hackers
MIME-Version: 1.0
Content-Type: multipart/mixed; boundary="------------2.29.2"
This is a multi-part message in MIME format.
--------------2.29.2
Content-Type: text/plain; charset=UTF-8; format=fixed
Content-Transfer-Encoding: 8bit
Now it's accessible to SQL for the BIT VARYING and BYTEA types.
diff --git doc/src/sgml/func.sgml doc/src/sgml/func.sgml
index 5021ac1ca9..439a3a4a06 100644
--- doc/src/sgml/func.sgml
+++ doc/src/sgml/func.sgml
@@ -4069,6 +4069,23 @@ SELECT format('Testing %3$s, %2$s, %s', 'one', 'two', 'three');
</para></entry>
</row>
+ <row>
+ <entry role="func_table_entry"><para role="func_signature">
+ <indexterm>
+ <primary>popcount</primary>
+ </indexterm>
+ <function>popcount</function> ( <parameter>bytes</parameter> <type>bytea</type> )
+ <returnvalue>integer</returnvalue>
+ </para>
+ <para>
+ Counts the number of bits set in a binary string.
+ </para>
+ <para>
+ <literal>popcount('\xdeadbeef'::bytea)</literal>
+ <returnvalue>24</returnvalue>
+ </para></entry>
+ </row>
+
<row>
<entry role="func_table_entry"><para role="func_signature">
<indexterm>
@@ -4830,6 +4847,24 @@ SELECT format('Testing %3$s, %2$s, %s', 'one', 'two', 'three');
<returnvalue>101010001010101010</returnvalue>
</para></entry>
</row>
+
+ <row>
+ <entry role="func_table_entry"><para role="func_signature">
+ <indexterm>
+ <primary>popcount</primary>
+ </indexterm>
+ <function>popcount</function> ( <parameter>bits</parameter> <type>bit</type> )
+ <returnvalue>int4</returnvalue>
+ </para>
+ <para>
+ Counts the bits set in a bit string.
+ </para>
+ <para>
+ <literal>popcount(B'101010101010101010')</literal>
+ <returnvalue>9</returnvalue>
+ </para></entry>
+ </row>
+
</tbody>
</tgroup>
</table>
diff --git src/include/catalog/pg_proc.dat src/include/catalog/pg_proc.dat
index 139f4a08bd..82e3d50114 100644
--- src/include/catalog/pg_proc.dat
+++ src/include/catalog/pg_proc.dat
@@ -1446,6 +1446,9 @@
{ oid => '752', descr => 'substitute portion of string',
proname => 'overlay', prorettype => 'bytea',
proargtypes => 'bytea bytea int4', prosrc => 'byteaoverlay_no_len' },
+{ oid => '8436', descr => 'count set bits',
+ proname => 'popcount', prorettype => 'int8', proargtypes => 'bytea',
+ prosrc => 'byteapopcount'},
{ oid => '725',
proname => 'dist_pl', prorettype => 'float8', proargtypes => 'point line',
@@ -3865,6 +3868,9 @@
{ oid => '3033', descr => 'set bit',
proname => 'set_bit', prorettype => 'bit', proargtypes => 'bit int4 int4',
prosrc => 'bitsetbit' },
+{ oid => '8435', descr => 'count set bits',
+ proname => 'popcount', prorettype => 'int8', proargtypes => 'bit',
+ prosrc => 'bitpopcount'},
# for macaddr type support
{ oid => '436', descr => 'I/O',
diff --git src/backend/utils/adt/varbit.c src/backend/utils/adt/varbit.c
index 3c03459f51..a71c2b6bf4 100644
--- src/backend/utils/adt/varbit.c
+++ src/backend/utils/adt/varbit.c
@@ -36,6 +36,7 @@
#include "libpq/pqformat.h"
#include "nodes/nodeFuncs.h"
#include "nodes/supportnodes.h"
+#include "port/pg_bitutils.h"
#include "utils/array.h"
#include "utils/builtins.h"
#include "utils/varbit.h"
@@ -1872,3 +1873,29 @@ bitgetbit(PG_FUNCTION_ARGS)
else
PG_RETURN_INT32(0);
}
+
+/*
+ * bitpopcount
+ *
+ * Returns the number of bits set in a bit array.
+ *
+ */
+Datum
+bitpopcount(PG_FUNCTION_ARGS)
+{
+ VarBit *arg1 = PG_GETARG_VARBIT_P(0);
+ bits8 *p;
+ int len;
+ int8 popcount;
+
+ /*
+ * ceil(VARBITLEN(ARG1)/BITS_PER_BYTE)
+ * done to minimize branches and instructions.
+ */
+ len = (VARBITLEN(arg1) + BITS_PER_BYTE + 1) / BITS_PER_BYTE;
+ p = VARBITS(arg1);
+
+ popcount = pg_popcount((const char *)p, len);
+
+ PG_RETURN_INT64(popcount);
+}
diff --git src/backend/utils/adt/varlena.c src/backend/utils/adt/varlena.c
index 9300d19e0c..9c5e5b5592 100644
--- src/backend/utils/adt/varlena.c
+++ src/backend/utils/adt/varlena.c
@@ -3415,6 +3415,22 @@ bytea_overlay(bytea *t1, bytea *t2, int sp, int sl)
return result;
}
+/*
+ * popcount
+ */
+Datum
+byteapopcount(PG_FUNCTION_ARGS)
+{
+ bytea *t1 = PG_GETARG_BYTEA_PP(0);
+ int len;
+ int8 result;
+
+ len = VARSIZE_ANY_EXHDR(t1);
+ result = pg_popcount(VARDATA_ANY(t1), len);
+
+ PG_RETURN_INT64(result);
+}
+
/*
* byteapos -
* Return the position of the specified substring.
diff --git src/test/regress/expected/bit.out src/test/regress/expected/bit.out
index a1fab7ebcb..c91ad3d0d5 100644
--- src/test/regress/expected/bit.out
+++ src/test/regress/expected/bit.out
@@ -681,6 +681,19 @@ SELECT overlay(B'0101011100' placing '001' from 20);
0101011100001
(1 row)
+-- Popcount
+SELECT popcount(B'0101011100'::bit(10));
+ popcount
+----------
+ 5
+(1 row)
+
+SELECT popcount(B'1111111111'::bit(10));
+ popcount
+----------
+ 10
+(1 row)
+
-- This table is intentionally left around to exercise pg_dump/pg_upgrade
CREATE TABLE bit_defaults(
b1 bit(4) DEFAULT '1001',
diff --git src/test/regress/expected/strings.out src/test/regress/expected/strings.out
index 298b6c48c2..9780639038 100644
--- src/test/regress/expected/strings.out
+++ src/test/regress/expected/strings.out
@@ -2126,3 +2126,10 @@ SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 5
Th\000o\x02\x03
(1 row)
+SET bytea_output TO hex;
+SELECT popcount(E'\\xdeadbeef'::bytea);
+ popcount
+----------
+ 24
+(1 row)
+
diff --git src/test/regress/sql/bit.sql src/test/regress/sql/bit.sql
index 7681d4ab4d..3cd9fb65d3 100644
--- src/test/regress/sql/bit.sql
+++ src/test/regress/sql/bit.sql
@@ -207,6 +207,10 @@ SELECT overlay(B'0101011100' placing '101' from 6);
SELECT overlay(B'0101011100' placing '001' from 11);
SELECT overlay(B'0101011100' placing '001' from 20);
+-- Popcount
+SELECT popcount(B'0101011100'::bit(10));
+SELECT popcount(B'1111111111'::bit(10));
+
-- This table is intentionally left around to exercise pg_dump/pg_upgrade
CREATE TABLE bit_defaults(
b1 bit(4) DEFAULT '1001',
diff --git src/test/regress/sql/strings.sql src/test/regress/sql/strings.sql
index ad5221ab6b..a35e993ab0 100644
--- src/test/regress/sql/strings.sql
+++ src/test/regress/sql/strings.sql
@@ -717,3 +717,6 @@ SELECT btrim(E'\\000trim\\000'::bytea, ''::bytea);
SELECT encode(overlay(E'Th\\000omas'::bytea placing E'Th\\001omas'::bytea from 2),'escape');
SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 8),'escape');
SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 5 for 3),'escape');
+
+SET bytea_output TO hex;
+SELECT popcount(E'\\xdeadbeef'::bytea);
--------------2.29.2--
On 2020-12-30 17:41, David Fetter wrote:
The input may have more than 2 billion bits set to 1. The biggest possible
result should be 8 billion for bytea (1 GB with all bits set to 1).
So shouldn't this function return an int8?It does now, and thanks for looking at this.
The documentation still reflects the previous int4 return type (in two
different spellings, too).
On Mon, Jan 11, 2021 at 03:50:54PM +0100, Peter Eisentraut wrote:
On 2020-12-30 17:41, David Fetter wrote:
The input may have more than 2 billion bits set to 1. The biggest possible
result should be 8 billion for bytea (1 GB with all bits set to 1).
So shouldn't this function return an int8?It does now, and thanks for looking at this.
The documentation still reflects the previous int4 return type (in two
different spellings, too).
Thanks for looking this over!
Please find attached the next version with corrected documentation.
Best,
David.
--
David Fetter <david(at)fetter(dot)org> http://fetter.org/
Phone: +1 415 235 3778
Remember to vote!
Consider donating to Postgres: http://www.postgresql.org/about/donate
Attachments:
v3-0001-popcount.patchtext/x-diff; charset=us-asciiDownload
From f0f8e0639a4d08db7d454d5181aa9d98d31264a3 Mon Sep 17 00:00:00 2001
From: David Fetter <david@fetter.org>
Date: Wed, 30 Dec 2020 02:51:46 -0800
Subject: [PATCH v3] popcount
To: hackers
MIME-Version: 1.0
Content-Type: multipart/mixed; boundary="------------2.29.2"
This is a multi-part message in MIME format.
--------------2.29.2
Content-Type: text/plain; charset=UTF-8; format=fixed
Content-Transfer-Encoding: 8bit
Now it's accessible to SQL for the BIT VARYING and BYTEA types.
diff --git doc/src/sgml/func.sgml doc/src/sgml/func.sgml
index 02a37658ad..1d86d610dd 100644
--- doc/src/sgml/func.sgml
+++ doc/src/sgml/func.sgml
@@ -4069,6 +4069,23 @@ SELECT format('Testing %3$s, %2$s, %s', 'one', 'two', 'three');
</para></entry>
</row>
+ <row>
+ <entry role="func_table_entry"><para role="func_signature">
+ <indexterm>
+ <primary>popcount</primary>
+ </indexterm>
+ <function>popcount</function> ( <parameter>bytes</parameter> <type>bytea</type> )
+ <returnvalue>bigint</returnvalue>
+ </para>
+ <para>
+ Counts the number of bits set in a binary string.
+ </para>
+ <para>
+ <literal>popcount('\xdeadbeef'::bytea)</literal>
+ <returnvalue>24</returnvalue>
+ </para></entry>
+ </row>
+
<row>
<entry role="func_table_entry"><para role="func_signature">
<indexterm>
@@ -4830,6 +4847,24 @@ SELECT format('Testing %3$s, %2$s, %s', 'one', 'two', 'three');
<returnvalue>101010001010101010</returnvalue>
</para></entry>
</row>
+
+ <row>
+ <entry role="func_table_entry"><para role="func_signature">
+ <indexterm>
+ <primary>popcount</primary>
+ </indexterm>
+ <function>popcount</function> ( <parameter>bits</parameter> <type>bit</type> )
+ <returnvalue>bigint</returnvalue>
+ </para>
+ <para>
+ Counts the bits set in a bit string.
+ </para>
+ <para>
+ <literal>popcount(B'101010101010101010')</literal>
+ <returnvalue>9</returnvalue>
+ </para></entry>
+ </row>
+
</tbody>
</tgroup>
</table>
diff --git src/include/catalog/pg_proc.dat src/include/catalog/pg_proc.dat
index d7b55f57ea..2d53e0d46d 100644
--- src/include/catalog/pg_proc.dat
+++ src/include/catalog/pg_proc.dat
@@ -1446,6 +1446,9 @@
{ oid => '752', descr => 'substitute portion of string',
proname => 'overlay', prorettype => 'bytea',
proargtypes => 'bytea bytea int4', prosrc => 'byteaoverlay_no_len' },
+{ oid => '8436', descr => 'count set bits',
+ proname => 'popcount', prorettype => 'int8', proargtypes => 'bytea',
+ prosrc => 'byteapopcount'},
{ oid => '725',
proname => 'dist_pl', prorettype => 'float8', proargtypes => 'point line',
@@ -3865,6 +3868,9 @@
{ oid => '3033', descr => 'set bit',
proname => 'set_bit', prorettype => 'bit', proargtypes => 'bit int4 int4',
prosrc => 'bitsetbit' },
+{ oid => '8435', descr => 'count set bits',
+ proname => 'popcount', prorettype => 'int8', proargtypes => 'bit',
+ prosrc => 'bitpopcount'},
# for macaddr type support
{ oid => '436', descr => 'I/O',
diff --git src/backend/utils/adt/varbit.c src/backend/utils/adt/varbit.c
index 2235866244..a6a44f3f4f 100644
--- src/backend/utils/adt/varbit.c
+++ src/backend/utils/adt/varbit.c
@@ -36,6 +36,7 @@
#include "libpq/pqformat.h"
#include "nodes/nodeFuncs.h"
#include "nodes/supportnodes.h"
+#include "port/pg_bitutils.h"
#include "utils/array.h"
#include "utils/builtins.h"
#include "utils/varbit.h"
@@ -1878,3 +1879,29 @@ bitgetbit(PG_FUNCTION_ARGS)
else
PG_RETURN_INT32(0);
}
+
+/*
+ * bitpopcount
+ *
+ * Returns the number of bits set in a bit array.
+ *
+ */
+Datum
+bitpopcount(PG_FUNCTION_ARGS)
+{
+ VarBit *arg1 = PG_GETARG_VARBIT_P(0);
+ bits8 *p;
+ int len;
+ int8 popcount;
+
+ /*
+ * ceil(VARBITLEN(ARG1)/BITS_PER_BYTE)
+ * done to minimize branches and instructions.
+ */
+ len = (VARBITLEN(arg1) + BITS_PER_BYTE + 1) / BITS_PER_BYTE;
+ p = VARBITS(arg1);
+
+ popcount = pg_popcount((const char *)p, len);
+
+ PG_RETURN_INT64(popcount);
+}
diff --git src/backend/utils/adt/varlena.c src/backend/utils/adt/varlena.c
index 4838bfb4ff..da3ba769c4 100644
--- src/backend/utils/adt/varlena.c
+++ src/backend/utils/adt/varlena.c
@@ -3433,6 +3433,22 @@ bytea_overlay(bytea *t1, bytea *t2, int sp, int sl)
return result;
}
+/*
+ * popcount
+ */
+Datum
+byteapopcount(PG_FUNCTION_ARGS)
+{
+ bytea *t1 = PG_GETARG_BYTEA_PP(0);
+ int len;
+ int8 result;
+
+ len = VARSIZE_ANY_EXHDR(t1);
+ result = pg_popcount(VARDATA_ANY(t1), len);
+
+ PG_RETURN_INT64(result);
+}
+
/*
* byteapos -
* Return the position of the specified substring.
diff --git src/test/regress/expected/bit.out src/test/regress/expected/bit.out
index a7f95b846d..228f992ced 100644
--- src/test/regress/expected/bit.out
+++ src/test/regress/expected/bit.out
@@ -710,6 +710,19 @@ SELECT overlay(B'0101011100' placing '001' from 20);
0101011100001
(1 row)
+-- Popcount
+SELECT popcount(B'0101011100'::bit(10));
+ popcount
+----------
+ 5
+(1 row)
+
+SELECT popcount(B'1111111111'::bit(10));
+ popcount
+----------
+ 10
+(1 row)
+
-- This table is intentionally left around to exercise pg_dump/pg_upgrade
CREATE TABLE bit_defaults(
b1 bit(4) DEFAULT '1001',
diff --git src/test/regress/expected/strings.out src/test/regress/expected/strings.out
index 595bd2446e..dd962e5f48 100644
--- src/test/regress/expected/strings.out
+++ src/test/regress/expected/strings.out
@@ -2167,3 +2167,10 @@ SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 5
Th\000o\x02\x03
(1 row)
+SET bytea_output TO hex;
+SELECT popcount(E'\\xdeadbeef'::bytea);
+ popcount
+----------
+ 24
+(1 row)
+
diff --git src/test/regress/sql/bit.sql src/test/regress/sql/bit.sql
index ea01742c4a..fa77bf45c4 100644
--- src/test/regress/sql/bit.sql
+++ src/test/regress/sql/bit.sql
@@ -215,6 +215,10 @@ SELECT overlay(B'0101011100' placing '101' from 6);
SELECT overlay(B'0101011100' placing '001' from 11);
SELECT overlay(B'0101011100' placing '001' from 20);
+-- Popcount
+SELECT popcount(B'0101011100'::bit(10));
+SELECT popcount(B'1111111111'::bit(10));
+
-- This table is intentionally left around to exercise pg_dump/pg_upgrade
CREATE TABLE bit_defaults(
b1 bit(4) DEFAULT '1001',
diff --git src/test/regress/sql/strings.sql src/test/regress/sql/strings.sql
index ca465d050f..48964e71cb 100644
--- src/test/regress/sql/strings.sql
+++ src/test/regress/sql/strings.sql
@@ -728,3 +728,6 @@ SELECT btrim(E'\\000trim\\000'::bytea, ''::bytea);
SELECT encode(overlay(E'Th\\000omas'::bytea placing E'Th\\001omas'::bytea from 2),'escape');
SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 8),'escape');
SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 5 for 3),'escape');
+
+SET bytea_output TO hex;
+SELECT popcount(E'\\xdeadbeef'::bytea);
--------------2.29.2--
On 2021-01-11 17:13, David Fetter wrote:
On Mon, Jan 11, 2021 at 03:50:54PM +0100, Peter Eisentraut wrote:
On 2020-12-30 17:41, David Fetter wrote:
The input may have more than 2 billion bits set to 1. The biggest possible
result should be 8 billion for bytea (1 GB with all bits set to 1).
So shouldn't this function return an int8?It does now, and thanks for looking at this.
The documentation still reflects the previous int4 return type (in two
different spellings, too).Thanks for looking this over!
Please find attached the next version with corrected documentation.
The documentation entries should be placed in alphabetical order in
their respective tables.
For the bytea function, maybe choose a simpler example that a reader can
compute in their head. Also for the test cases.
In varbit.c:
The comment says
+ * Returns the number of bits set in a bit array.
but it should be "bit string".
+ int8 popcount;
should be int64.
Also, pg_popcount() returns uint64, not int64. Perhaps overflow is not
possible here, but perhaps a small comment about why or an assertion
could be appropriate.
+ p = VARBITS(arg1);
Why not assign that in the declaration block as well?
This comment:
+ /*
+ * ceil(VARBITLEN(ARG1)/BITS_PER_BYTE)
+ * done to minimize branches and instructions.
+ */
I don't know what that is supposed to mean or why that kind of tuning
would be necessary for a user-callable function.
+ popcount = pg_popcount((const char *)p, len);
The "const" is probably not necessary.
byteapopcount() looks better, but also needs int8 -> int64.
Peter Eisentraut <peter.eisentraut@enterprisedb.com> writes:
[ assorted nits ]
At the level of bikeshedding ... I quite dislike using the name "popcount"
for these functions. I'm aware that some C compilers provide primitives
of that name, but I wouldn't expect a SQL programmer to know that;
without that context the name seems pretty random and unintuitive.
Moreover, it invites confusion with SQL's use of "pop" to abbreviate
"population" in the statistical aggregates, such as var_pop().
Perhaps something along the lines of count_ones() or count_set_bits()
would be more apropos.
regards, tom lane
Peter Eisentraut wrote:
+ /* + * ceil(VARBITLEN(ARG1)/BITS_PER_BYTE) + * done to minimize branches and instructions. + */I don't know what that is supposed to mean or why that kind of tuning
would be necessary for a user-callable function.
Also, the formula just below looks wrong in the current patch (v3):
+ /*
+ * ceil(VARBITLEN(ARG1)/BITS_PER_BYTE)
+ * done to minimize branches and instructions.
+ */
+ len = (VARBITLEN(arg1) + BITS_PER_BYTE + 1) / BITS_PER_BYTE;
+ p = VARBITS(arg1);
+
+ popcount = pg_popcount((const char *)p, len);
It should be
len = (VARBITLEN(arg1) + BITS_PER_BYTE - 1) / BITS_PER_BYTE
If we add 1 to BITS_PER_BYTE instead of subtracting 1, when
VARBITLEN(arg1) is a multiple of BITS_PER_BYTE, "len" is one byte too
large.
The correct formula is already used in include/utils/varbit.h near the
definition of VARBITLEN:
#define VARBITTOTALLEN(BITLEN) (((BITLEN) + BITS_PER_BYTE-1)/BITS_PER_BYTE +
\
VARHDRSZ +
VARBITHDRSZ)
Best regards,
--
Daniel Vérité
PostgreSQL-powered mailer: https://www.manitou-mail.org
Twitter: @DanielVerite
On Mon, Jan 18, 2021 at 10:34:10AM -0500, Tom Lane wrote:
Peter Eisentraut <peter.eisentraut@enterprisedb.com> writes:
[ assorted nits ]
fixed, I think.
At the level of bikeshedding ... I quite dislike using the name "popcount"
for these functions. I'm aware that some C compilers provide primitives
of that name, but I wouldn't expect a SQL programmer to know that;
without that context the name seems pretty random and unintuitive.
Moreover, it invites confusion with SQL's use of "pop" to abbreviate
"population" in the statistical aggregates, such as var_pop().Perhaps something along the lines of count_ones() or count_set_bits()
would be more apropos.
Done that way.
Best,
David.
--
David Fetter <david(at)fetter(dot)org> http://fetter.org/
Phone: +1 415 235 3778
Remember to vote!
Consider donating to Postgres: http://www.postgresql.org/about/donate
Attachments:
v4-0001-popcount.patchtext/x-diff; charset=us-asciiDownload
From 92f3d5bf1718b1b52a5a4d792f5c75e0bcc7fb74 Mon Sep 17 00:00:00 2001
From: David Fetter <david@fetter.org>
Date: Wed, 30 Dec 2020 02:51:46 -0800
Subject: [PATCH v4] popcount
To: hackers
MIME-Version: 1.0
Content-Type: multipart/mixed; boundary="------------2.29.2"
This is a multi-part message in MIME format.
--------------2.29.2
Content-Type: text/plain; charset=UTF-8; format=fixed
Content-Transfer-Encoding: 8bit
Now it's accessible to SQL for the BIT VARYING and BYTEA types.
diff --git doc/src/sgml/func.sgml doc/src/sgml/func.sgml
index aa99665e2e..7626edeee7 100644
--- doc/src/sgml/func.sgml
+++ doc/src/sgml/func.sgml
@@ -4030,6 +4030,23 @@ SELECT format('Testing %3$s, %2$s, %s', 'one', 'two', 'three');
</para></entry>
</row>
+ <row>
+ <entry role="func_table_entry"><para role="func_signature">
+ <indexterm>
+ <primary>count_set_bits</primary>
+ </indexterm>
+ <function>count_set_bits</function> ( <parameter>bytes</parameter> <type>bytea</type> )
+ <returnvalue>bigint</returnvalue>
+ </para>
+ <para>
+ Counts the number of bits set in a binary string.
+ </para>
+ <para>
+ <literal>count_set_bits('\xdeadbeef'::bytea)</literal>
+ <returnvalue>24</returnvalue>
+ </para></entry>
+ </row>
+
<row>
<entry role="func_table_entry"><para role="func_signature">
<indexterm>
@@ -4830,6 +4847,23 @@ SELECT format('Testing %3$s, %2$s, %s', 'one', 'two', 'three');
</para></entry>
</row>
+ <row>
+ <entry role="func_table_entry"><para role="func_signature">
+ <indexterm>
+ <primary>count_set_bits</primary>
+ </indexterm>
+ <function>count_set_bits</function> ( <parameter>bits</parameter> <type>bit</type> )
+ <returnvalue>bigint</returnvalue>
+ </para>
+ <para>
+ Counts the bits set in a bit string.
+ </para>
+ <para>
+ <literal>count_set_bits(B'101010101010101010')</literal>
+ <returnvalue>9</returnvalue>
+ </para></entry>
+ </row>
+
<row>
<entry role="func_table_entry"><para role="func_signature">
<indexterm>
@@ -4869,6 +4903,7 @@ SELECT format('Testing %3$s, %2$s, %s', 'one', 'two', 'three');
<returnvalue>101010001010101010</returnvalue>
</para></entry>
</row>
+
</tbody>
</tgroup>
</table>
diff --git src/include/catalog/pg_proc.dat src/include/catalog/pg_proc.dat
index b5f52d4e4a..416ee0c2fb 100644
--- src/include/catalog/pg_proc.dat
+++ src/include/catalog/pg_proc.dat
@@ -1446,6 +1446,9 @@
{ oid => '752', descr => 'substitute portion of string',
proname => 'overlay', prorettype => 'bytea',
proargtypes => 'bytea bytea int4', prosrc => 'byteaoverlay_no_len' },
+{ oid => '8436', descr => 'count set bits',
+ proname => 'count_set_bits', prorettype => 'int8', proargtypes => 'bytea',
+ prosrc => 'byteapopcount'},
{ oid => '725',
proname => 'dist_pl', prorettype => 'float8', proargtypes => 'point line',
@@ -3865,6 +3868,9 @@
{ oid => '3033', descr => 'set bit',
proname => 'set_bit', prorettype => 'bit', proargtypes => 'bit int4 int4',
prosrc => 'bitsetbit' },
+{ oid => '8435', descr => 'count set bits',
+ proname => 'count_set_bits', prorettype => 'int8', proargtypes => 'bit',
+ prosrc => 'bitpopcount'},
# for macaddr type support
{ oid => '436', descr => 'I/O',
diff --git src/backend/utils/adt/varbit.c src/backend/utils/adt/varbit.c
index 2235866244..c9c6c73422 100644
--- src/backend/utils/adt/varbit.c
+++ src/backend/utils/adt/varbit.c
@@ -36,6 +36,7 @@
#include "libpq/pqformat.h"
#include "nodes/nodeFuncs.h"
#include "nodes/supportnodes.h"
+#include "port/pg_bitutils.h"
#include "utils/array.h"
#include "utils/builtins.h"
#include "utils/varbit.h"
@@ -1878,3 +1879,30 @@ bitgetbit(PG_FUNCTION_ARGS)
else
PG_RETURN_INT32(0);
}
+
+/*
+ * bitpopcount
+ *
+ * Returns the number of bits set in a bit string.
+ *
+ */
+Datum
+bitpopcount(PG_FUNCTION_ARGS)
+{
+ /* There's really no chance of an overflow here because
+ * to get to INT64_MAX set bits, an object would have to be
+ * an exbibyte long, exceeding what PostgreSQL can currently
+ * store by a factor of 2^28
+ */
+ int64 popcount;
+ VarBit *arg1 = PG_GETARG_VARBIT_P(0);
+ bits8 *p;
+ int len;
+
+ p = VARBITS(arg1);
+ len = (VARBITLEN(arg1) + BITS_PER_BYTE - 1) / BITS_PER_BYTE;
+
+ popcount = pg_popcount((char *)p, len);
+
+ PG_RETURN_INT64(popcount);
+}
diff --git src/backend/utils/adt/varlena.c src/backend/utils/adt/varlena.c
index 479ed9ae54..3f1179a0e8 100644
--- src/backend/utils/adt/varlena.c
+++ src/backend/utils/adt/varlena.c
@@ -3439,6 +3439,22 @@ bytea_overlay(bytea *t1, bytea *t2, int sp, int sl)
return result;
}
+/*
+ * popcount
+ */
+Datum
+byteapopcount(PG_FUNCTION_ARGS)
+{
+ bytea *t1 = PG_GETARG_BYTEA_PP(0);
+ int len;
+ int64 result;
+
+ len = VARSIZE_ANY_EXHDR(t1);
+ result = pg_popcount(VARDATA_ANY(t1), len);
+
+ PG_RETURN_INT64(result);
+}
+
/*
* byteapos -
* Return the position of the specified substring.
diff --git src/test/regress/expected/bit.out src/test/regress/expected/bit.out
index a7f95b846d..1187076a8d 100644
--- src/test/regress/expected/bit.out
+++ src/test/regress/expected/bit.out
@@ -710,6 +710,19 @@ SELECT overlay(B'0101011100' placing '001' from 20);
0101011100001
(1 row)
+-- Popcount
+SELECT count_set_bits(B'0101011100'::bit(10));
+ count_set_bits
+----------------
+ 5
+(1 row)
+
+SELECT count_set_bits(B'1111111111'::bit(10));
+ count_set_bits
+----------------
+ 10
+(1 row)
+
-- This table is intentionally left around to exercise pg_dump/pg_upgrade
CREATE TABLE bit_defaults(
b1 bit(4) DEFAULT '1001',
diff --git src/test/regress/expected/strings.out src/test/regress/expected/strings.out
index 7c91afa6e4..e94e76000b 100644
--- src/test/regress/expected/strings.out
+++ src/test/regress/expected/strings.out
@@ -2179,3 +2179,10 @@ SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 5
Th\000o\x02\x03
(1 row)
+SET bytea_output TO hex;
+SELECT count_set_bits(E'\\xdeadbeef'::bytea);
+ count_set_bits
+----------------
+ 24
+(1 row)
+
diff --git src/test/regress/sql/bit.sql src/test/regress/sql/bit.sql
index ea01742c4a..8893d38b0d 100644
--- src/test/regress/sql/bit.sql
+++ src/test/regress/sql/bit.sql
@@ -215,6 +215,10 @@ SELECT overlay(B'0101011100' placing '101' from 6);
SELECT overlay(B'0101011100' placing '001' from 11);
SELECT overlay(B'0101011100' placing '001' from 20);
+-- Popcount
+SELECT count_set_bits(B'0101011100'::bit(10));
+SELECT count_set_bits(B'1111111111'::bit(10));
+
-- This table is intentionally left around to exercise pg_dump/pg_upgrade
CREATE TABLE bit_defaults(
b1 bit(4) DEFAULT '1001',
diff --git src/test/regress/sql/strings.sql src/test/regress/sql/strings.sql
index ef4bfb008a..a4f7879b16 100644
--- src/test/regress/sql/strings.sql
+++ src/test/regress/sql/strings.sql
@@ -730,3 +730,6 @@ SELECT btrim(E'\\000trim\\000'::bytea, ''::bytea);
SELECT encode(overlay(E'Th\\000omas'::bytea placing E'Th\\001omas'::bytea from 2),'escape');
SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 8),'escape');
SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 5 for 3),'escape');
+
+SET bytea_output TO hex;
+SELECT count_set_bits(E'\\xdeadbeef'::bytea);
--------------2.29.2--
On 2021-01-18 16:34, Tom Lane wrote:
Peter Eisentraut <peter.eisentraut@enterprisedb.com> writes:
[ assorted nits ]
At the level of bikeshedding ... I quite dislike using the name "popcount"
for these functions. I'm aware that some C compilers provide primitives
of that name, but I wouldn't expect a SQL programmer to know that;
without that context the name seems pretty random and unintuitive.
Moreover, it invites confusion with SQL's use of "pop" to abbreviate
"population" in the statistical aggregates, such as var_pop().
I was thinking about that too, but according to
<https://en.wikipedia.org/wiki/Hamming_weight>, popcount is an accepted
high-level term, with "pop" also standing for "population".
On Tue, Jan 19, 2021 at 3:06 AM Peter Eisentraut
<peter.eisentraut@enterprisedb.com> wrote:
On 2021-01-18 16:34, Tom Lane wrote:
Peter Eisentraut <peter.eisentraut@enterprisedb.com> writes:
[ assorted nits ]
At the level of bikeshedding ... I quite dislike using the name "popcount"
for these functions. I'm aware that some C compilers provide primitives
of that name, but I wouldn't expect a SQL programmer to know that;
without that context the name seems pretty random and unintuitive.
Moreover, it invites confusion with SQL's use of "pop" to abbreviate
"population" in the statistical aggregates, such as var_pop().I was thinking about that too, but according to
<https://en.wikipedia.org/wiki/Hamming_weight>, popcount is an accepted
high-level term, with "pop" also standing for "population".
Yeah, I am not sure that it's going to be good to invent our own name
for this, although maybe. But at least I think we should make sure
there are some good comments in an easily discoverable place. Some
people seem to think every programmer in the universe should know what
things like popcount() and fls() and ffs() and stuff like that are,
but it's far from obvious and I often have to refresh my memory. Let's
make it easy for someone to figure out, if they don't know already.
Like just a comment that says "this returns the number of 1 bits in
the integer supplied as an argument" or something can save somebody a
lot of trouble.
--
Robert Haas
EDB: http://www.enterprisedb.com
On Tue, Jan 19, 2021 at 07:58:12AM -0500, Robert Haas wrote:
On Tue, Jan 19, 2021 at 3:06 AM Peter Eisentraut
<peter.eisentraut@enterprisedb.com> wrote:On 2021-01-18 16:34, Tom Lane wrote:
Peter Eisentraut <peter.eisentraut@enterprisedb.com> writes:
[ assorted nits ]
At the level of bikeshedding ... I quite dislike using the name "popcount"
for these functions. I'm aware that some C compilers provide primitives
of that name, but I wouldn't expect a SQL programmer to know that;
without that context the name seems pretty random and unintuitive.
Moreover, it invites confusion with SQL's use of "pop" to abbreviate
"population" in the statistical aggregates, such as var_pop().I was thinking about that too, but according to
<https://en.wikipedia.org/wiki/Hamming_weight>, popcount is an accepted
high-level term, with "pop" also standing for "population".Yeah, I am not sure that it's going to be good to invent our own
name for this, although maybe. But at least I think we should make
sure there are some good comments in an easily discoverable place.
Some people seem to think every programmer in the universe should
know what things like popcount() and fls() and ffs() and stuff like
that are, but it's far from obvious and I often have to refresh my
memory. Let's make it easy for someone to figure out, if they don't
know already.
I went with count_set_bits() for the next version, and I hope that
helps clarify what it does.
Like just a comment that says "this returns the number of 1 bits in
the integer supplied as an argument" or something can save somebody
a lot of trouble.
You bring up an excellent point, which is that our builtin functions
could use a lot more documentation directly to hand than they now
have. For example, there's a lot of needless ambiguity created by
function comments which leave it up in the air as to which positional
argument does what in functions like string_to_array, which take
multiple arguments. I'll try to get a patch in for the next CF with a
fix for that, and a separate one that doesn't put it on people to use
\df+ to find the comments we do provide. There have been proposals for
including an optional space for COMMENT ON in DDL, but I suspect that
those won't fly unless and until they make their way into the
standard.
Best,
David.
--
David Fetter <david(at)fetter(dot)org> http://fetter.org/
Phone: +1 415 235 3778
Remember to vote!
Consider donating to Postgres: http://www.postgresql.org/about/donate
On Tue, 19 Jan 2021 at 11:38, David Fetter <david@fetter.org> wrote:
You bring up an excellent point, which is that our builtin functions
could use a lot more documentation directly to hand than they now
have. For example, there's a lot of needless ambiguity created by
function comments which leave it up in the air as to which positional
argument does what in functions like string_to_array, which take
multiple arguments. I'll try to get a patch in for the next CF with a
fix for that, and a separate one that doesn't put it on people to use
\df+ to find the comments we do provide. There have been proposals for
including an optional space for COMMENT ON in DDL, but I suspect that
those won't fly unless and until they make their way into the
standard.
Since you mention \df+, I wonder if this is the time to consider removing
the source code from \df+ (since we have \sf) and adding in the proacl
instead?
On Tue, Jan 19, 2021 at 9:42 PM Isaac Morland <isaac.morland@gmail.com>
wrote:
On Tue, 19 Jan 2021 at 11:38, David Fetter <david@fetter.org> wrote:
You bring up an excellent point, which is that our builtin functions
could use a lot more documentation directly to hand than they now
have. For example, there's a lot of needless ambiguity created by
function comments which leave it up in the air as to which positional
argument does what in functions like string_to_array, which take
multiple arguments. I'll try to get a patch in for the next CF with a
fix for that, and a separate one that doesn't put it on people to use
\df+ to find the comments we do provide. There have been proposals for
including an optional space for COMMENT ON in DDL, but I suspect that
those won't fly unless and until they make their way into the
standard.Since you mention \df+, I wonder if this is the time to consider removing
the source code from \df+ (since we have \sf) and adding in the proacl
instead?The cf bot failed to apply the patch (v4-0001-popcount.patch) because of
the wrong "-p"
I have regenerated the patch, can you please take a look.
http://cfbot.cputube.org/patch_32_2917.log
=== applying patch ./v4-0001-popcount.patch
can't find file to patch at input line 21
Perhaps you used the wrong -p or --strip option?
The text leading up to this was:
--------------------------
--
Ibrar Ahmed
Attachments:
v5-0001-popcount.patchapplication/octet-stream; name=v5-0001-popcount.patchDownload
From 92f3d5bf1718b1b52a5a4d792f5c75e0bcc7fb74 Mon Sep 17 00:00:00 2001
From: David Fetter <david@fetter.org>
Date: Wed, 30 Dec 2020 02:51:46 -0800
Subject: [PATCH v4] popcount
To: hackers
MIME-Version: 1.0
Content-Type: multipart/mixed; boundary="------------2.29.2"
This is a multi-part message in MIME format.
--------------2.29.2
Content-Type: text/plain; charset=UTF-8; format=fixed
Content-Transfer-Encoding: 8bit
Now it's accessible to SQL for the BIT VARYING and BYTEA types.
diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index ece09699ef..45cfc22ae9 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -4030,6 +4030,23 @@ SELECT format('Testing %3$s, %2$s, %s', 'one', 'two', 'three');
</para></entry>
</row>
+ <row>
+ <entry role="func_table_entry"><para role="func_signature">
+ <indexterm>
+ <primary>count_set_bits</primary>
+ </indexterm>
+ <function>count_set_bits</function> ( <parameter>bytes</parameter> <type>bytea</type> )
+ <returnvalue>bigint</returnvalue>
+ </para>
+ <para>
+ Counts the number of bits set in a binary string.
+ </para>
+ <para>
+ <literal>count_set_bits('\xdeadbeef'::bytea)</literal>
+ <returnvalue>24</returnvalue>
+ </para></entry>
+ </row>
+
<row>
<entry role="func_table_entry"><para role="func_signature">
<indexterm>
@@ -4830,6 +4847,23 @@ SELECT format('Testing %3$s, %2$s, %s', 'one', 'two', 'three');
</para></entry>
</row>
+ <row>
+ <entry role="func_table_entry"><para role="func_signature">
+ <indexterm>
+ <primary>count_set_bits</primary>
+ </indexterm>
+ <function>count_set_bits</function> ( <parameter>bits</parameter> <type>bit</type> )
+ <returnvalue>bigint</returnvalue>
+ </para>
+ <para>
+ Counts the bits set in a bit string.
+ </para>
+ <para>
+ <literal>count_set_bits(B'101010101010101010')</literal>
+ <returnvalue>9</returnvalue>
+ </para></entry>
+ </row>
+
<row>
<entry role="func_table_entry"><para role="func_signature">
<indexterm>
@@ -4869,6 +4903,7 @@ SELECT format('Testing %3$s, %2$s, %s', 'one', 'two', 'three');
<returnvalue>101010001010101010</returnvalue>
</para></entry>
</row>
+
</tbody>
</tgroup>
</table>
diff --git a/src/backend/utils/adt/varbit.c b/src/backend/utils/adt/varbit.c
index 2235866244..c9c6c73422 100644
--- a/src/backend/utils/adt/varbit.c
+++ b/src/backend/utils/adt/varbit.c
@@ -36,6 +36,7 @@
#include "libpq/pqformat.h"
#include "nodes/nodeFuncs.h"
#include "nodes/supportnodes.h"
+#include "port/pg_bitutils.h"
#include "utils/array.h"
#include "utils/builtins.h"
#include "utils/varbit.h"
@@ -1878,3 +1879,30 @@ bitgetbit(PG_FUNCTION_ARGS)
else
PG_RETURN_INT32(0);
}
+
+/*
+ * bitpopcount
+ *
+ * Returns the number of bits set in a bit string.
+ *
+ */
+Datum
+bitpopcount(PG_FUNCTION_ARGS)
+{
+ /* There's really no chance of an overflow here because
+ * to get to INT64_MAX set bits, an object would have to be
+ * an exbibyte long, exceeding what PostgreSQL can currently
+ * store by a factor of 2^28
+ */
+ int64 popcount;
+ VarBit *arg1 = PG_GETARG_VARBIT_P(0);
+ bits8 *p;
+ int len;
+
+ p = VARBITS(arg1);
+ len = (VARBITLEN(arg1) + BITS_PER_BYTE - 1) / BITS_PER_BYTE;
+
+ popcount = pg_popcount((char *)p, len);
+
+ PG_RETURN_INT64(popcount);
+}
diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c
index 479ed9ae54..3f1179a0e8 100644
--- a/src/backend/utils/adt/varlena.c
+++ b/src/backend/utils/adt/varlena.c
@@ -3439,6 +3439,22 @@ bytea_overlay(bytea *t1, bytea *t2, int sp, int sl)
return result;
}
+/*
+ * popcount
+ */
+Datum
+byteapopcount(PG_FUNCTION_ARGS)
+{
+ bytea *t1 = PG_GETARG_BYTEA_PP(0);
+ int len;
+ int64 result;
+
+ len = VARSIZE_ANY_EXHDR(t1);
+ result = pg_popcount(VARDATA_ANY(t1), len);
+
+ PG_RETURN_INT64(result);
+}
+
/*
* byteapos -
* Return the position of the specified substring.
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index 506689d8ac..6f0ef32b87 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -1446,6 +1446,9 @@
{ oid => '752', descr => 'substitute portion of string',
proname => 'overlay', prorettype => 'bytea',
proargtypes => 'bytea bytea int4', prosrc => 'byteaoverlay_no_len' },
+{ oid => '8436', descr => 'count set bits',
+ proname => 'count_set_bits', prorettype => 'int8', proargtypes => 'bytea',
+ prosrc => 'byteapopcount'},
{ oid => '725',
proname => 'dist_pl', prorettype => 'float8', proargtypes => 'point line',
@@ -3876,6 +3879,9 @@
{ oid => '3033', descr => 'set bit',
proname => 'set_bit', prorettype => 'bit', proargtypes => 'bit int4 int4',
prosrc => 'bitsetbit' },
+{ oid => '8435', descr => 'count set bits',
+ proname => 'count_set_bits', prorettype => 'int8', proargtypes => 'bit',
+ prosrc => 'bitpopcount'},
# for macaddr type support
{ oid => '436', descr => 'I/O',
diff --git a/src/test/regress/expected/bit.out b/src/test/regress/expected/bit.out
index a7f95b846d..1187076a8d 100644
--- a/src/test/regress/expected/bit.out
+++ b/src/test/regress/expected/bit.out
@@ -710,6 +710,19 @@ SELECT overlay(B'0101011100' placing '001' from 20);
0101011100001
(1 row)
+-- Popcount
+SELECT count_set_bits(B'0101011100'::bit(10));
+ count_set_bits
+----------------
+ 5
+(1 row)
+
+SELECT count_set_bits(B'1111111111'::bit(10));
+ count_set_bits
+----------------
+ 10
+(1 row)
+
-- This table is intentionally left around to exercise pg_dump/pg_upgrade
CREATE TABLE bit_defaults(
b1 bit(4) DEFAULT '1001',
diff --git a/src/test/regress/expected/strings.out b/src/test/regress/expected/strings.out
index fb4573d85f..5f27522187 100644
--- a/src/test/regress/expected/strings.out
+++ b/src/test/regress/expected/strings.out
@@ -2227,3 +2227,10 @@ SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 5
Th\000o\x02\x03
(1 row)
+SET bytea_output TO hex;
+SELECT count_set_bits(E'\\xdeadbeef'::bytea);
+ count_set_bits
+----------------
+ 24
+(1 row)
+
diff --git a/src/test/regress/sql/bit.sql b/src/test/regress/sql/bit.sql
index ea01742c4a..8893d38b0d 100644
--- a/src/test/regress/sql/bit.sql
+++ b/src/test/regress/sql/bit.sql
@@ -215,6 +215,10 @@ SELECT overlay(B'0101011100' placing '101' from 6);
SELECT overlay(B'0101011100' placing '001' from 11);
SELECT overlay(B'0101011100' placing '001' from 20);
+-- Popcount
+SELECT count_set_bits(B'0101011100'::bit(10));
+SELECT count_set_bits(B'1111111111'::bit(10));
+
-- This table is intentionally left around to exercise pg_dump/pg_upgrade
CREATE TABLE bit_defaults(
b1 bit(4) DEFAULT '1001',
diff --git a/src/test/regress/sql/strings.sql b/src/test/regress/sql/strings.sql
index 57a48c9d0b..523f960d5b 100644
--- a/src/test/regress/sql/strings.sql
+++ b/src/test/regress/sql/strings.sql
@@ -742,3 +742,6 @@ SELECT btrim(E'\\000trim\\000'::bytea, ''::bytea);
SELECT encode(overlay(E'Th\\000omas'::bytea placing E'Th\\001omas'::bytea from 2),'escape');
SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 8),'escape');
SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 5 for 3),'escape');
+
+SET bytea_output TO hex;
+SELECT count_set_bits(E'\\xdeadbeef'::bytea);
Hi David,
Just a nitpick:
+SET bytea_output TO hex;
Since we don't see the string in the output, I don't immediately see the
reason to change the output format here?
Aside from that, this patch works as expected, and is ready for committer.
--
John Naylor
EDB: http://www.enterprisedb.com
On 18.03.21 13:51, John Naylor wrote:
Hi David,
Just a nitpick:
+SET bytea_output TO hex;
Since we don't see the string in the output, I don't immediately see the
reason to change the output format here?Aside from that, this patch works as expected, and is ready for committer.
I have now read the entire internet on what a suitable name for this
function could be. I think the emerging winner is BIT_COUNT(), which
already exists in MySQL, and also in Python (int.bit_count()) and Java
(Integer.bitCount()).
On Sat, Mar 20, 2021 at 09:01:25PM +0100, Peter Eisentraut wrote:
On 18.03.21 13:51, John Naylor wrote:
Hi David,
Just a nitpick:
+SET bytea_output TO hex;
Since we don't see the string in the output, I don't immediately see the
reason to change the output format here?
That's how I got it to work. If there's a way to make it go without
that, I'd be delighted to learn what it is :)
Aside from that, this patch works as expected, and is ready for committer.
I have now read the entire internet on what a suitable name for this
function could be. I think the emerging winner is BIT_COUNT(), which
already exists in MySQL, and also in Python (int.bit_count()) and Java
(Integer.bitCount()).
Thanks for doing this tedious work. Please find attached the next
version of the patch.
Best,
David.
--
David Fetter <david(at)fetter(dot)org> http://fetter.org/
Phone: +1 415 235 3778
Remember to vote!
Consider donating to Postgres: http://www.postgresql.org/about/donate
Attachments:
v5-0001-popcount.patchtext/x-diff; charset=us-asciiDownload
From 9483d41941b5daa44e46e6fb164846647d716406 Mon Sep 17 00:00:00 2001
From: David Fetter <david@fetter.org>
Date: Wed, 30 Dec 2020 02:51:46 -0800
Subject: [PATCH v5] popcount
To: hackers
MIME-Version: 1.0
Content-Type: multipart/mixed; boundary="------------2.30.2"
This is a multi-part message in MIME format.
--------------2.30.2
Content-Type: text/plain; charset=UTF-8; format=fixed
Content-Transfer-Encoding: 8bit
Now it's accessible to SQL for the BIT VARYING and BYTEA types.
diff --git doc/src/sgml/func.sgml doc/src/sgml/func.sgml
index 68fe6a95b4..066431fd3c 100644
--- doc/src/sgml/func.sgml
+++ doc/src/sgml/func.sgml
@@ -4030,6 +4030,23 @@ SELECT format('Testing %3$s, %2$s, %s', 'one', 'two', 'three');
</para></entry>
</row>
+ <row>
+ <entry role="func_table_entry"><para role="func_signature">
+ <indexterm>
+ <primary>bit_count</primary>
+ </indexterm>
+ <function>bit_count</function> ( <parameter>bytes</parameter> <type>bytea</type> )
+ <returnvalue>bigint</returnvalue>
+ </para>
+ <para>
+ Counts the number of bits set in a binary string.
+ </para>
+ <para>
+ <literal>bit_count('\xdeadbeef'::bytea)</literal>
+ <returnvalue>24</returnvalue>
+ </para></entry>
+ </row>
+
<row>
<entry role="func_table_entry"><para role="func_signature">
<indexterm>
@@ -4830,6 +4847,23 @@ SELECT format('Testing %3$s, %2$s, %s', 'one', 'two', 'three');
</para></entry>
</row>
+ <row>
+ <entry role="func_table_entry"><para role="func_signature">
+ <indexterm>
+ <primary>bit_count</primary>
+ </indexterm>
+ <function>bit_count</function> ( <parameter>bits</parameter> <type>bit</type> )
+ <returnvalue>bigint</returnvalue>
+ </para>
+ <para>
+ Counts the bits set in a bit string.
+ </para>
+ <para>
+ <literal>bit_count(B'101010101010101010')</literal>
+ <returnvalue>9</returnvalue>
+ </para></entry>
+ </row>
+
<row>
<entry role="func_table_entry"><para role="func_signature">
<indexterm>
@@ -4869,6 +4903,7 @@ SELECT format('Testing %3$s, %2$s, %s', 'one', 'two', 'three');
<returnvalue>101010001010101010</returnvalue>
</para></entry>
</row>
+
</tbody>
</tgroup>
</table>
diff --git src/include/catalog/pg_proc.dat src/include/catalog/pg_proc.dat
index e259531f60..feb00eccf9 100644
--- src/include/catalog/pg_proc.dat
+++ src/include/catalog/pg_proc.dat
@@ -1446,6 +1446,9 @@
{ oid => '752', descr => 'substitute portion of string',
proname => 'overlay', prorettype => 'bytea',
proargtypes => 'bytea bytea int4', prosrc => 'byteaoverlay_no_len' },
+{ oid => '8436', descr => 'count set bits',
+ proname => 'bit_count', prorettype => 'int8', proargtypes => 'bytea',
+ prosrc => 'byteapopcount'},
{ oid => '725',
proname => 'dist_pl', prorettype => 'float8', proargtypes => 'point line',
@@ -3876,6 +3879,9 @@
{ oid => '3033', descr => 'set bit',
proname => 'set_bit', prorettype => 'bit', proargtypes => 'bit int4 int4',
prosrc => 'bitsetbit' },
+{ oid => '8435', descr => 'count set bits',
+ proname => 'bit_count', prorettype => 'int8', proargtypes => 'bit',
+ prosrc => 'bitpopcount'},
# for macaddr type support
{ oid => '436', descr => 'I/O',
diff --git src/backend/utils/adt/varbit.c src/backend/utils/adt/varbit.c
index 2235866244..c9c6c73422 100644
--- src/backend/utils/adt/varbit.c
+++ src/backend/utils/adt/varbit.c
@@ -36,6 +36,7 @@
#include "libpq/pqformat.h"
#include "nodes/nodeFuncs.h"
#include "nodes/supportnodes.h"
+#include "port/pg_bitutils.h"
#include "utils/array.h"
#include "utils/builtins.h"
#include "utils/varbit.h"
@@ -1878,3 +1879,30 @@ bitgetbit(PG_FUNCTION_ARGS)
else
PG_RETURN_INT32(0);
}
+
+/*
+ * bitpopcount
+ *
+ * Returns the number of bits set in a bit string.
+ *
+ */
+Datum
+bitpopcount(PG_FUNCTION_ARGS)
+{
+ /* There's really no chance of an overflow here because
+ * to get to INT64_MAX set bits, an object would have to be
+ * an exbibyte long, exceeding what PostgreSQL can currently
+ * store by a factor of 2^28
+ */
+ int64 popcount;
+ VarBit *arg1 = PG_GETARG_VARBIT_P(0);
+ bits8 *p;
+ int len;
+
+ p = VARBITS(arg1);
+ len = (VARBITLEN(arg1) + BITS_PER_BYTE - 1) / BITS_PER_BYTE;
+
+ popcount = pg_popcount((char *)p, len);
+
+ PG_RETURN_INT64(popcount);
+}
diff --git src/backend/utils/adt/varlena.c src/backend/utils/adt/varlena.c
index 0bc345aa4d..95091887a9 100644
--- src/backend/utils/adt/varlena.c
+++ src/backend/utils/adt/varlena.c
@@ -3440,6 +3440,22 @@ bytea_overlay(bytea *t1, bytea *t2, int sp, int sl)
return result;
}
+/*
+ * popcount
+ */
+Datum
+byteapopcount(PG_FUNCTION_ARGS)
+{
+ bytea *t1 = PG_GETARG_BYTEA_PP(0);
+ int len;
+ int64 result;
+
+ len = VARSIZE_ANY_EXHDR(t1);
+ result = pg_popcount(VARDATA_ANY(t1), len);
+
+ PG_RETURN_INT64(result);
+}
+
/*
* byteapos -
* Return the position of the specified substring.
diff --git src/test/regress/expected/bit.out src/test/regress/expected/bit.out
index a7f95b846d..9b6b3d0c4f 100644
--- src/test/regress/expected/bit.out
+++ src/test/regress/expected/bit.out
@@ -710,6 +710,19 @@ SELECT overlay(B'0101011100' placing '001' from 20);
0101011100001
(1 row)
+-- Popcount
+SELECT bit_count(B'0101011100'::bit(10));
+ bit_count
+-----------
+ 5
+(1 row)
+
+SELECT bit_count(B'1111111111'::bit(10));
+ bit_count
+-----------
+ 10
+(1 row)
+
-- This table is intentionally left around to exercise pg_dump/pg_upgrade
CREATE TABLE bit_defaults(
b1 bit(4) DEFAULT '1001',
diff --git src/test/regress/expected/strings.out src/test/regress/expected/strings.out
index fb4573d85f..e8c6a99e9d 100644
--- src/test/regress/expected/strings.out
+++ src/test/regress/expected/strings.out
@@ -2227,3 +2227,10 @@ SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 5
Th\000o\x02\x03
(1 row)
+SET bytea_output TO hex;
+SELECT bit_count(E'\\xdeadbeef'::bytea);
+ bit_count
+-----------
+ 24
+(1 row)
+
diff --git src/test/regress/sql/bit.sql src/test/regress/sql/bit.sql
index ea01742c4a..271aa5ea3c 100644
--- src/test/regress/sql/bit.sql
+++ src/test/regress/sql/bit.sql
@@ -215,6 +215,10 @@ SELECT overlay(B'0101011100' placing '101' from 6);
SELECT overlay(B'0101011100' placing '001' from 11);
SELECT overlay(B'0101011100' placing '001' from 20);
+-- Popcount
+SELECT bit_count(B'0101011100'::bit(10));
+SELECT bit_count(B'1111111111'::bit(10));
+
-- This table is intentionally left around to exercise pg_dump/pg_upgrade
CREATE TABLE bit_defaults(
b1 bit(4) DEFAULT '1001',
diff --git src/test/regress/sql/strings.sql src/test/regress/sql/strings.sql
index 57a48c9d0b..3fb2d66a9f 100644
--- src/test/regress/sql/strings.sql
+++ src/test/regress/sql/strings.sql
@@ -742,3 +742,6 @@ SELECT btrim(E'\\000trim\\000'::bytea, ''::bytea);
SELECT encode(overlay(E'Th\\000omas'::bytea placing E'Th\\001omas'::bytea from 2),'escape');
SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 8),'escape');
SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 5 for 3),'escape');
+
+SET bytea_output TO hex;
+SELECT bit_count(E'\\xdeadbeef'::bytea);
--------------2.30.2--
On 21.03.21 02:31, David Fetter wrote:
I have now read the entire internet on what a suitable name for this
function could be. I think the emerging winner is BIT_COUNT(), which
already exists in MySQL, and also in Python (int.bit_count()) and Java
(Integer.bitCount()).Thanks for doing this tedious work. Please find attached the next
version of the patch.
committing, with some polishing
On Tue, Mar 23, 2021 at 10:51:08AM +0100, Peter Eisentraut wrote:
On 21.03.21 02:31, David Fetter wrote:
I have now read the entire internet on what a suitable name for this
function could be. I think the emerging winner is BIT_COUNT(), which
already exists in MySQL, and also in Python (int.bit_count()) and Java
(Integer.bitCount()).Thanks for doing this tedious work. Please find attached the next
version of the patch.committing, with some polishing
Thanks!
Best,
David.
--
David Fetter <david(at)fetter(dot)org> http://fetter.org/
Phone: +1 415 235 3778
Remember to vote!
Consider donating to Postgres: http://www.postgresql.org/about/donate