popcount

Started by David Fetterabout 5 years ago19 messages
#1David Fetter
david@fetter.org
1 attachment(s)

Hi,

Per request, I'd like to see about surfacing $Subject to SQL.

Best,
David.
--
David Fetter <david(at)fetter(dot)org> http://fetter.org/
Phone: +1 415 235 3778

Remember to vote!
Consider donating to Postgres: http://www.postgresql.org/about/donate

Attachments:

v1-0001-popcount.patchtext/x-diff; charset=us-asciiDownload
From 28c9b7acad605197a8a242ff929bcce6f3100c91 Mon Sep 17 00:00:00 2001
From: David Fetter <david@fetter.org>
Date: Wed, 30 Dec 2020 02:51:46 -0800
Subject: [PATCH v1] popcount
To: hackers
MIME-Version: 1.0
Content-Type: multipart/mixed; boundary="------------2.29.2"

This is a multi-part message in MIME format.
--------------2.29.2
Content-Type: text/plain; charset=UTF-8; format=fixed
Content-Transfer-Encoding: 8bit


Now it's accessible to SQL for the BIT VARYING and BYTEA types.

diff --git doc/src/sgml/func.sgml doc/src/sgml/func.sgml
index 5021ac1ca9..439a3a4a06 100644
--- doc/src/sgml/func.sgml
+++ doc/src/sgml/func.sgml
@@ -4069,6 +4069,23 @@ SELECT format('Testing %3$s, %2$s, %s', 'one', 'two', 'three');
        </para></entry>
       </row>
 
+      <row>
+       <entry role="func_table_entry"><para role="func_signature">
+        <indexterm>
+         <primary>popcount</primary>
+        </indexterm>
+        <function>popcount</function> ( <parameter>bytes</parameter> <type>bytea</type> )
+        <returnvalue>integer</returnvalue>
+       </para>
+       <para>
+        Counts the number of bits set in a binary string.
+       </para>
+       <para>
+        <literal>popcount('\xdeadbeef'::bytea)</literal>
+        <returnvalue>24</returnvalue>
+       </para></entry>
+      </row>
+
       <row>
        <entry role="func_table_entry"><para role="func_signature">
         <indexterm>
@@ -4830,6 +4847,24 @@ SELECT format('Testing %3$s, %2$s, %s', 'one', 'two', 'three');
         <returnvalue>101010001010101010</returnvalue>
        </para></entry>
       </row>
+
+      <row>
+       <entry role="func_table_entry"><para role="func_signature">
+        <indexterm>
+         <primary>popcount</primary>
+        </indexterm>
+        <function>popcount</function> ( <parameter>bits</parameter> <type>bit</type> )
+        <returnvalue>int4</returnvalue>
+       </para>
+       <para>
+        Counts the bits set in a bit string.
+       </para>
+       <para>
+        <literal>popcount(B'101010101010101010')</literal>
+        <returnvalue>9</returnvalue>
+       </para></entry>
+      </row>
+
      </tbody>
     </tgroup>
    </table>
diff --git src/include/catalog/pg_proc.dat src/include/catalog/pg_proc.dat
index 139f4a08bd..0e6d0636fa 100644
--- src/include/catalog/pg_proc.dat
+++ src/include/catalog/pg_proc.dat
@@ -1446,6 +1446,9 @@
 { oid => '752', descr => 'substitute portion of string',
   proname => 'overlay', prorettype => 'bytea',
   proargtypes => 'bytea bytea int4', prosrc => 'byteaoverlay_no_len' },
+{ oid => '8436', descr => 'count set bits',
+  proname => 'popcount', prorettype => 'int4', proargtypes => 'bytea',
+  prosrc => 'byteapopcount'},
 
 { oid => '725',
   proname => 'dist_pl', prorettype => 'float8', proargtypes => 'point line',
@@ -3865,6 +3868,9 @@
 { oid => '3033', descr => 'set bit',
   proname => 'set_bit', prorettype => 'bit', proargtypes => 'bit int4 int4',
   prosrc => 'bitsetbit' },
+{ oid => '8435', descr => 'count set bits',
+  proname => 'popcount', prorettype => 'int4', proargtypes => 'bit',
+  prosrc => 'bitpopcount'},
 
 # for macaddr type support
 { oid => '436', descr => 'I/O',
diff --git src/backend/utils/adt/varbit.c src/backend/utils/adt/varbit.c
index 3c03459f51..e92d1a7f8f 100644
--- src/backend/utils/adt/varbit.c
+++ src/backend/utils/adt/varbit.c
@@ -29,6 +29,8 @@
  *-------------------------------------------------------------------------
  */
 
+#include <math.h>
+
 #include "postgres.h"
 
 #include "access/htup_details.h"
@@ -36,6 +38,7 @@
 #include "libpq/pqformat.h"
 #include "nodes/nodeFuncs.h"
 #include "nodes/supportnodes.h"
+#include "port/pg_bitutils.h"
 #include "utils/array.h"
 #include "utils/builtins.h"
 #include "utils/varbit.h"
@@ -1872,3 +1875,24 @@ bitgetbit(PG_FUNCTION_ARGS)
 	else
 		PG_RETURN_INT32(0);
 }
+
+/*
+ * bitpopcount
+ *
+ * Returns the number of bits set in a bit array.
+ *
+ */
+Datum
+bitpopcount(PG_FUNCTION_ARGS)
+{
+	VarBit		*arg1 = PG_GETARG_VARBIT_P(0);
+	bits8		*p;
+	int			len, popcount;
+
+	len = ceil(VARBITLEN(arg1) / (float4)BITS_PER_BYTE);
+	p = VARBITS(arg1);
+
+	popcount = (int)pg_popcount((const char *)p, len);
+
+	PG_RETURN_INT32(popcount);
+}
diff --git src/backend/utils/adt/varlena.c src/backend/utils/adt/varlena.c
index 9300d19e0c..9e788babd3 100644
--- src/backend/utils/adt/varlena.c
+++ src/backend/utils/adt/varlena.c
@@ -3415,6 +3415,21 @@ bytea_overlay(bytea *t1, bytea *t2, int sp, int sl)
 	return result;
 }
 
+/*
+ * popcount
+ */
+Datum
+byteapopcount(PG_FUNCTION_ARGS)
+{
+	bytea	*t1 = PG_GETARG_BYTEA_PP(0);
+	int		len, result;
+
+	len = VARSIZE_ANY_EXHDR(t1);
+	result = pg_popcount(VARDATA_ANY(t1), len);
+
+	PG_RETURN_INT32(result);
+}
+
 /*
  * byteapos -
  *	  Return the position of the specified substring.
diff --git src/test/regress/expected/bit.out src/test/regress/expected/bit.out
index a1fab7ebcb..c91ad3d0d5 100644
--- src/test/regress/expected/bit.out
+++ src/test/regress/expected/bit.out
@@ -681,6 +681,19 @@ SELECT overlay(B'0101011100' placing '001' from 20);
  0101011100001
 (1 row)
 
+-- Popcount
+SELECT popcount(B'0101011100'::bit(10));
+ popcount 
+----------
+        5
+(1 row)
+
+SELECT popcount(B'1111111111'::bit(10));
+ popcount 
+----------
+       10
+(1 row)
+
 -- This table is intentionally left around to exercise pg_dump/pg_upgrade
 CREATE TABLE bit_defaults(
   b1 bit(4) DEFAULT '1001',
diff --git src/test/regress/expected/strings.out src/test/regress/expected/strings.out
index 298b6c48c2..9780639038 100644
--- src/test/regress/expected/strings.out
+++ src/test/regress/expected/strings.out
@@ -2126,3 +2126,10 @@ SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 5
  Th\000o\x02\x03
 (1 row)
 
+SET bytea_output TO hex;
+SELECT popcount(E'\\xdeadbeef'::bytea);
+ popcount 
+----------
+       24
+(1 row)
+
diff --git src/test/regress/sql/bit.sql src/test/regress/sql/bit.sql
index 7681d4ab4d..3cd9fb65d3 100644
--- src/test/regress/sql/bit.sql
+++ src/test/regress/sql/bit.sql
@@ -207,6 +207,10 @@ SELECT overlay(B'0101011100' placing '101' from 6);
 SELECT overlay(B'0101011100' placing '001' from 11);
 SELECT overlay(B'0101011100' placing '001' from 20);
 
+-- Popcount
+SELECT popcount(B'0101011100'::bit(10));
+SELECT popcount(B'1111111111'::bit(10));
+
 -- This table is intentionally left around to exercise pg_dump/pg_upgrade
 CREATE TABLE bit_defaults(
   b1 bit(4) DEFAULT '1001',
diff --git src/test/regress/sql/strings.sql src/test/regress/sql/strings.sql
index ad5221ab6b..a35e993ab0 100644
--- src/test/regress/sql/strings.sql
+++ src/test/regress/sql/strings.sql
@@ -717,3 +717,6 @@ SELECT btrim(E'\\000trim\\000'::bytea, ''::bytea);
 SELECT encode(overlay(E'Th\\000omas'::bytea placing E'Th\\001omas'::bytea from 2),'escape');
 SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 8),'escape');
 SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 5 for 3),'escape');
+
+SET bytea_output TO hex;
+SELECT popcount(E'\\xdeadbeef'::bytea);

--------------2.29.2--


#2Daniel Verite
daniel@manitou-mail.org
In reply to: David Fetter (#1)
Re: popcount

David Fetter wrote:

+Datum
+byteapopcount(PG_FUNCTION_ARGS)
+{
+	bytea	*t1 = PG_GETARG_BYTEA_PP(0);
+	int		len, result;
+
+	len = VARSIZE_ANY_EXHDR(t1);
+	result = pg_popcount(VARDATA_ANY(t1), len);
+
+	PG_RETURN_INT32(result);
+}

The input may have more than 2 billion bits set to 1. The biggest possible
result should be 8 billion for bytea (1 GB with all bits set to 1).
So shouldn't this function return an int8?

There is a pre-existing function in core: bit_length(bytea) returning int4,
but it errors out for large values (in fact it's a SQL function that returns
octet_length($1)*8).

For the varbit case, it doesn't seem like it can hold so many bits, although
I don't see the limit documented in
https://www.postgresql.org/docs/current/datatype-bit.html

Best regards,
--
Daniel Vérité
PostgreSQL-powered mailer: https://www.manitou-mail.org
Twitter: @DanielVerite

#3David Fetter
david@fetter.org
In reply to: Daniel Verite (#2)
1 attachment(s)
Re: popcount

On Wed, Dec 30, 2020 at 05:27:04PM +0100, Daniel Verite wrote:

David Fetter wrote:

+Datum
+byteapopcount(PG_FUNCTION_ARGS)
+{
+	bytea	*t1 = PG_GETARG_BYTEA_PP(0);
+	int		len, result;
+
+	len = VARSIZE_ANY_EXHDR(t1);
+	result = pg_popcount(VARDATA_ANY(t1), len);
+
+	PG_RETURN_INT32(result);
+}

The input may have more than 2 billion bits set to 1. The biggest possible
result should be 8 billion for bytea (1 GB with all bits set to 1).
So shouldn't this function return an int8?

It does now, and thanks for looking at this.

There is a pre-existing function in core: bit_length(bytea) returning int4,
but it errors out for large values (in fact it's a SQL function that returns
octet_length($1)*8).

For the varbit case, it doesn't seem like it can hold so many bits, although
I don't see the limit documented in
https://www.postgresql.org/docs/current/datatype-bit.html

Out of an abundance of caution, I changed that one, too.

I'd noticed earlier that ceil() doesn't need to be quite as wasteful
as the way I had it earlier, and fixed that with help from Andrew
Gierth.

Best,
David.
--
David Fetter <david(at)fetter(dot)org> http://fetter.org/
Phone: +1 415 235 3778

Remember to vote!
Consider donating to Postgres: http://www.postgresql.org/about/donate

Attachments:

v2-0001-popcount.patchtext/x-diff; charset=us-asciiDownload
From 6d35100b3bc16c1c7d3bf08c62589f3e039cee76 Mon Sep 17 00:00:00 2001
From: David Fetter <david@fetter.org>
Date: Wed, 30 Dec 2020 02:51:46 -0800
Subject: [PATCH v2] popcount
To: hackers
MIME-Version: 1.0
Content-Type: multipart/mixed; boundary="------------2.29.2"

This is a multi-part message in MIME format.
--------------2.29.2
Content-Type: text/plain; charset=UTF-8; format=fixed
Content-Transfer-Encoding: 8bit


Now it's accessible to SQL for the BIT VARYING and BYTEA types.

diff --git doc/src/sgml/func.sgml doc/src/sgml/func.sgml
index 5021ac1ca9..439a3a4a06 100644
--- doc/src/sgml/func.sgml
+++ doc/src/sgml/func.sgml
@@ -4069,6 +4069,23 @@ SELECT format('Testing %3$s, %2$s, %s', 'one', 'two', 'three');
        </para></entry>
       </row>
 
+      <row>
+       <entry role="func_table_entry"><para role="func_signature">
+        <indexterm>
+         <primary>popcount</primary>
+        </indexterm>
+        <function>popcount</function> ( <parameter>bytes</parameter> <type>bytea</type> )
+        <returnvalue>integer</returnvalue>
+       </para>
+       <para>
+        Counts the number of bits set in a binary string.
+       </para>
+       <para>
+        <literal>popcount('\xdeadbeef'::bytea)</literal>
+        <returnvalue>24</returnvalue>
+       </para></entry>
+      </row>
+
       <row>
        <entry role="func_table_entry"><para role="func_signature">
         <indexterm>
@@ -4830,6 +4847,24 @@ SELECT format('Testing %3$s, %2$s, %s', 'one', 'two', 'three');
         <returnvalue>101010001010101010</returnvalue>
        </para></entry>
       </row>
+
+      <row>
+       <entry role="func_table_entry"><para role="func_signature">
+        <indexterm>
+         <primary>popcount</primary>
+        </indexterm>
+        <function>popcount</function> ( <parameter>bits</parameter> <type>bit</type> )
+        <returnvalue>int4</returnvalue>
+       </para>
+       <para>
+        Counts the bits set in a bit string.
+       </para>
+       <para>
+        <literal>popcount(B'101010101010101010')</literal>
+        <returnvalue>9</returnvalue>
+       </para></entry>
+      </row>
+
      </tbody>
     </tgroup>
    </table>
diff --git src/include/catalog/pg_proc.dat src/include/catalog/pg_proc.dat
index 139f4a08bd..82e3d50114 100644
--- src/include/catalog/pg_proc.dat
+++ src/include/catalog/pg_proc.dat
@@ -1446,6 +1446,9 @@
 { oid => '752', descr => 'substitute portion of string',
   proname => 'overlay', prorettype => 'bytea',
   proargtypes => 'bytea bytea int4', prosrc => 'byteaoverlay_no_len' },
+{ oid => '8436', descr => 'count set bits',
+  proname => 'popcount', prorettype => 'int8', proargtypes => 'bytea',
+  prosrc => 'byteapopcount'},
 
 { oid => '725',
   proname => 'dist_pl', prorettype => 'float8', proargtypes => 'point line',
@@ -3865,6 +3868,9 @@
 { oid => '3033', descr => 'set bit',
   proname => 'set_bit', prorettype => 'bit', proargtypes => 'bit int4 int4',
   prosrc => 'bitsetbit' },
+{ oid => '8435', descr => 'count set bits',
+  proname => 'popcount', prorettype => 'int8', proargtypes => 'bit',
+  prosrc => 'bitpopcount'},
 
 # for macaddr type support
 { oid => '436', descr => 'I/O',
diff --git src/backend/utils/adt/varbit.c src/backend/utils/adt/varbit.c
index 3c03459f51..a71c2b6bf4 100644
--- src/backend/utils/adt/varbit.c
+++ src/backend/utils/adt/varbit.c
@@ -36,6 +36,7 @@
 #include "libpq/pqformat.h"
 #include "nodes/nodeFuncs.h"
 #include "nodes/supportnodes.h"
+#include "port/pg_bitutils.h"
 #include "utils/array.h"
 #include "utils/builtins.h"
 #include "utils/varbit.h"
@@ -1872,3 +1873,29 @@ bitgetbit(PG_FUNCTION_ARGS)
 	else
 		PG_RETURN_INT32(0);
 }
+
+/*
+ * bitpopcount
+ *
+ * Returns the number of bits set in a bit array.
+ *
+ */
+Datum
+bitpopcount(PG_FUNCTION_ARGS)
+{
+	VarBit		*arg1 = PG_GETARG_VARBIT_P(0);
+	bits8		*p;
+	int			len;
+	int8		popcount;
+
+	/*
+	 * ceil(VARBITLEN(ARG1)/BITS_PER_BYTE)
+	 * done to minimize branches and instructions.
+	 */
+	len = (VARBITLEN(arg1) + BITS_PER_BYTE + 1) / BITS_PER_BYTE;
+	p = VARBITS(arg1);
+
+	popcount = pg_popcount((const char *)p, len);
+
+	PG_RETURN_INT64(popcount);
+}
diff --git src/backend/utils/adt/varlena.c src/backend/utils/adt/varlena.c
index 9300d19e0c..9c5e5b5592 100644
--- src/backend/utils/adt/varlena.c
+++ src/backend/utils/adt/varlena.c
@@ -3415,6 +3415,22 @@ bytea_overlay(bytea *t1, bytea *t2, int sp, int sl)
 	return result;
 }
 
+/*
+ * popcount
+ */
+Datum
+byteapopcount(PG_FUNCTION_ARGS)
+{
+	bytea	*t1 = PG_GETARG_BYTEA_PP(0);
+	int		len;
+	int8	result;
+
+	len = VARSIZE_ANY_EXHDR(t1);
+	result = pg_popcount(VARDATA_ANY(t1), len);
+
+	PG_RETURN_INT64(result);
+}
+
 /*
  * byteapos -
  *	  Return the position of the specified substring.
diff --git src/test/regress/expected/bit.out src/test/regress/expected/bit.out
index a1fab7ebcb..c91ad3d0d5 100644
--- src/test/regress/expected/bit.out
+++ src/test/regress/expected/bit.out
@@ -681,6 +681,19 @@ SELECT overlay(B'0101011100' placing '001' from 20);
  0101011100001
 (1 row)
 
+-- Popcount
+SELECT popcount(B'0101011100'::bit(10));
+ popcount 
+----------
+        5
+(1 row)
+
+SELECT popcount(B'1111111111'::bit(10));
+ popcount 
+----------
+       10
+(1 row)
+
 -- This table is intentionally left around to exercise pg_dump/pg_upgrade
 CREATE TABLE bit_defaults(
   b1 bit(4) DEFAULT '1001',
diff --git src/test/regress/expected/strings.out src/test/regress/expected/strings.out
index 298b6c48c2..9780639038 100644
--- src/test/regress/expected/strings.out
+++ src/test/regress/expected/strings.out
@@ -2126,3 +2126,10 @@ SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 5
  Th\000o\x02\x03
 (1 row)
 
+SET bytea_output TO hex;
+SELECT popcount(E'\\xdeadbeef'::bytea);
+ popcount 
+----------
+       24
+(1 row)
+
diff --git src/test/regress/sql/bit.sql src/test/regress/sql/bit.sql
index 7681d4ab4d..3cd9fb65d3 100644
--- src/test/regress/sql/bit.sql
+++ src/test/regress/sql/bit.sql
@@ -207,6 +207,10 @@ SELECT overlay(B'0101011100' placing '101' from 6);
 SELECT overlay(B'0101011100' placing '001' from 11);
 SELECT overlay(B'0101011100' placing '001' from 20);
 
+-- Popcount
+SELECT popcount(B'0101011100'::bit(10));
+SELECT popcount(B'1111111111'::bit(10));
+
 -- This table is intentionally left around to exercise pg_dump/pg_upgrade
 CREATE TABLE bit_defaults(
   b1 bit(4) DEFAULT '1001',
diff --git src/test/regress/sql/strings.sql src/test/regress/sql/strings.sql
index ad5221ab6b..a35e993ab0 100644
--- src/test/regress/sql/strings.sql
+++ src/test/regress/sql/strings.sql
@@ -717,3 +717,6 @@ SELECT btrim(E'\\000trim\\000'::bytea, ''::bytea);
 SELECT encode(overlay(E'Th\\000omas'::bytea placing E'Th\\001omas'::bytea from 2),'escape');
 SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 8),'escape');
 SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 5 for 3),'escape');
+
+SET bytea_output TO hex;
+SELECT popcount(E'\\xdeadbeef'::bytea);

--------------2.29.2--


#4Peter Eisentraut
peter.eisentraut@enterprisedb.com
In reply to: David Fetter (#3)
Re: popcount

On 2020-12-30 17:41, David Fetter wrote:

The input may have more than 2 billion bits set to 1. The biggest possible
result should be 8 billion for bytea (1 GB with all bits set to 1).
So shouldn't this function return an int8?

It does now, and thanks for looking at this.

The documentation still reflects the previous int4 return type (in two
different spellings, too).

#5David Fetter
david@fetter.org
In reply to: Peter Eisentraut (#4)
1 attachment(s)
Re: popcount

On Mon, Jan 11, 2021 at 03:50:54PM +0100, Peter Eisentraut wrote:

On 2020-12-30 17:41, David Fetter wrote:

The input may have more than 2 billion bits set to 1. The biggest possible
result should be 8 billion for bytea (1 GB with all bits set to 1).
So shouldn't this function return an int8?

It does now, and thanks for looking at this.

The documentation still reflects the previous int4 return type (in two
different spellings, too).

Thanks for looking this over!

Please find attached the next version with corrected documentation.

Best,
David.
--
David Fetter <david(at)fetter(dot)org> http://fetter.org/
Phone: +1 415 235 3778

Remember to vote!
Consider donating to Postgres: http://www.postgresql.org/about/donate

Attachments:

v3-0001-popcount.patchtext/x-diff; charset=us-asciiDownload
From f0f8e0639a4d08db7d454d5181aa9d98d31264a3 Mon Sep 17 00:00:00 2001
From: David Fetter <david@fetter.org>
Date: Wed, 30 Dec 2020 02:51:46 -0800
Subject: [PATCH v3] popcount
To: hackers
MIME-Version: 1.0
Content-Type: multipart/mixed; boundary="------------2.29.2"

This is a multi-part message in MIME format.
--------------2.29.2
Content-Type: text/plain; charset=UTF-8; format=fixed
Content-Transfer-Encoding: 8bit


Now it's accessible to SQL for the BIT VARYING and BYTEA types.

diff --git doc/src/sgml/func.sgml doc/src/sgml/func.sgml
index 02a37658ad..1d86d610dd 100644
--- doc/src/sgml/func.sgml
+++ doc/src/sgml/func.sgml
@@ -4069,6 +4069,23 @@ SELECT format('Testing %3$s, %2$s, %s', 'one', 'two', 'three');
        </para></entry>
       </row>
 
+      <row>
+       <entry role="func_table_entry"><para role="func_signature">
+        <indexterm>
+         <primary>popcount</primary>
+        </indexterm>
+        <function>popcount</function> ( <parameter>bytes</parameter> <type>bytea</type> )
+        <returnvalue>bigint</returnvalue>
+       </para>
+       <para>
+        Counts the number of bits set in a binary string.
+       </para>
+       <para>
+        <literal>popcount('\xdeadbeef'::bytea)</literal>
+        <returnvalue>24</returnvalue>
+       </para></entry>
+      </row>
+
       <row>
        <entry role="func_table_entry"><para role="func_signature">
         <indexterm>
@@ -4830,6 +4847,24 @@ SELECT format('Testing %3$s, %2$s, %s', 'one', 'two', 'three');
         <returnvalue>101010001010101010</returnvalue>
        </para></entry>
       </row>
+
+      <row>
+       <entry role="func_table_entry"><para role="func_signature">
+        <indexterm>
+         <primary>popcount</primary>
+        </indexterm>
+        <function>popcount</function> ( <parameter>bits</parameter> <type>bit</type> )
+        <returnvalue>bigint</returnvalue>
+       </para>
+       <para>
+        Counts the bits set in a bit string.
+       </para>
+       <para>
+        <literal>popcount(B'101010101010101010')</literal>
+        <returnvalue>9</returnvalue>
+       </para></entry>
+      </row>
+
      </tbody>
     </tgroup>
    </table>
diff --git src/include/catalog/pg_proc.dat src/include/catalog/pg_proc.dat
index d7b55f57ea..2d53e0d46d 100644
--- src/include/catalog/pg_proc.dat
+++ src/include/catalog/pg_proc.dat
@@ -1446,6 +1446,9 @@
 { oid => '752', descr => 'substitute portion of string',
   proname => 'overlay', prorettype => 'bytea',
   proargtypes => 'bytea bytea int4', prosrc => 'byteaoverlay_no_len' },
+{ oid => '8436', descr => 'count set bits',
+  proname => 'popcount', prorettype => 'int8', proargtypes => 'bytea',
+  prosrc => 'byteapopcount'},
 
 { oid => '725',
   proname => 'dist_pl', prorettype => 'float8', proargtypes => 'point line',
@@ -3865,6 +3868,9 @@
 { oid => '3033', descr => 'set bit',
   proname => 'set_bit', prorettype => 'bit', proargtypes => 'bit int4 int4',
   prosrc => 'bitsetbit' },
+{ oid => '8435', descr => 'count set bits',
+  proname => 'popcount', prorettype => 'int8', proargtypes => 'bit',
+  prosrc => 'bitpopcount'},
 
 # for macaddr type support
 { oid => '436', descr => 'I/O',
diff --git src/backend/utils/adt/varbit.c src/backend/utils/adt/varbit.c
index 2235866244..a6a44f3f4f 100644
--- src/backend/utils/adt/varbit.c
+++ src/backend/utils/adt/varbit.c
@@ -36,6 +36,7 @@
 #include "libpq/pqformat.h"
 #include "nodes/nodeFuncs.h"
 #include "nodes/supportnodes.h"
+#include "port/pg_bitutils.h"
 #include "utils/array.h"
 #include "utils/builtins.h"
 #include "utils/varbit.h"
@@ -1878,3 +1879,29 @@ bitgetbit(PG_FUNCTION_ARGS)
 	else
 		PG_RETURN_INT32(0);
 }
+
+/*
+ * bitpopcount
+ *
+ * Returns the number of bits set in a bit array.
+ *
+ */
+Datum
+bitpopcount(PG_FUNCTION_ARGS)
+{
+	VarBit		*arg1 = PG_GETARG_VARBIT_P(0);
+	bits8		*p;
+	int			len;
+	int8		popcount;
+
+	/*
+	 * ceil(VARBITLEN(ARG1)/BITS_PER_BYTE)
+	 * done to minimize branches and instructions.
+	 */
+	len = (VARBITLEN(arg1) + BITS_PER_BYTE + 1) / BITS_PER_BYTE;
+	p = VARBITS(arg1);
+
+	popcount = pg_popcount((const char *)p, len);
+
+	PG_RETURN_INT64(popcount);
+}
diff --git src/backend/utils/adt/varlena.c src/backend/utils/adt/varlena.c
index 4838bfb4ff..da3ba769c4 100644
--- src/backend/utils/adt/varlena.c
+++ src/backend/utils/adt/varlena.c
@@ -3433,6 +3433,22 @@ bytea_overlay(bytea *t1, bytea *t2, int sp, int sl)
 	return result;
 }
 
+/*
+ * popcount
+ */
+Datum
+byteapopcount(PG_FUNCTION_ARGS)
+{
+	bytea	*t1 = PG_GETARG_BYTEA_PP(0);
+	int		len;
+	int8	result;
+
+	len = VARSIZE_ANY_EXHDR(t1);
+	result = pg_popcount(VARDATA_ANY(t1), len);
+
+	PG_RETURN_INT64(result);
+}
+
 /*
  * byteapos -
  *	  Return the position of the specified substring.
diff --git src/test/regress/expected/bit.out src/test/regress/expected/bit.out
index a7f95b846d..228f992ced 100644
--- src/test/regress/expected/bit.out
+++ src/test/regress/expected/bit.out
@@ -710,6 +710,19 @@ SELECT overlay(B'0101011100' placing '001' from 20);
  0101011100001
 (1 row)
 
+-- Popcount
+SELECT popcount(B'0101011100'::bit(10));
+ popcount 
+----------
+        5
+(1 row)
+
+SELECT popcount(B'1111111111'::bit(10));
+ popcount 
+----------
+       10
+(1 row)
+
 -- This table is intentionally left around to exercise pg_dump/pg_upgrade
 CREATE TABLE bit_defaults(
   b1 bit(4) DEFAULT '1001',
diff --git src/test/regress/expected/strings.out src/test/regress/expected/strings.out
index 595bd2446e..dd962e5f48 100644
--- src/test/regress/expected/strings.out
+++ src/test/regress/expected/strings.out
@@ -2167,3 +2167,10 @@ SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 5
  Th\000o\x02\x03
 (1 row)
 
+SET bytea_output TO hex;
+SELECT popcount(E'\\xdeadbeef'::bytea);
+ popcount 
+----------
+       24
+(1 row)
+
diff --git src/test/regress/sql/bit.sql src/test/regress/sql/bit.sql
index ea01742c4a..fa77bf45c4 100644
--- src/test/regress/sql/bit.sql
+++ src/test/regress/sql/bit.sql
@@ -215,6 +215,10 @@ SELECT overlay(B'0101011100' placing '101' from 6);
 SELECT overlay(B'0101011100' placing '001' from 11);
 SELECT overlay(B'0101011100' placing '001' from 20);
 
+-- Popcount
+SELECT popcount(B'0101011100'::bit(10));
+SELECT popcount(B'1111111111'::bit(10));
+
 -- This table is intentionally left around to exercise pg_dump/pg_upgrade
 CREATE TABLE bit_defaults(
   b1 bit(4) DEFAULT '1001',
diff --git src/test/regress/sql/strings.sql src/test/regress/sql/strings.sql
index ca465d050f..48964e71cb 100644
--- src/test/regress/sql/strings.sql
+++ src/test/regress/sql/strings.sql
@@ -728,3 +728,6 @@ SELECT btrim(E'\\000trim\\000'::bytea, ''::bytea);
 SELECT encode(overlay(E'Th\\000omas'::bytea placing E'Th\\001omas'::bytea from 2),'escape');
 SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 8),'escape');
 SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 5 for 3),'escape');
+
+SET bytea_output TO hex;
+SELECT popcount(E'\\xdeadbeef'::bytea);

--------------2.29.2--


#6Peter Eisentraut
peter.eisentraut@enterprisedb.com
In reply to: David Fetter (#5)
Re: popcount

On 2021-01-11 17:13, David Fetter wrote:

On Mon, Jan 11, 2021 at 03:50:54PM +0100, Peter Eisentraut wrote:

On 2020-12-30 17:41, David Fetter wrote:

The input may have more than 2 billion bits set to 1. The biggest possible
result should be 8 billion for bytea (1 GB with all bits set to 1).
So shouldn't this function return an int8?

It does now, and thanks for looking at this.

The documentation still reflects the previous int4 return type (in two
different spellings, too).

Thanks for looking this over!

Please find attached the next version with corrected documentation.

The documentation entries should be placed in alphabetical order in
their respective tables.

For the bytea function, maybe choose a simpler example that a reader can
compute in their head. Also for the test cases.

In varbit.c:

The comment says

+ * Returns the number of bits set in a bit array.

but it should be "bit string".

+ int8 popcount;

should be int64.

Also, pg_popcount() returns uint64, not int64. Perhaps overflow is not
possible here, but perhaps a small comment about why or an assertion
could be appropriate.

+ p = VARBITS(arg1);

Why not assign that in the declaration block as well?

This comment:

+   /*
+    * ceil(VARBITLEN(ARG1)/BITS_PER_BYTE)
+    * done to minimize branches and instructions.
+    */

I don't know what that is supposed to mean or why that kind of tuning
would be necessary for a user-callable function.

+ popcount = pg_popcount((const char *)p, len);

The "const" is probably not necessary.

byteapopcount() looks better, but also needs int8 -> int64.

#7Tom Lane
tgl@sss.pgh.pa.us
In reply to: Peter Eisentraut (#6)
Re: popcount

Peter Eisentraut <peter.eisentraut@enterprisedb.com> writes:

[ assorted nits ]

At the level of bikeshedding ... I quite dislike using the name "popcount"
for these functions. I'm aware that some C compilers provide primitives
of that name, but I wouldn't expect a SQL programmer to know that;
without that context the name seems pretty random and unintuitive.
Moreover, it invites confusion with SQL's use of "pop" to abbreviate
"population" in the statistical aggregates, such as var_pop().

Perhaps something along the lines of count_ones() or count_set_bits()
would be more apropos.

regards, tom lane

#8Daniel Verite
daniel@manitou-mail.org
In reply to: Peter Eisentraut (#6)
Re: popcount

Peter Eisentraut wrote:

+   /*
+    * ceil(VARBITLEN(ARG1)/BITS_PER_BYTE)
+    * done to minimize branches and instructions.
+    */

I don't know what that is supposed to mean or why that kind of tuning
would be necessary for a user-callable function.

Also, the formula just below looks wrong in the current patch (v3):

+ /*
+  * ceil(VARBITLEN(ARG1)/BITS_PER_BYTE)
+  * done to minimize branches and instructions.
+  */
+ len = (VARBITLEN(arg1) + BITS_PER_BYTE + 1) / BITS_PER_BYTE;
+ p = VARBITS(arg1);
+
+ popcount = pg_popcount((const char *)p, len);

It should be
len = (VARBITLEN(arg1) + BITS_PER_BYTE - 1) / BITS_PER_BYTE

If we add 1 to BITS_PER_BYTE instead of subtracting 1, when
VARBITLEN(arg1) is a multiple of BITS_PER_BYTE, "len" is one byte too
large.

The correct formula is already used in include/utils/varbit.h near the
definition of VARBITLEN:

#define VARBITTOTALLEN(BITLEN) (((BITLEN) + BITS_PER_BYTE-1)/BITS_PER_BYTE +
\
VARHDRSZ +
VARBITHDRSZ)

Best regards,
--
Daniel Vérité
PostgreSQL-powered mailer: https://www.manitou-mail.org
Twitter: @DanielVerite

#9David Fetter
david@fetter.org
In reply to: Tom Lane (#7)
1 attachment(s)
Re: popcount

On Mon, Jan 18, 2021 at 10:34:10AM -0500, Tom Lane wrote:

Peter Eisentraut <peter.eisentraut@enterprisedb.com> writes:

[ assorted nits ]

fixed, I think.

At the level of bikeshedding ... I quite dislike using the name "popcount"
for these functions. I'm aware that some C compilers provide primitives
of that name, but I wouldn't expect a SQL programmer to know that;
without that context the name seems pretty random and unintuitive.
Moreover, it invites confusion with SQL's use of "pop" to abbreviate
"population" in the statistical aggregates, such as var_pop().

Perhaps something along the lines of count_ones() or count_set_bits()
would be more apropos.

Done that way.

Best,
David.
--
David Fetter <david(at)fetter(dot)org> http://fetter.org/
Phone: +1 415 235 3778

Remember to vote!
Consider donating to Postgres: http://www.postgresql.org/about/donate

Attachments:

v4-0001-popcount.patchtext/x-diff; charset=us-asciiDownload
From 92f3d5bf1718b1b52a5a4d792f5c75e0bcc7fb74 Mon Sep 17 00:00:00 2001
From: David Fetter <david@fetter.org>
Date: Wed, 30 Dec 2020 02:51:46 -0800
Subject: [PATCH v4] popcount
To: hackers
MIME-Version: 1.0
Content-Type: multipart/mixed; boundary="------------2.29.2"

This is a multi-part message in MIME format.
--------------2.29.2
Content-Type: text/plain; charset=UTF-8; format=fixed
Content-Transfer-Encoding: 8bit


Now it's accessible to SQL for the BIT VARYING and BYTEA types.

diff --git doc/src/sgml/func.sgml doc/src/sgml/func.sgml
index aa99665e2e..7626edeee7 100644
--- doc/src/sgml/func.sgml
+++ doc/src/sgml/func.sgml
@@ -4030,6 +4030,23 @@ SELECT format('Testing %3$s, %2$s, %s', 'one', 'two', 'three');
       </para></entry>
      </row>
 
+      <row>
+       <entry role="func_table_entry"><para role="func_signature">
+        <indexterm>
+         <primary>count_set_bits</primary>
+        </indexterm>
+        <function>count_set_bits</function> ( <parameter>bytes</parameter> <type>bytea</type> )
+        <returnvalue>bigint</returnvalue>
+       </para>
+       <para>
+        Counts the number of bits set in a binary string.
+       </para>
+       <para>
+        <literal>count_set_bits('\xdeadbeef'::bytea)</literal>
+        <returnvalue>24</returnvalue>
+       </para></entry>
+      </row>
+
       <row>
        <entry role="func_table_entry"><para role="func_signature">
         <indexterm>
@@ -4830,6 +4847,23 @@ SELECT format('Testing %3$s, %2$s, %s', 'one', 'two', 'three');
        </para></entry>
       </row>
 
+      <row>
+       <entry role="func_table_entry"><para role="func_signature">
+        <indexterm>
+         <primary>count_set_bits</primary>
+        </indexterm>
+        <function>count_set_bits</function> ( <parameter>bits</parameter> <type>bit</type> )
+        <returnvalue>bigint</returnvalue>
+       </para>
+       <para>
+        Counts the bits set in a bit string.
+       </para>
+       <para>
+        <literal>count_set_bits(B'101010101010101010')</literal>
+        <returnvalue>9</returnvalue>
+       </para></entry>
+      </row>
+
       <row>
        <entry role="func_table_entry"><para role="func_signature">
         <indexterm>
@@ -4869,6 +4903,7 @@ SELECT format('Testing %3$s, %2$s, %s', 'one', 'two', 'three');
         <returnvalue>101010001010101010</returnvalue>
        </para></entry>
       </row>
+
      </tbody>
     </tgroup>
    </table>
diff --git src/include/catalog/pg_proc.dat src/include/catalog/pg_proc.dat
index b5f52d4e4a..416ee0c2fb 100644
--- src/include/catalog/pg_proc.dat
+++ src/include/catalog/pg_proc.dat
@@ -1446,6 +1446,9 @@
 { oid => '752', descr => 'substitute portion of string',
   proname => 'overlay', prorettype => 'bytea',
   proargtypes => 'bytea bytea int4', prosrc => 'byteaoverlay_no_len' },
+{ oid => '8436', descr => 'count set bits',
+  proname => 'count_set_bits', prorettype => 'int8', proargtypes => 'bytea',
+  prosrc => 'byteapopcount'},
 
 { oid => '725',
   proname => 'dist_pl', prorettype => 'float8', proargtypes => 'point line',
@@ -3865,6 +3868,9 @@
 { oid => '3033', descr => 'set bit',
   proname => 'set_bit', prorettype => 'bit', proargtypes => 'bit int4 int4',
   prosrc => 'bitsetbit' },
+{ oid => '8435', descr => 'count set bits',
+  proname => 'count_set_bits', prorettype => 'int8', proargtypes => 'bit',
+  prosrc => 'bitpopcount'},
 
 # for macaddr type support
 { oid => '436', descr => 'I/O',
diff --git src/backend/utils/adt/varbit.c src/backend/utils/adt/varbit.c
index 2235866244..c9c6c73422 100644
--- src/backend/utils/adt/varbit.c
+++ src/backend/utils/adt/varbit.c
@@ -36,6 +36,7 @@
 #include "libpq/pqformat.h"
 #include "nodes/nodeFuncs.h"
 #include "nodes/supportnodes.h"
+#include "port/pg_bitutils.h"
 #include "utils/array.h"
 #include "utils/builtins.h"
 #include "utils/varbit.h"
@@ -1878,3 +1879,30 @@ bitgetbit(PG_FUNCTION_ARGS)
 	else
 		PG_RETURN_INT32(0);
 }
+
+/*
+ * bitpopcount
+ *
+ * Returns the number of bits set in a bit string.
+ *
+ */
+Datum
+bitpopcount(PG_FUNCTION_ARGS)
+{
+	/* There's really no chance of an overflow here because
+	 * to get to INT64_MAX set bits, an object would have to be
+	 * an exbibyte long, exceeding what PostgreSQL can currently
+	 * store by a factor of 2^28
+	 */
+	int64		popcount;
+	VarBit		*arg1 = PG_GETARG_VARBIT_P(0);
+	bits8		*p;
+	int			len;
+
+	p = VARBITS(arg1);
+	len = (VARBITLEN(arg1) + BITS_PER_BYTE - 1) / BITS_PER_BYTE;
+
+	popcount = pg_popcount((char *)p, len);
+
+	PG_RETURN_INT64(popcount);
+}
diff --git src/backend/utils/adt/varlena.c src/backend/utils/adt/varlena.c
index 479ed9ae54..3f1179a0e8 100644
--- src/backend/utils/adt/varlena.c
+++ src/backend/utils/adt/varlena.c
@@ -3439,6 +3439,22 @@ bytea_overlay(bytea *t1, bytea *t2, int sp, int sl)
 	return result;
 }
 
+/*
+ * popcount
+ */
+Datum
+byteapopcount(PG_FUNCTION_ARGS)
+{
+	bytea	*t1 = PG_GETARG_BYTEA_PP(0);
+	int		len;
+	int64	result;
+
+	len = VARSIZE_ANY_EXHDR(t1);
+	result = pg_popcount(VARDATA_ANY(t1), len);
+
+	PG_RETURN_INT64(result);
+}
+
 /*
  * byteapos -
  *	  Return the position of the specified substring.
diff --git src/test/regress/expected/bit.out src/test/regress/expected/bit.out
index a7f95b846d..1187076a8d 100644
--- src/test/regress/expected/bit.out
+++ src/test/regress/expected/bit.out
@@ -710,6 +710,19 @@ SELECT overlay(B'0101011100' placing '001' from 20);
  0101011100001
 (1 row)
 
+-- Popcount
+SELECT count_set_bits(B'0101011100'::bit(10));
+ count_set_bits 
+----------------
+              5
+(1 row)
+
+SELECT count_set_bits(B'1111111111'::bit(10));
+ count_set_bits 
+----------------
+             10
+(1 row)
+
 -- This table is intentionally left around to exercise pg_dump/pg_upgrade
 CREATE TABLE bit_defaults(
   b1 bit(4) DEFAULT '1001',
diff --git src/test/regress/expected/strings.out src/test/regress/expected/strings.out
index 7c91afa6e4..e94e76000b 100644
--- src/test/regress/expected/strings.out
+++ src/test/regress/expected/strings.out
@@ -2179,3 +2179,10 @@ SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 5
  Th\000o\x02\x03
 (1 row)
 
+SET bytea_output TO hex;
+SELECT count_set_bits(E'\\xdeadbeef'::bytea);
+ count_set_bits 
+----------------
+             24
+(1 row)
+
diff --git src/test/regress/sql/bit.sql src/test/regress/sql/bit.sql
index ea01742c4a..8893d38b0d 100644
--- src/test/regress/sql/bit.sql
+++ src/test/regress/sql/bit.sql
@@ -215,6 +215,10 @@ SELECT overlay(B'0101011100' placing '101' from 6);
 SELECT overlay(B'0101011100' placing '001' from 11);
 SELECT overlay(B'0101011100' placing '001' from 20);
 
+-- Popcount
+SELECT count_set_bits(B'0101011100'::bit(10));
+SELECT count_set_bits(B'1111111111'::bit(10));
+
 -- This table is intentionally left around to exercise pg_dump/pg_upgrade
 CREATE TABLE bit_defaults(
   b1 bit(4) DEFAULT '1001',
diff --git src/test/regress/sql/strings.sql src/test/regress/sql/strings.sql
index ef4bfb008a..a4f7879b16 100644
--- src/test/regress/sql/strings.sql
+++ src/test/regress/sql/strings.sql
@@ -730,3 +730,6 @@ SELECT btrim(E'\\000trim\\000'::bytea, ''::bytea);
 SELECT encode(overlay(E'Th\\000omas'::bytea placing E'Th\\001omas'::bytea from 2),'escape');
 SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 8),'escape');
 SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 5 for 3),'escape');
+
+SET bytea_output TO hex;
+SELECT count_set_bits(E'\\xdeadbeef'::bytea);

--------------2.29.2--


#10Peter Eisentraut
peter.eisentraut@enterprisedb.com
In reply to: Tom Lane (#7)
Re: popcount

On 2021-01-18 16:34, Tom Lane wrote:

Peter Eisentraut <peter.eisentraut@enterprisedb.com> writes:

[ assorted nits ]

At the level of bikeshedding ... I quite dislike using the name "popcount"
for these functions. I'm aware that some C compilers provide primitives
of that name, but I wouldn't expect a SQL programmer to know that;
without that context the name seems pretty random and unintuitive.
Moreover, it invites confusion with SQL's use of "pop" to abbreviate
"population" in the statistical aggregates, such as var_pop().

I was thinking about that too, but according to
<https://en.wikipedia.org/wiki/Hamming_weight&gt;, popcount is an accepted
high-level term, with "pop" also standing for "population".

#11Robert Haas
robertmhaas@gmail.com
In reply to: Peter Eisentraut (#10)
Re: popcount

On Tue, Jan 19, 2021 at 3:06 AM Peter Eisentraut
<peter.eisentraut@enterprisedb.com> wrote:

On 2021-01-18 16:34, Tom Lane wrote:

Peter Eisentraut <peter.eisentraut@enterprisedb.com> writes:

[ assorted nits ]

At the level of bikeshedding ... I quite dislike using the name "popcount"
for these functions. I'm aware that some C compilers provide primitives
of that name, but I wouldn't expect a SQL programmer to know that;
without that context the name seems pretty random and unintuitive.
Moreover, it invites confusion with SQL's use of "pop" to abbreviate
"population" in the statistical aggregates, such as var_pop().

I was thinking about that too, but according to
<https://en.wikipedia.org/wiki/Hamming_weight&gt;, popcount is an accepted
high-level term, with "pop" also standing for "population".

Yeah, I am not sure that it's going to be good to invent our own name
for this, although maybe. But at least I think we should make sure
there are some good comments in an easily discoverable place. Some
people seem to think every programmer in the universe should know what
things like popcount() and fls() and ffs() and stuff like that are,
but it's far from obvious and I often have to refresh my memory. Let's
make it easy for someone to figure out, if they don't know already.
Like just a comment that says "this returns the number of 1 bits in
the integer supplied as an argument" or something can save somebody a
lot of trouble.

--
Robert Haas
EDB: http://www.enterprisedb.com

#12David Fetter
david@fetter.org
In reply to: Robert Haas (#11)
Re: popcount

On Tue, Jan 19, 2021 at 07:58:12AM -0500, Robert Haas wrote:

On Tue, Jan 19, 2021 at 3:06 AM Peter Eisentraut
<peter.eisentraut@enterprisedb.com> wrote:

On 2021-01-18 16:34, Tom Lane wrote:

Peter Eisentraut <peter.eisentraut@enterprisedb.com> writes:

[ assorted nits ]

At the level of bikeshedding ... I quite dislike using the name "popcount"
for these functions. I'm aware that some C compilers provide primitives
of that name, but I wouldn't expect a SQL programmer to know that;
without that context the name seems pretty random and unintuitive.
Moreover, it invites confusion with SQL's use of "pop" to abbreviate
"population" in the statistical aggregates, such as var_pop().

I was thinking about that too, but according to
<https://en.wikipedia.org/wiki/Hamming_weight&gt;, popcount is an accepted
high-level term, with "pop" also standing for "population".

Yeah, I am not sure that it's going to be good to invent our own
name for this, although maybe. But at least I think we should make
sure there are some good comments in an easily discoverable place.
Some people seem to think every programmer in the universe should
know what things like popcount() and fls() and ffs() and stuff like
that are, but it's far from obvious and I often have to refresh my
memory. Let's make it easy for someone to figure out, if they don't
know already.

I went with count_set_bits() for the next version, and I hope that
helps clarify what it does.

Like just a comment that says "this returns the number of 1 bits in
the integer supplied as an argument" or something can save somebody
a lot of trouble.

You bring up an excellent point, which is that our builtin functions
could use a lot more documentation directly to hand than they now
have. For example, there's a lot of needless ambiguity created by
function comments which leave it up in the air as to which positional
argument does what in functions like string_to_array, which take
multiple arguments. I'll try to get a patch in for the next CF with a
fix for that, and a separate one that doesn't put it on people to use
\df+ to find the comments we do provide. There have been proposals for
including an optional space for COMMENT ON in DDL, but I suspect that
those won't fly unless and until they make their way into the
standard.

Best,
David.
--
David Fetter <david(at)fetter(dot)org> http://fetter.org/
Phone: +1 415 235 3778

Remember to vote!
Consider donating to Postgres: http://www.postgresql.org/about/donate

#13Isaac Morland
isaac.morland@gmail.com
In reply to: David Fetter (#12)
Re: popcount

On Tue, 19 Jan 2021 at 11:38, David Fetter <david@fetter.org> wrote:

You bring up an excellent point, which is that our builtin functions
could use a lot more documentation directly to hand than they now
have. For example, there's a lot of needless ambiguity created by
function comments which leave it up in the air as to which positional
argument does what in functions like string_to_array, which take
multiple arguments. I'll try to get a patch in for the next CF with a
fix for that, and a separate one that doesn't put it on people to use
\df+ to find the comments we do provide. There have been proposals for
including an optional space for COMMENT ON in DDL, but I suspect that
those won't fly unless and until they make their way into the
standard.

Since you mention \df+, I wonder if this is the time to consider removing
the source code from \df+ (since we have \sf) and adding in the proacl
instead?

#14Ibrar Ahmed
ibrar.ahmad@gmail.com
In reply to: Isaac Morland (#13)
1 attachment(s)
Re: popcount

On Tue, Jan 19, 2021 at 9:42 PM Isaac Morland <isaac.morland@gmail.com>
wrote:

On Tue, 19 Jan 2021 at 11:38, David Fetter <david@fetter.org> wrote:

You bring up an excellent point, which is that our builtin functions
could use a lot more documentation directly to hand than they now
have. For example, there's a lot of needless ambiguity created by
function comments which leave it up in the air as to which positional
argument does what in functions like string_to_array, which take
multiple arguments. I'll try to get a patch in for the next CF with a
fix for that, and a separate one that doesn't put it on people to use
\df+ to find the comments we do provide. There have been proposals for
including an optional space for COMMENT ON in DDL, but I suspect that
those won't fly unless and until they make their way into the
standard.

Since you mention \df+, I wonder if this is the time to consider removing
the source code from \df+ (since we have \sf) and adding in the proacl
instead?

The cf bot failed to apply the patch (v4-0001-popcount.patch) because of

the wrong "-p"
I have regenerated the patch, can you please take a look.

http://cfbot.cputube.org/patch_32_2917.log

=== applying patch ./v4-0001-popcount.patch
can't find file to patch at input line 21
Perhaps you used the wrong -p or --strip option?
The text leading up to this was:
--------------------------

--
Ibrar Ahmed

Attachments:

v5-0001-popcount.patchapplication/octet-stream; name=v5-0001-popcount.patchDownload
From 92f3d5bf1718b1b52a5a4d792f5c75e0bcc7fb74 Mon Sep 17 00:00:00 2001
From: David Fetter <david@fetter.org>
Date: Wed, 30 Dec 2020 02:51:46 -0800
Subject: [PATCH v4] popcount
To: hackers
MIME-Version: 1.0
Content-Type: multipart/mixed; boundary="------------2.29.2"

This is a multi-part message in MIME format.
--------------2.29.2
Content-Type: text/plain; charset=UTF-8; format=fixed
Content-Transfer-Encoding: 8bit


Now it's accessible to SQL for the BIT VARYING and BYTEA types.

diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index ece09699ef..45cfc22ae9 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -4030,6 +4030,23 @@ SELECT format('Testing %3$s, %2$s, %s', 'one', 'two', 'three');
       </para></entry>
      </row>
 
+      <row>
+       <entry role="func_table_entry"><para role="func_signature">
+        <indexterm>
+         <primary>count_set_bits</primary>
+        </indexterm>
+        <function>count_set_bits</function> ( <parameter>bytes</parameter> <type>bytea</type> )
+        <returnvalue>bigint</returnvalue>
+       </para>
+       <para>
+        Counts the number of bits set in a binary string.
+       </para>
+       <para>
+        <literal>count_set_bits('\xdeadbeef'::bytea)</literal>
+        <returnvalue>24</returnvalue>
+       </para></entry>
+      </row>
+
       <row>
        <entry role="func_table_entry"><para role="func_signature">
         <indexterm>
@@ -4830,6 +4847,23 @@ SELECT format('Testing %3$s, %2$s, %s', 'one', 'two', 'three');
        </para></entry>
       </row>
 
+      <row>
+       <entry role="func_table_entry"><para role="func_signature">
+        <indexterm>
+         <primary>count_set_bits</primary>
+        </indexterm>
+        <function>count_set_bits</function> ( <parameter>bits</parameter> <type>bit</type> )
+        <returnvalue>bigint</returnvalue>
+       </para>
+       <para>
+        Counts the bits set in a bit string.
+       </para>
+       <para>
+        <literal>count_set_bits(B'101010101010101010')</literal>
+        <returnvalue>9</returnvalue>
+       </para></entry>
+      </row>
+
       <row>
        <entry role="func_table_entry"><para role="func_signature">
         <indexterm>
@@ -4869,6 +4903,7 @@ SELECT format('Testing %3$s, %2$s, %s', 'one', 'two', 'three');
         <returnvalue>101010001010101010</returnvalue>
        </para></entry>
       </row>
+
      </tbody>
     </tgroup>
    </table>
diff --git a/src/backend/utils/adt/varbit.c b/src/backend/utils/adt/varbit.c
index 2235866244..c9c6c73422 100644
--- a/src/backend/utils/adt/varbit.c
+++ b/src/backend/utils/adt/varbit.c
@@ -36,6 +36,7 @@
 #include "libpq/pqformat.h"
 #include "nodes/nodeFuncs.h"
 #include "nodes/supportnodes.h"
+#include "port/pg_bitutils.h"
 #include "utils/array.h"
 #include "utils/builtins.h"
 #include "utils/varbit.h"
@@ -1878,3 +1879,30 @@ bitgetbit(PG_FUNCTION_ARGS)
 	else
 		PG_RETURN_INT32(0);
 }
+
+/*
+ * bitpopcount
+ *
+ * Returns the number of bits set in a bit string.
+ *
+ */
+Datum
+bitpopcount(PG_FUNCTION_ARGS)
+{
+	/* There's really no chance of an overflow here because
+	 * to get to INT64_MAX set bits, an object would have to be
+	 * an exbibyte long, exceeding what PostgreSQL can currently
+	 * store by a factor of 2^28
+	 */
+	int64		popcount;
+	VarBit		*arg1 = PG_GETARG_VARBIT_P(0);
+	bits8		*p;
+	int			len;
+
+	p = VARBITS(arg1);
+	len = (VARBITLEN(arg1) + BITS_PER_BYTE - 1) / BITS_PER_BYTE;
+
+	popcount = pg_popcount((char *)p, len);
+
+	PG_RETURN_INT64(popcount);
+}
diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c
index 479ed9ae54..3f1179a0e8 100644
--- a/src/backend/utils/adt/varlena.c
+++ b/src/backend/utils/adt/varlena.c
@@ -3439,6 +3439,22 @@ bytea_overlay(bytea *t1, bytea *t2, int sp, int sl)
 	return result;
 }
 
+/*
+ * popcount
+ */
+Datum
+byteapopcount(PG_FUNCTION_ARGS)
+{
+	bytea	*t1 = PG_GETARG_BYTEA_PP(0);
+	int		len;
+	int64	result;
+
+	len = VARSIZE_ANY_EXHDR(t1);
+	result = pg_popcount(VARDATA_ANY(t1), len);
+
+	PG_RETURN_INT64(result);
+}
+
 /*
  * byteapos -
  *	  Return the position of the specified substring.
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index 506689d8ac..6f0ef32b87 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -1446,6 +1446,9 @@
 { oid => '752', descr => 'substitute portion of string',
   proname => 'overlay', prorettype => 'bytea',
   proargtypes => 'bytea bytea int4', prosrc => 'byteaoverlay_no_len' },
+{ oid => '8436', descr => 'count set bits',
+  proname => 'count_set_bits', prorettype => 'int8', proargtypes => 'bytea',
+  prosrc => 'byteapopcount'},
 
 { oid => '725',
   proname => 'dist_pl', prorettype => 'float8', proargtypes => 'point line',
@@ -3876,6 +3879,9 @@
 { oid => '3033', descr => 'set bit',
   proname => 'set_bit', prorettype => 'bit', proargtypes => 'bit int4 int4',
   prosrc => 'bitsetbit' },
+{ oid => '8435', descr => 'count set bits',
+  proname => 'count_set_bits', prorettype => 'int8', proargtypes => 'bit',
+  prosrc => 'bitpopcount'},
 
 # for macaddr type support
 { oid => '436', descr => 'I/O',
diff --git a/src/test/regress/expected/bit.out b/src/test/regress/expected/bit.out
index a7f95b846d..1187076a8d 100644
--- a/src/test/regress/expected/bit.out
+++ b/src/test/regress/expected/bit.out
@@ -710,6 +710,19 @@ SELECT overlay(B'0101011100' placing '001' from 20);
  0101011100001
 (1 row)
 
+-- Popcount
+SELECT count_set_bits(B'0101011100'::bit(10));
+ count_set_bits 
+----------------
+              5
+(1 row)
+
+SELECT count_set_bits(B'1111111111'::bit(10));
+ count_set_bits 
+----------------
+             10
+(1 row)
+
 -- This table is intentionally left around to exercise pg_dump/pg_upgrade
 CREATE TABLE bit_defaults(
   b1 bit(4) DEFAULT '1001',
diff --git a/src/test/regress/expected/strings.out b/src/test/regress/expected/strings.out
index fb4573d85f..5f27522187 100644
--- a/src/test/regress/expected/strings.out
+++ b/src/test/regress/expected/strings.out
@@ -2227,3 +2227,10 @@ SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 5
  Th\000o\x02\x03
 (1 row)
 
+SET bytea_output TO hex;
+SELECT count_set_bits(E'\\xdeadbeef'::bytea);
+ count_set_bits 
+----------------
+             24
+(1 row)
+
diff --git a/src/test/regress/sql/bit.sql b/src/test/regress/sql/bit.sql
index ea01742c4a..8893d38b0d 100644
--- a/src/test/regress/sql/bit.sql
+++ b/src/test/regress/sql/bit.sql
@@ -215,6 +215,10 @@ SELECT overlay(B'0101011100' placing '101' from 6);
 SELECT overlay(B'0101011100' placing '001' from 11);
 SELECT overlay(B'0101011100' placing '001' from 20);
 
+-- Popcount
+SELECT count_set_bits(B'0101011100'::bit(10));
+SELECT count_set_bits(B'1111111111'::bit(10));
+
 -- This table is intentionally left around to exercise pg_dump/pg_upgrade
 CREATE TABLE bit_defaults(
   b1 bit(4) DEFAULT '1001',
diff --git a/src/test/regress/sql/strings.sql b/src/test/regress/sql/strings.sql
index 57a48c9d0b..523f960d5b 100644
--- a/src/test/regress/sql/strings.sql
+++ b/src/test/regress/sql/strings.sql
@@ -742,3 +742,6 @@ SELECT btrim(E'\\000trim\\000'::bytea, ''::bytea);
 SELECT encode(overlay(E'Th\\000omas'::bytea placing E'Th\\001omas'::bytea from 2),'escape');
 SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 8),'escape');
 SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 5 for 3),'escape');
+
+SET bytea_output TO hex;
+SELECT count_set_bits(E'\\xdeadbeef'::bytea);
#15John Naylor
john.naylor@enterprisedb.com
In reply to: Ibrar Ahmed (#14)
Re: popcount

Hi David,

Just a nitpick:

+SET bytea_output TO hex;

Since we don't see the string in the output, I don't immediately see the
reason to change the output format here?

Aside from that, this patch works as expected, and is ready for committer.
--
John Naylor
EDB: http://www.enterprisedb.com

#16Peter Eisentraut
peter.eisentraut@enterprisedb.com
In reply to: John Naylor (#15)
Re: popcount

On 18.03.21 13:51, John Naylor wrote:

Hi David,

Just a nitpick:

+SET bytea_output TO hex;

Since we don't see the string in the output, I don't immediately see the
reason to change the output format here?

Aside from that, this patch works as expected, and is ready for committer.

I have now read the entire internet on what a suitable name for this
function could be. I think the emerging winner is BIT_COUNT(), which
already exists in MySQL, and also in Python (int.bit_count()) and Java
(Integer.bitCount()).

#17David Fetter
david@fetter.org
In reply to: Peter Eisentraut (#16)
1 attachment(s)
Re: popcount

On Sat, Mar 20, 2021 at 09:01:25PM +0100, Peter Eisentraut wrote:

On 18.03.21 13:51, John Naylor wrote:

Hi David,

Just a nitpick:

+SET bytea_output TO hex;

Since we don't see the string in the output, I don't immediately see the
reason to change the output format here?

That's how I got it to work. If there's a way to make it go without
that, I'd be delighted to learn what it is :)

Aside from that, this patch works as expected, and is ready for committer.

I have now read the entire internet on what a suitable name for this
function could be. I think the emerging winner is BIT_COUNT(), which
already exists in MySQL, and also in Python (int.bit_count()) and Java
(Integer.bitCount()).

Thanks for doing this tedious work. Please find attached the next
version of the patch.

Best,
David.
--
David Fetter <david(at)fetter(dot)org> http://fetter.org/
Phone: +1 415 235 3778

Remember to vote!
Consider donating to Postgres: http://www.postgresql.org/about/donate

Attachments:

v5-0001-popcount.patchtext/x-diff; charset=us-asciiDownload
From 9483d41941b5daa44e46e6fb164846647d716406 Mon Sep 17 00:00:00 2001
From: David Fetter <david@fetter.org>
Date: Wed, 30 Dec 2020 02:51:46 -0800
Subject: [PATCH v5] popcount
To: hackers
MIME-Version: 1.0
Content-Type: multipart/mixed; boundary="------------2.30.2"

This is a multi-part message in MIME format.
--------------2.30.2
Content-Type: text/plain; charset=UTF-8; format=fixed
Content-Transfer-Encoding: 8bit


Now it's accessible to SQL for the BIT VARYING and BYTEA types.

diff --git doc/src/sgml/func.sgml doc/src/sgml/func.sgml
index 68fe6a95b4..066431fd3c 100644
--- doc/src/sgml/func.sgml
+++ doc/src/sgml/func.sgml
@@ -4030,6 +4030,23 @@ SELECT format('Testing %3$s, %2$s, %s', 'one', 'two', 'three');
       </para></entry>
      </row>
 
+      <row>
+       <entry role="func_table_entry"><para role="func_signature">
+        <indexterm>
+         <primary>bit_count</primary>
+        </indexterm>
+        <function>bit_count</function> ( <parameter>bytes</parameter> <type>bytea</type> )
+        <returnvalue>bigint</returnvalue>
+       </para>
+       <para>
+        Counts the number of bits set in a binary string.
+       </para>
+       <para>
+        <literal>bit_count('\xdeadbeef'::bytea)</literal>
+        <returnvalue>24</returnvalue>
+       </para></entry>
+      </row>
+
       <row>
        <entry role="func_table_entry"><para role="func_signature">
         <indexterm>
@@ -4830,6 +4847,23 @@ SELECT format('Testing %3$s, %2$s, %s', 'one', 'two', 'three');
        </para></entry>
       </row>
 
+      <row>
+       <entry role="func_table_entry"><para role="func_signature">
+        <indexterm>
+         <primary>bit_count</primary>
+        </indexterm>
+        <function>bit_count</function> ( <parameter>bits</parameter> <type>bit</type> )
+        <returnvalue>bigint</returnvalue>
+       </para>
+       <para>
+        Counts the bits set in a bit string.
+       </para>
+       <para>
+        <literal>bit_count(B'101010101010101010')</literal>
+        <returnvalue>9</returnvalue>
+       </para></entry>
+      </row>
+
       <row>
        <entry role="func_table_entry"><para role="func_signature">
         <indexterm>
@@ -4869,6 +4903,7 @@ SELECT format('Testing %3$s, %2$s, %s', 'one', 'two', 'three');
         <returnvalue>101010001010101010</returnvalue>
        </para></entry>
       </row>
+
      </tbody>
     </tgroup>
    </table>
diff --git src/include/catalog/pg_proc.dat src/include/catalog/pg_proc.dat
index e259531f60..feb00eccf9 100644
--- src/include/catalog/pg_proc.dat
+++ src/include/catalog/pg_proc.dat
@@ -1446,6 +1446,9 @@
 { oid => '752', descr => 'substitute portion of string',
   proname => 'overlay', prorettype => 'bytea',
   proargtypes => 'bytea bytea int4', prosrc => 'byteaoverlay_no_len' },
+{ oid => '8436', descr => 'count set bits',
+  proname => 'bit_count', prorettype => 'int8', proargtypes => 'bytea',
+  prosrc => 'byteapopcount'},
 
 { oid => '725',
   proname => 'dist_pl', prorettype => 'float8', proargtypes => 'point line',
@@ -3876,6 +3879,9 @@
 { oid => '3033', descr => 'set bit',
   proname => 'set_bit', prorettype => 'bit', proargtypes => 'bit int4 int4',
   prosrc => 'bitsetbit' },
+{ oid => '8435', descr => 'count set bits',
+  proname => 'bit_count', prorettype => 'int8', proargtypes => 'bit',
+  prosrc => 'bitpopcount'},
 
 # for macaddr type support
 { oid => '436', descr => 'I/O',
diff --git src/backend/utils/adt/varbit.c src/backend/utils/adt/varbit.c
index 2235866244..c9c6c73422 100644
--- src/backend/utils/adt/varbit.c
+++ src/backend/utils/adt/varbit.c
@@ -36,6 +36,7 @@
 #include "libpq/pqformat.h"
 #include "nodes/nodeFuncs.h"
 #include "nodes/supportnodes.h"
+#include "port/pg_bitutils.h"
 #include "utils/array.h"
 #include "utils/builtins.h"
 #include "utils/varbit.h"
@@ -1878,3 +1879,30 @@ bitgetbit(PG_FUNCTION_ARGS)
 	else
 		PG_RETURN_INT32(0);
 }
+
+/*
+ * bitpopcount
+ *
+ * Returns the number of bits set in a bit string.
+ *
+ */
+Datum
+bitpopcount(PG_FUNCTION_ARGS)
+{
+	/* There's really no chance of an overflow here because
+	 * to get to INT64_MAX set bits, an object would have to be
+	 * an exbibyte long, exceeding what PostgreSQL can currently
+	 * store by a factor of 2^28
+	 */
+	int64		popcount;
+	VarBit		*arg1 = PG_GETARG_VARBIT_P(0);
+	bits8		*p;
+	int			len;
+
+	p = VARBITS(arg1);
+	len = (VARBITLEN(arg1) + BITS_PER_BYTE - 1) / BITS_PER_BYTE;
+
+	popcount = pg_popcount((char *)p, len);
+
+	PG_RETURN_INT64(popcount);
+}
diff --git src/backend/utils/adt/varlena.c src/backend/utils/adt/varlena.c
index 0bc345aa4d..95091887a9 100644
--- src/backend/utils/adt/varlena.c
+++ src/backend/utils/adt/varlena.c
@@ -3440,6 +3440,22 @@ bytea_overlay(bytea *t1, bytea *t2, int sp, int sl)
 	return result;
 }
 
+/*
+ * popcount
+ */
+Datum
+byteapopcount(PG_FUNCTION_ARGS)
+{
+	bytea	*t1 = PG_GETARG_BYTEA_PP(0);
+	int		len;
+	int64	result;
+
+	len = VARSIZE_ANY_EXHDR(t1);
+	result = pg_popcount(VARDATA_ANY(t1), len);
+
+	PG_RETURN_INT64(result);
+}
+
 /*
  * byteapos -
  *	  Return the position of the specified substring.
diff --git src/test/regress/expected/bit.out src/test/regress/expected/bit.out
index a7f95b846d..9b6b3d0c4f 100644
--- src/test/regress/expected/bit.out
+++ src/test/regress/expected/bit.out
@@ -710,6 +710,19 @@ SELECT overlay(B'0101011100' placing '001' from 20);
  0101011100001
 (1 row)
 
+-- Popcount
+SELECT bit_count(B'0101011100'::bit(10));
+ bit_count 
+-----------
+         5
+(1 row)
+
+SELECT bit_count(B'1111111111'::bit(10));
+ bit_count 
+-----------
+        10
+(1 row)
+
 -- This table is intentionally left around to exercise pg_dump/pg_upgrade
 CREATE TABLE bit_defaults(
   b1 bit(4) DEFAULT '1001',
diff --git src/test/regress/expected/strings.out src/test/regress/expected/strings.out
index fb4573d85f..e8c6a99e9d 100644
--- src/test/regress/expected/strings.out
+++ src/test/regress/expected/strings.out
@@ -2227,3 +2227,10 @@ SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 5
  Th\000o\x02\x03
 (1 row)
 
+SET bytea_output TO hex;
+SELECT bit_count(E'\\xdeadbeef'::bytea);
+ bit_count 
+-----------
+        24
+(1 row)
+
diff --git src/test/regress/sql/bit.sql src/test/regress/sql/bit.sql
index ea01742c4a..271aa5ea3c 100644
--- src/test/regress/sql/bit.sql
+++ src/test/regress/sql/bit.sql
@@ -215,6 +215,10 @@ SELECT overlay(B'0101011100' placing '101' from 6);
 SELECT overlay(B'0101011100' placing '001' from 11);
 SELECT overlay(B'0101011100' placing '001' from 20);
 
+-- Popcount
+SELECT bit_count(B'0101011100'::bit(10));
+SELECT bit_count(B'1111111111'::bit(10));
+
 -- This table is intentionally left around to exercise pg_dump/pg_upgrade
 CREATE TABLE bit_defaults(
   b1 bit(4) DEFAULT '1001',
diff --git src/test/regress/sql/strings.sql src/test/regress/sql/strings.sql
index 57a48c9d0b..3fb2d66a9f 100644
--- src/test/regress/sql/strings.sql
+++ src/test/regress/sql/strings.sql
@@ -742,3 +742,6 @@ SELECT btrim(E'\\000trim\\000'::bytea, ''::bytea);
 SELECT encode(overlay(E'Th\\000omas'::bytea placing E'Th\\001omas'::bytea from 2),'escape');
 SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 8),'escape');
 SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 5 for 3),'escape');
+
+SET bytea_output TO hex;
+SELECT bit_count(E'\\xdeadbeef'::bytea);

--------------2.30.2--


#18Peter Eisentraut
peter.eisentraut@enterprisedb.com
In reply to: David Fetter (#17)
Re: popcount

On 21.03.21 02:31, David Fetter wrote:

I have now read the entire internet on what a suitable name for this
function could be. I think the emerging winner is BIT_COUNT(), which
already exists in MySQL, and also in Python (int.bit_count()) and Java
(Integer.bitCount()).

Thanks for doing this tedious work. Please find attached the next
version of the patch.

committing, with some polishing

#19David Fetter
david@fetter.org
In reply to: Peter Eisentraut (#18)
Re: popcount

On Tue, Mar 23, 2021 at 10:51:08AM +0100, Peter Eisentraut wrote:

On 21.03.21 02:31, David Fetter wrote:

I have now read the entire internet on what a suitable name for this
function could be. I think the emerging winner is BIT_COUNT(), which
already exists in MySQL, and also in Python (int.bit_count()) and Java
(Integer.bitCount()).

Thanks for doing this tedious work. Please find attached the next
version of the patch.

committing, with some polishing

Thanks!

Best,
David.
--
David Fetter <david(at)fetter(dot)org> http://fetter.org/
Phone: +1 415 235 3778

Remember to vote!
Consider donating to Postgres: http://www.postgresql.org/about/donate