Speedup usages of pg_*toa() functions

Started by David Rowleyover 5 years ago19 messages
#1David Rowley
dgrowleyml@gmail.com
1 attachment(s)

Hi,

pg_itoa, pg_ltoa and pg_lltoa all have access to the length of the
string that is produced in the function by way of the "len" variable.
These functions don't have a great deal of use in core, but it seems
that most callers do require the len but end up getting it via
strlen(). It seems we could optimise this a little if we just had the
functions return the length instead of making callers do the work
themselves.

This allows us to speed up a few cases. int2vectorout() should be
faster and int8out() becomes a bit faster if we get rid of the
strdup() call and replace it with a palloc()/memcpy() call.

The slight drawback that I can see from this is that on testing
int4out() it gets slightly slower, which I assume is because I'm now
returning the length, but there's no use for it in that function.

create table bi (a bigint);
insert into bi select generate_Series(1,10000000);
vacuum freeze analyze bi;

bench.sql = copy bi to '/dev/null';

BIGINT test

drowley@amd3990x:~$ pgbench -n -f bench.sql -T 120 postgres

Master: latency average = 1791.597 ms
Patched: latency average = 1705.322 ms (95.184%)

INT test

create table i (a int);
insert into i select generate_Series(1,10000000);
vacuum freeze analyze i;

bench.sql = copy i to '/dev/null';

drowley@amd3990x:~$ pgbench -n -f bench.sql -T 120 postgres

Master: latency average = 1631.956 ms
Patched: latency average = 1678.626 ms (102.859%)

As you can see, this squeezes about 5% extra out of a copy of a 10
million row bigint table but costs us almost 3% on an equivalent int
table. A likely workaround for that is moving the functions into the
header file and making them static inline. It would be nice not to
have to do that though.

These tests were done on modern AMD hardware. I've not tested yet on
anything intel based.

I've copied in Andrew as I know he only recently rewrote these
functions and Andres since he did mention this in [1]/messages/by-id/20190920051857.2fhnvhvx4qdddviz@alap3.anarazel.de.

I'm interested to know if that int4out regression exists on other hardware.

David

[1]: /messages/by-id/20190920051857.2fhnvhvx4qdddviz@alap3.anarazel.de

Attachments:

return_strlen_num_in_numutils.patchapplication/octet-stream; name=return_strlen_num_in_numutils.patchDownload
diff --git a/src/backend/access/common/printsimple.c b/src/backend/access/common/printsimple.c
index 0f0b54bdae..df27700df9 100644
--- a/src/backend/access/common/printsimple.c
+++ b/src/backend/access/common/printsimple.c
@@ -103,9 +103,10 @@ printsimple(TupleTableSlot *slot, DestReceiver *self)
 				{
 					int32		num = DatumGetInt32(value);
 					char		str[12];	/* sign, 10 digits and '\0' */
+					int			len;
 
-					pg_ltoa(num, str);
-					pq_sendcountedtext(&buf, str, strlen(str), false);
+					len = pg_ltoa(num, str);
+					pq_sendcountedtext(&buf, str, len, false);
 				}
 				break;
 
@@ -113,9 +114,10 @@ printsimple(TupleTableSlot *slot, DestReceiver *self)
 				{
 					int64		num = DatumGetInt64(value);
 					char		str[MAXINT8LEN + 1];
+					int			len;
 
-					pg_lltoa(num, str);
-					pq_sendcountedtext(&buf, str, strlen(str), false);
+					len = pg_lltoa(num, str);
+					pq_sendcountedtext(&buf, str, len, false);
 				}
 				break;
 
diff --git a/src/backend/utils/adt/int.c b/src/backend/utils/adt/int.c
index 63c59c56b3..418c13e1b4 100644
--- a/src/backend/utils/adt/int.c
+++ b/src/backend/utils/adt/int.c
@@ -191,9 +191,7 @@ int2vectorout(PG_FUNCTION_ARGS)
 	{
 		if (num != 0)
 			*rp++ = ' ';
-		pg_itoa(int2Array->values[num], rp);
-		while (*++rp != '\0')
-			;
+		rp += pg_itoa(int2Array->values[num], rp);
 	}
 	*rp = '\0';
 	PG_RETURN_CSTRING(result);
diff --git a/src/backend/utils/adt/int8.c b/src/backend/utils/adt/int8.c
index abba8f1df0..0f59c004cc 100644
--- a/src/backend/utils/adt/int8.c
+++ b/src/backend/utils/adt/int8.c
@@ -149,9 +149,12 @@ int8out(PG_FUNCTION_ARGS)
 	int64		val = PG_GETARG_INT64(0);
 	char		buf[MAXINT8LEN + 1];
 	char	   *result;
+	int			len;
 
-	pg_lltoa(val, buf);
-	result = pstrdup(buf);
+	len = pg_lltoa(val, buf) + 1;
+	/* avoid strlen overhead of pstrdup and pnstrdup by copying manually */
+	result = palloc(len);
+	memcpy(result, buf, len);
 	PG_RETURN_CSTRING(result);
 }
 
diff --git a/src/backend/utils/adt/numutils.c b/src/backend/utils/adt/numutils.c
index f4f76845a7..84fc743573 100644
--- a/src/backend/utils/adt/numutils.c
+++ b/src/backend/utils/adt/numutils.c
@@ -327,16 +327,17 @@ invalid_syntax:
 
 /*
  * pg_itoa: converts a signed 16-bit integer to its string representation
+ * and returns strlen(a).
  *
  * Caller must ensure that 'a' points to enough memory to hold the result
  * (at least 7 bytes, counting a leading sign and trailing NUL).
  *
  * It doesn't seem worth implementing this separately.
  */
-void
+int
 pg_itoa(int16 i, char *a)
 {
-	pg_ltoa((int32) i, a);
+	return pg_ltoa((int32) i, a);
 }
 
 /*
@@ -404,13 +405,14 @@ pg_ultoa_n(uint32 value, char *a)
 }
 
 /*
- * NUL-terminate the output of pg_ultoa_n.
+ * pg_ltoa: converts a signed 32-bit integer to its string representation and
+ * returns strlen(a).
  *
  * It is the caller's responsibility to ensure that a is at least 12 bytes long,
  * which is enough room to hold a minus sign, a maximally long int32, and the
  * above terminating NUL.
  */
-void
+int
 pg_ltoa(int32 value, char *a)
 {
 
@@ -421,9 +423,13 @@ pg_ltoa(int32 value, char *a)
 	{
 		uvalue = (uint32) 0 - uvalue;
 		*a++ = '-';
+		len = pg_ultoa_n(uvalue, a);
+		a[len] = '\0';
+		return len + 1;
 	}
 	len = pg_ultoa_n(uvalue, a);
 	a[len] = '\0';
+	return len;
 }
 
 /*
@@ -512,12 +518,13 @@ pg_ulltoa_n(uint64 value, char *a)
 }
 
 /*
- * pg_lltoa: convert a signed 64-bit integer to its string representation
+ * pg_lltoa: converts a signed 64-bit integer to its string representation and
+ * returns strlen(a).
  *
  * Caller must ensure that 'a' points to enough memory to hold the result
  * (at least MAXINT8LEN + 1 bytes, counting a leading sign and trailing NUL).
  */
-void
+int
 pg_lltoa(int64 value, char *a)
 {
 	int			len;
@@ -527,9 +534,14 @@ pg_lltoa(int64 value, char *a)
 	{
 		*a++ = '-';
 		uvalue = (uint64) 0 - uvalue;
+		len = pg_ulltoa_n(uvalue, a);
+		a[len] = '\0';
+		return len + 1;
 	}
+
 	len = pg_ulltoa_n(uvalue, a);
-	a[len] = 0;
+	a[len] = '\0';
+	return len;
 }
 
 
diff --git a/src/include/utils/builtins.h b/src/include/utils/builtins.h
index a352a8b773..019c150f22 100644
--- a/src/include/utils/builtins.h
+++ b/src/include/utils/builtins.h
@@ -47,11 +47,11 @@ extern int	namestrcmp(Name name, const char *str);
 extern int32 pg_atoi(const char *s, int size, int c);
 extern int16 pg_strtoint16(const char *s);
 extern int32 pg_strtoint32(const char *s);
-extern void pg_itoa(int16 i, char *a);
+extern int pg_itoa(int16 i, char *a);
 int			pg_ultoa_n(uint32 l, char *a);
 int			pg_ulltoa_n(uint64 l, char *a);
-extern void pg_ltoa(int32 l, char *a);
-extern void pg_lltoa(int64 ll, char *a);
+extern int pg_ltoa(int32 l, char *a);
+extern int pg_lltoa(int64 ll, char *a);
 extern char *pg_ultostr_zeropad(char *str, uint32 value, int32 minwidth);
 extern char *pg_ultostr(char *str, uint32 value);
 extern uint64 pg_strtouint64(const char *str, char **endptr, int base);
#2Andrew Gierth
andrew@tao11.riddles.org.uk
In reply to: David Rowley (#1)
Re: Speedup usages of pg_*toa() functions

"David" == David Rowley <dgrowleyml@gmail.com> writes:

David> As you can see, this squeezes about 5% extra out of a copy of a
David> 10 million row bigint table but costs us almost 3% on an
David> equivalent int table.

And once again I have to issue the reminder: you can have gains or
losses of several percent on microbenchmarks of this kind just by
touching unrelated pieces of code that are never used in the test. In
order to demonstrate a consistent difference, you have to do each set of
tests multiple times, with random amounts of padding added to some
unrelated part of the code.

--
Andrew (irc:RhodiumToad)

#3David Rowley
dgrowleyml@gmail.com
In reply to: Andrew Gierth (#2)
Re: Speedup usages of pg_*toa() functions

On Tue, 9 Jun 2020 at 19:24, Andrew Gierth <andrew@tao11.riddles.org.uk> wrote:

"David" == David Rowley <dgrowleyml@gmail.com> writes:

David> As you can see, this squeezes about 5% extra out of a copy of a
David> 10 million row bigint table but costs us almost 3% on an
David> equivalent int table.

And once again I have to issue the reminder: you can have gains or
losses of several percent on microbenchmarks of this kind just by
touching unrelated pieces of code that are never used in the test. In
order to demonstrate a consistent difference, you have to do each set of
tests multiple times, with random amounts of padding added to some
unrelated part of the code.

Thanks for the reminder.

Instead of that, I tried with clang 10.0.0. I was previously using gcc 9.3.

BIGINT test

Master: latency average = 1842.182 ms
Patched: latency average = 1715.418 ms

INT test

Master: latency average = 1650.583 ms
Patched: latency average = 1617.783 ms

There's nothing in the patch that makes the INT test faster, so I
guess that's noise. The BIGINT test is about 7.3% faster in this
case.

David

#4Andrew Gierth
andrew@tao11.riddles.org.uk
In reply to: David Rowley (#1)
Re: Speedup usages of pg_*toa() functions

"David" == David Rowley <dgrowleyml@gmail.com> writes:

David> This allows us to speed up a few cases. int2vectorout() should
David> be faster and int8out() becomes a bit faster if we get rid of
David> the strdup() call and replace it with a palloc()/memcpy() call.

What about removing the memcpy entirely? I don't think we save anything
much useful here by pallocing the exact length, rather than doing what
int4out does and palloc a fixed size and convert the int directly into
it.

i.e.

Datum
int8out(PG_FUNCTION_ARGS)
{
int64 val = PG_GETARG_INT64(0);
char *result = palloc(MAXINT8LEN + 1);

pg_lltoa(val, result);
PG_RETURN_CSTRING(result);
}

For pg_ltoa, etc., I don't like adding the extra call to pg_ultoa_n - at
least on my clang, that results in two copies of pg_ultoa_n inlined.
How about doing it like,

int
pg_lltoa(int64 value, char *a)
{
int len = 0;
uint64 uvalue = value;

if (value < 0)
{
uvalue = (uint64) 0 - uvalue;
a[len++] = '-';
}
len += pg_ulltoa_n(uvalue, a + len);
a[len] = '\0';
return len;
}

--
Andrew (irc:RhodiumToad)

#5David Rowley
dgrowleyml@gmail.com
In reply to: Andrew Gierth (#4)
Re: Speedup usages of pg_*toa() functions

On Tue, 9 Jun 2020 at 22:08, Andrew Gierth <andrew@tao11.riddles.org.uk> wrote:

"David" == David Rowley <dgrowleyml@gmail.com> writes:

David> This allows us to speed up a few cases. int2vectorout() should
David> be faster and int8out() becomes a bit faster if we get rid of
David> the strdup() call and replace it with a palloc()/memcpy() call.

What about removing the memcpy entirely? I don't think we save anything
much useful here by pallocing the exact length, rather than doing what
int4out does and palloc a fixed size and convert the int directly into
it.

On looking back through git blame, it seems int2out and int4out have
been that way since at least 1996, before int8.c existed. int8out has
been doing it since fa838876e9f -- Include 8-byte integer type. dated
1998. Quite likely the larger than required palloc size back then was
more of a concern. So perhaps you're right about just doing it that
way instead. With that and the ints I tested with, the int8
performance should be about aligned to int4 performance.

For pg_ltoa, etc., I don't like adding the extra call to pg_ultoa_n - at
least on my clang, that results in two copies of pg_ultoa_n inlined.
How about doing it like,

int
pg_lltoa(int64 value, char *a)
{
int len = 0;
uint64 uvalue = value;

if (value < 0)
{
uvalue = (uint64) 0 - uvalue;
a[len++] = '-';
}
len += pg_ulltoa_n(uvalue, a + len);
a[len] = '\0';
return len;
}

Agreed, that seems better.

David

#6Ranier Vilela
ranier.vf@gmail.com
In reply to: David Rowley (#5)
Re: Speedup usages of pg_*toa() functions

Em ter., 9 de jun. de 2020 às 07:55, David Rowley <dgrowleyml@gmail.com>
escreveu:

On Tue, 9 Jun 2020 at 22:08, Andrew Gierth <andrew@tao11.riddles.org.uk>
wrote:

"David" == David Rowley <dgrowleyml@gmail.com> writes:

David> This allows us to speed up a few cases. int2vectorout() should
David> be faster and int8out() becomes a bit faster if we get rid of
David> the strdup() call and replace it with a palloc()/memcpy() call.

What about removing the memcpy entirely? I don't think we save anything
much useful here by pallocing the exact length, rather than doing what
int4out does and palloc a fixed size and convert the int directly into
it.

On looking back through git blame, it seems int2out and int4out have
been that way since at least 1996, before int8.c existed. int8out has
been doing it since fa838876e9f -- Include 8-byte integer type. dated
1998. Quite likely the larger than required palloc size back then was
more of a concern. So perhaps you're right about just doing it that
way instead. With that and the ints I tested with, the int8
performance should be about aligned to int4 performance.

For pg_ltoa, etc., I don't like adding the extra call to pg_ultoa_n - at
least on my clang, that results in two copies of pg_ultoa_n inlined.
How about doing it like,

int
pg_lltoa(int64 value, char *a)
{
int len = 0;
uint64 uvalue = value;

if (value < 0)
{
uvalue = (uint64) 0 - uvalue;
a[len++] = '-';
}
len += pg_ulltoa_n(uvalue, a + len);
a[len] = '\0';
return len;
}

Written like that, wouldn't it get better?

int
pg_lltoa(int64 value, char *a)
{
if (value < 0)
{
int len = 0;
uint64 uvalue = (uint64) 0 - uvalue;

a[len++] = '-';
len += pg_ulltoa_n(uvalue, a + len);
a[len] = '\0';
return len;
}
else
return pg_ulltoa_n(value, a);
}

regards,
Ranier Vilela

#7Andrew Gierth
andrew@tao11.riddles.org.uk
In reply to: Ranier Vilela (#6)
Re: Speedup usages of pg_*toa() functions

"Ranier" == Ranier Vilela <ranier.vf@gmail.com> writes:

Ranier> Written like that, wouldn't it get better?

Ranier> int
Ranier> pg_lltoa(int64 value, char *a)
Ranier> {
Ranier> if (value < 0)
Ranier> {
Ranier> int len = 0;
Ranier> uint64 uvalue = (uint64) 0 - uvalue;
Ranier> a[len++] = '-';
Ranier> len += pg_ulltoa_n(uvalue, a + len);
Ranier> a[len] = '\0';
Ranier> return len;
Ranier> }
Ranier> else
Ranier> return pg_ulltoa_n(value, a);
Ranier> }

No. While it doesn't matter so much for pg_lltoa since that's unlikely
to inline multiple pg_ulltoa_n calls, if you do pg_ltoa like this it (a)
ends up with two copies of pg_ultoa_n inlined into it, and (b) you don't
actually save any useful amount of time. Your version is also failing to
add the terminating '\0' for the positive case and has other obvious
bugs.

--
Andrew (irc:RhodiumToad)

#8Ranier Vilela
ranier.vf@gmail.com
In reply to: Andrew Gierth (#7)
Re: Speedup usages of pg_*toa() functions

Em ter., 9 de jun. de 2020 às 13:01, Andrew Gierth <
andrew@tao11.riddles.org.uk> escreveu:

"Ranier" == Ranier Vilela <ranier.vf@gmail.com> writes:

Ranier> Written like that, wouldn't it get better?

Ranier> int
Ranier> pg_lltoa(int64 value, char *a)
Ranier> {
Ranier> if (value < 0)
Ranier> {
Ranier> int len = 0;
Ranier> uint64 uvalue = (uint64) 0 - uvalue;
Ranier> a[len++] = '-';
Ranier> len += pg_ulltoa_n(uvalue, a + len);
Ranier> a[len] = '\0';
Ranier> return len;
Ranier> }
Ranier> else
Ranier> return pg_ulltoa_n(value, a);
Ranier> }

No. While it doesn't matter so much for pg_lltoa since that's unlikely
to inline multiple pg_ulltoa_n calls, if you do pg_ltoa like this it (a)
ends up with two copies of pg_ultoa_n inlined into it, and (b) you don't
actually save any useful amount of time. Your version is also failing to
add the terminating '\0' for the positive case and has other obvious
bugs.

(a) Sorry, I'm not asm specialist.

#include <stdio.h>

int pg_utoa(unsigned int num, char * a) {
int len;

len = sprintf(a, "%lu", num);

return len;
}

int pg_toa(int num, char * a)
{
if (num < 0) {
int len;
len = pg_utoa(num, a);
a[len] = '\0';
return len;
}
else
return pg_utoa(num, a);
}

.LC0:
.string "%lu"
pg_utoa(unsigned int, char*):
mov edx, edi
xor eax, eax
mov rdi, rsi
mov esi, OFFSET FLAT:.LC0
jmp sprintf
pg_toa(int, char*):
push rbp
test edi, edi
mov rbp, rsi
mov edx, edi
mov esi, OFFSET FLAT:.LC0
mov rdi, rbp
mov eax, 0
js .L7
pop rbp
jmp sprintf
.L7:
call sprintf
movsx rdx, eax
mov BYTE PTR [rbp+0+rdx], 0
pop rbp
ret

Where " ends up with two copies of pg_ultoa_n inlined into it", in this
simplified example?

(b) I call this tail cut, I believe it saves time, for sure.

Regarding bugs:

(c) your version don't check size of a var, when pg_ulltoa_n
warning about "least MAXINT8LEN bytes."

So in theory, I could blow it up, by calling pg_lltoa.

(d) So I can't trust pg_ulltoa_n, when var a, is it big enough?
If not, there are more bugs.

regards,
Ranier Vilela

#9Andrew Gierth
andrew@tao11.riddles.org.uk
In reply to: Ranier Vilela (#8)
Re: Speedup usages of pg_*toa() functions

"Ranier" == Ranier Vilela <ranier.vf@gmail.com> writes:

Ranier> Where " ends up with two copies of pg_ultoa_n inlined into it",
Ranier> in this simplified example?

The two references to sprintf are both inlined copies of your pg_utoa.

Ranier> (b) I call this tail cut, I believe it saves time, for sure.

You seem to have missed the point that the pg_ultoa_n / pg_ulltoa_n
functions DO NOT ADD A TRAILING NUL. Which means that pg_ltoa / pg_lltoa
can't just tail call them, since they must add the NUL after.

Ranier> Regarding bugs:

Ranier> (c) your version don't check size of a var, when pg_ulltoa_n
Ranier> warning about "least MAXINT8LEN bytes."

Ranier> So in theory, I could blow it up, by calling pg_lltoa.

No. Callers of pg_lltoa are required to provide a buffer of at least
MAXINT8LEN+1 bytes.

--
Andrew.

#10Ranier Vilela
ranier.vf@gmail.com
In reply to: Andrew Gierth (#9)
Re: Speedup usages of pg_*toa() functions

Em ter., 9 de jun. de 2020 às 15:53, Andrew Gierth <
andrew@tao11.riddles.org.uk> escreveu:

"Ranier" == Ranier Vilela <ranier.vf@gmail.com> writes:

Ranier> Where " ends up with two copies of pg_ultoa_n inlined into it",
Ranier> in this simplified example?

The two references to sprintf are both inlined copies of your pg_utoa.

Ranier> (b) I call this tail cut, I believe it saves time, for sure.

You seem to have missed the point that the pg_ultoa_n / pg_ulltoa_n
functions DO NOT ADD A TRAILING NUL. Which means that pg_ltoa / pg_lltoa
can't just tail call them, since they must add the NUL after.

Ranier> Regarding bugs:

Ranier> (c) your version don't check size of a var, when pg_ulltoa_n
Ranier> warning about "least MAXINT8LEN bytes."

Ranier> So in theory, I could blow it up, by calling pg_lltoa.

No. Callers of pg_lltoa are required to provide a buffer of at least
MAXINT8LEN+1 bytes.

Thanks for explanations.

So I would change, just the initialization (var uvalue), even though it is
irrelevant.

int
pg_lltoa(int64 value, char *a)
{
int len = 0;
uint64 uvalue;

if (value < 0)
{
uvalue = (uint64) 0 - uvalue;
a[len++] = '-';
}
else
uvalue = value;

len += pg_ulltoa_n(uvalue, a + len);
a[len] = '\0';

return len;
}

regards,
Ranier Vilela

#11Andrew Gierth
andrew@tao11.riddles.org.uk
In reply to: Ranier Vilela (#10)
Re: Speedup usages of pg_*toa() functions

"Ranier" == Ranier Vilela <ranier.vf@gmail.com> writes:

Ranier> So I would change, just the initialization (var uvalue), even though it is
Ranier> irrelevant.

Ranier> int
Ranier> pg_lltoa(int64 value, char *a)
Ranier> {
Ranier> int len = 0;
Ranier> uint64 uvalue;

Ranier> if (value < 0)
Ranier> {
Ranier> uvalue = (uint64) 0 - uvalue;

Use of uninitialized variable.

--
Andrew (irc:RhodiumToad)

#12Ranier Vilela
ranier.vf@gmail.com
In reply to: Andrew Gierth (#11)
Re: Speedup usages of pg_*toa() functions

Em ter., 9 de jun. de 2020 às 17:42, Andrew Gierth <
andrew@tao11.riddles.org.uk> escreveu:

"Ranier" == Ranier Vilela <ranier.vf@gmail.com> writes:

Ranier> So I would change, just the initialization (var uvalue), even
though it is
Ranier> irrelevant.

Ranier> int
Ranier> pg_lltoa(int64 value, char *a)
Ranier> {
Ranier> int len = 0;
Ranier> uint64 uvalue;

Ranier> if (value < 0)
Ranier> {
Ranier> uvalue = (uint64) 0 - uvalue;

Use of uninitialized variable.

Sorry, my mistake.

uvalue = (uint64) 0 - value;

regards,
Ranier Vilela

Show quoted text

--
Andrew (irc:RhodiumToad)

#13David Rowley
dgrowleyml@gmail.com
In reply to: Andrew Gierth (#4)
2 attachment(s)
Re: Speedup usages of pg_*toa() functions

On Tue, 9 Jun 2020 at 22:08, Andrew Gierth <andrew@tao11.riddles.org.uk> wrote:

"David" == David Rowley <dgrowleyml@gmail.com> writes:

David> This allows us to speed up a few cases. int2vectorout() should
David> be faster and int8out() becomes a bit faster if we get rid of
David> the strdup() call and replace it with a palloc()/memcpy() call.

What about removing the memcpy entirely? I don't think we save anything
much useful here by pallocing the exact length, rather than doing what
int4out does and palloc a fixed size and convert the int directly into
it.

The attached 0001 patch does this.

create table bi (a bigint);
insert into bi select generate_Series(1,10000000);
vacuum freeze analyze bi;

query = copy bi to '/dev/null';
120 second pgbench run.

The results are:

GCC master: latency average = 1757.556 ms
GCC master+0001: latency average = 1588.793 ms (90.4%)

clang master: latency average = 1818.952 ms
clang master+0001: latency average = 1649.100 ms (90.6%)

For pg_ltoa, etc., I don't like adding the extra call to pg_ultoa_n - at
least on my clang, that results in two copies of pg_ultoa_n inlined.
How about doing it like,

int
pg_lltoa(int64 value, char *a)
{
int len = 0;
uint64 uvalue = value;

if (value < 0)
{
uvalue = (uint64) 0 - uvalue;
a[len++] = '-';
}
len += pg_ulltoa_n(uvalue, a + len);
a[len] = '\0';
return len;
}

The 0002 patch does it this way.

David

Attachments:

0001-Improve-performance-of-int8out.patchapplication/octet-stream; name=0001-Improve-performance-of-int8out.patchDownload
From 4bacc6156b29b1f0fd34bc1d5a895efd9aa0d495 Mon Sep 17 00:00:00 2001
From: "dgrowley@gmail.com" <dgrowley@gmail.com>
Date: Wed, 10 Jun 2020 11:00:26 +1200
Subject: [PATCH 1/2] Improve performance of int8out

Previously we wrote the string to a local buffer and then pstrdup'd that.
However, it seems reasonable that we just palloc enough space for the
largest of numbers and write the string directly to that instead.  This
saves us a strlen() and memcpy() call, which results in some
demonstratable speedups of COPY from a table with a BIGINT column.
---
 src/backend/utils/adt/int8.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/backend/utils/adt/int8.c b/src/backend/utils/adt/int8.c
index abba8f1df0..cb666b80b9 100644
--- a/src/backend/utils/adt/int8.c
+++ b/src/backend/utils/adt/int8.c
@@ -147,11 +147,9 @@ Datum
 int8out(PG_FUNCTION_ARGS)
 {
 	int64		val = PG_GETARG_INT64(0);
-	char		buf[MAXINT8LEN + 1];
-	char	   *result;
+	char	   *result = (char *) palloc(MAXINT8LEN + 1);
 
-	pg_lltoa(val, buf);
-	result = pstrdup(buf);
+	pg_lltoa(val, result);
 	PG_RETURN_CSTRING(result);
 }
 
-- 
2.25.1

0002-Have-pg_itoa-pg_ltoa-and-pg_lltoa-return-the-length-.patchapplication/octet-stream; name=0002-Have-pg_itoa-pg_ltoa-and-pg_lltoa-return-the-length-.patchDownload
From 76ef2ae62de3a65f7dc1b9e32efa8bd032eb7eee Mon Sep 17 00:00:00 2001
From: "dgrowley@gmail.com" <dgrowley@gmail.com>
Date: Wed, 10 Jun 2020 11:06:17 +1200
Subject: [PATCH 2/2] Have pg_itoa, pg_ltoa and pg_lltoa return the length of
 the string

Core by no means makes excessive use of these functions, but quite a large
number of those usages do require the caller to call strlen() on the
string.  This is quite wasteful since these functions do have a good idea
of the length of the string already, so we might as well just have them
return that.
---
 src/backend/access/common/printsimple.c | 10 ++++----
 src/backend/utils/adt/int.c             |  4 +---
 src/backend/utils/adt/numutils.c        | 32 +++++++++++++++----------
 src/include/utils/builtins.h            |  6 ++---
 4 files changed, 29 insertions(+), 23 deletions(-)

diff --git a/src/backend/access/common/printsimple.c b/src/backend/access/common/printsimple.c
index 0f0b54bdae..df27700df9 100644
--- a/src/backend/access/common/printsimple.c
+++ b/src/backend/access/common/printsimple.c
@@ -103,9 +103,10 @@ printsimple(TupleTableSlot *slot, DestReceiver *self)
 				{
 					int32		num = DatumGetInt32(value);
 					char		str[12];	/* sign, 10 digits and '\0' */
+					int			len;
 
-					pg_ltoa(num, str);
-					pq_sendcountedtext(&buf, str, strlen(str), false);
+					len = pg_ltoa(num, str);
+					pq_sendcountedtext(&buf, str, len, false);
 				}
 				break;
 
@@ -113,9 +114,10 @@ printsimple(TupleTableSlot *slot, DestReceiver *self)
 				{
 					int64		num = DatumGetInt64(value);
 					char		str[MAXINT8LEN + 1];
+					int			len;
 
-					pg_lltoa(num, str);
-					pq_sendcountedtext(&buf, str, strlen(str), false);
+					len = pg_lltoa(num, str);
+					pq_sendcountedtext(&buf, str, len, false);
 				}
 				break;
 
diff --git a/src/backend/utils/adt/int.c b/src/backend/utils/adt/int.c
index 63c59c56b3..418c13e1b4 100644
--- a/src/backend/utils/adt/int.c
+++ b/src/backend/utils/adt/int.c
@@ -191,9 +191,7 @@ int2vectorout(PG_FUNCTION_ARGS)
 	{
 		if (num != 0)
 			*rp++ = ' ';
-		pg_itoa(int2Array->values[num], rp);
-		while (*++rp != '\0')
-			;
+		rp += pg_itoa(int2Array->values[num], rp);
 	}
 	*rp = '\0';
 	PG_RETURN_CSTRING(result);
diff --git a/src/backend/utils/adt/numutils.c b/src/backend/utils/adt/numutils.c
index 13877fdc1d..608539b0ce 100644
--- a/src/backend/utils/adt/numutils.c
+++ b/src/backend/utils/adt/numutils.c
@@ -327,16 +327,17 @@ invalid_syntax:
 
 /*
  * pg_itoa: converts a signed 16-bit integer to its string representation
+ * and returns strlen(a).
  *
  * Caller must ensure that 'a' points to enough memory to hold the result
  * (at least 7 bytes, counting a leading sign and trailing NUL).
  *
  * It doesn't seem worth implementing this separately.
  */
-void
+int
 pg_itoa(int16 i, char *a)
 {
-	pg_ltoa((int32) i, a);
+	return pg_ltoa((int32) i, a);
 }
 
 /*
@@ -404,26 +405,28 @@ pg_ultoa_n(uint32 value, char *a)
 }
 
 /*
- * NUL-terminate the output of pg_ultoa_n.
+ * pg_ltoa: converts a signed 32-bit integer to its string representation and
+ * returns strlen(a).
  *
  * It is the caller's responsibility to ensure that a is at least 12 bytes long,
  * which is enough room to hold a minus sign, a maximally long int32, and the
  * above terminating NUL.
  */
-void
+int
 pg_ltoa(int32 value, char *a)
 {
 
 	uint32		uvalue = (uint32) value;
-	int			len;
+	int			len = 0;
 
 	if (value < 0)
 	{
 		uvalue = (uint32) 0 - uvalue;
-		*a++ = '-';
+		a[len++] = '-';
 	}
-	len = pg_ultoa_n(uvalue, a);
+	len += pg_ultoa_n(uvalue, a + len);
 	a[len] = '\0';
+	return len;
 }
 
 /*
@@ -512,24 +515,27 @@ pg_ulltoa_n(uint64 value, char *a)
 }
 
 /*
- * pg_lltoa: convert a signed 64-bit integer to its string representation
+ * pg_lltoa: converts a signed 64-bit integer to its string representation and
+ * returns strlen(a).
  *
  * Caller must ensure that 'a' points to enough memory to hold the result
  * (at least MAXINT8LEN + 1 bytes, counting a leading sign and trailing NUL).
  */
-void
+int
 pg_lltoa(int64 value, char *a)
 {
-	int			len;
+	int			len = 0;
 	uint64		uvalue = value;
 
 	if (value < 0)
 	{
-		*a++ = '-';
 		uvalue = (uint64) 0 - uvalue;
+		a[len++] = '-';
 	}
-	len = pg_ulltoa_n(uvalue, a);
-	a[len] = 0;
+
+	len += pg_ulltoa_n(uvalue, a + len);
+	a[len] = '\0';
+	return len;
 }
 
 
diff --git a/src/include/utils/builtins.h b/src/include/utils/builtins.h
index a352a8b773..019c150f22 100644
--- a/src/include/utils/builtins.h
+++ b/src/include/utils/builtins.h
@@ -47,11 +47,11 @@ extern int	namestrcmp(Name name, const char *str);
 extern int32 pg_atoi(const char *s, int size, int c);
 extern int16 pg_strtoint16(const char *s);
 extern int32 pg_strtoint32(const char *s);
-extern void pg_itoa(int16 i, char *a);
+extern int pg_itoa(int16 i, char *a);
 int			pg_ultoa_n(uint32 l, char *a);
 int			pg_ulltoa_n(uint64 l, char *a);
-extern void pg_ltoa(int32 l, char *a);
-extern void pg_lltoa(int64 ll, char *a);
+extern int pg_ltoa(int32 l, char *a);
+extern int pg_lltoa(int64 ll, char *a);
 extern char *pg_ultostr_zeropad(char *str, uint32 value, int32 minwidth);
 extern char *pg_ultostr(char *str, uint32 value);
 extern uint64 pg_strtouint64(const char *str, char **endptr, int base);
-- 
2.25.1

#14Andrew Gierth
andrew@tao11.riddles.org.uk
In reply to: Ranier Vilela (#12)
Re: Speedup usages of pg_*toa() functions

"Ranier" == Ranier Vilela <ranier.vf@gmail.com> writes:

Ranier> Sorry, my mistake.

Ranier> uvalue = (uint64) 0 - value;

This doesn't gain anything over the original, and it has the downside of
hiding an int64 to uint64 conversion that is actually quite sensitive.
For example, it might tempt someone to rewrite it as

uvalue = -value;

which is actually incorrect (though our -fwrapv will hide the error).

--
Andrew (irc:RhodiumToad)

#15David Rowley
dgrowleyml@gmail.com
In reply to: David Rowley (#13)
Re: Speedup usages of pg_*toa() functions

On Wed, 10 Jun 2020 at 11:57, David Rowley <dgrowleyml@gmail.com> wrote:

On Tue, 9 Jun 2020 at 22:08, Andrew Gierth <andrew@tao11.riddles.org.uk> wrote:

"David" == David Rowley <dgrowleyml@gmail.com> writes:

David> This allows us to speed up a few cases. int2vectorout() should
David> be faster and int8out() becomes a bit faster if we get rid of
David> the strdup() call and replace it with a palloc()/memcpy() call.

What about removing the memcpy entirely? I don't think we save anything
much useful here by pallocing the exact length, rather than doing what
int4out does and palloc a fixed size and convert the int directly into
it.

The attached 0001 patch does this.

Pending any objections, I'd like to push both of these patches in the
next few days to master.

Anyone object to changing the signature of these functions in 0002, or
have concerns about allocating the maximum memory that we might
require in int8out()?

David

#16Andrew Gierth
andrew@tao11.riddles.org.uk
In reply to: David Rowley (#15)
Re: Speedup usages of pg_*toa() functions

"David" == David Rowley <dgrowleyml@gmail.com> writes:

David> Pending any objections, I'd like to push both of these patches
David> in the next few days to master.

For the second patch, can we take the opportunity to remove the
extraneous blank line at the top of pg_ltoa, and add the two missing
"extern"s in builtins.h for pg_ultoa_n and pg_ulltoa_n ?

David> Anyone object to changing the signature of these functions in
David> 0002, or have concerns about allocating the maximum memory that
David> we might require in int8out()?

Changing the function signatures seems safe enough. The memory thing
only seems likely to be an issue if you allocate a lot of text strings
for bigint values without a context reset, and I'm not sure where that
would happen (maybe passing large bigint arrays to pl/perl or pl/python
would do it?)

--
Andrew (irc:RhodiumToad)

#17David Rowley
dgrowleyml@gmail.com
In reply to: Andrew Gierth (#16)
Re: Speedup usages of pg_*toa() functions

On Thu, 11 Jun 2020 at 18:52, Andrew Gierth <andrew@tao11.riddles.org.uk> wrote:

For the second patch, can we take the opportunity to remove the
extraneous blank line at the top of pg_ltoa, and add the two missing
"extern"s in builtins.h for pg_ultoa_n and pg_ulltoa_n ?

I think since we've branched for PG14 now that fixing those should be
backpatched to PG13.

David

#18David Rowley
dgrowleyml@gmail.com
In reply to: Andrew Gierth (#16)
Re: Speedup usages of pg_*toa() functions

On Thu, 11 Jun 2020 at 18:52, Andrew Gierth <andrew@tao11.riddles.org.uk> wrote:

"David" == David Rowley <dgrowleyml@gmail.com> writes:

David> Pending any objections, I'd like to push both of these patches
David> in the next few days to master.

For the second patch, can we take the opportunity to remove the
extraneous blank line at the top of pg_ltoa, and add the two missing
"extern"s in builtins.h for pg_ultoa_n and pg_ulltoa_n ?

David> Anyone object to changing the signature of these functions in
David> 0002, or have concerns about allocating the maximum memory that
David> we might require in int8out()?

Changing the function signatures seems safe enough. The memory thing
only seems likely to be an issue if you allocate a lot of text strings
for bigint values without a context reset, and I'm not sure where that
would happen (maybe passing large bigint arrays to pl/perl or pl/python
would do it?)

I ended up chickening out of doing the larger allocation
unconditionally. Instead, I pushed the original idea of doing the
palloc/memcpy of the length returned by pg_lltoa. That gets us most
of the gains without the change in memory usage behaviour.

Thanks for your reviews on this.

David

#19John Naylor
john.naylor@2ndquadrant.com
In reply to: David Rowley (#18)
Re: Speedup usages of pg_*toa() functions

On Sat, Jun 13, 2020 at 8:36 AM David Rowley <dgrowleyml@gmail.com> wrote:

I ended up chickening out of doing the larger allocation
unconditionally. Instead, I pushed the original idea of doing the
palloc/memcpy of the length returned by pg_lltoa. That gets us most
of the gains without the change in memory usage behaviour.

This was still marked as needing review in commitfest, so I marked it
as committed.

--
John Naylor https://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services