Add pg_strtoupper and pg_strtolower functions
Hi,
I came across pg_toupper and pg_tolower functions, converting a single
character, are being used in loops to convert an entire
null-terminated string. The cost of calling these character-based
conversion functions (even though small) can be avoided if we have two
new functions pg_strtoupper and pg_strtolower.
Attaching a patch with these new two functions and their usage in most
of the possible places in the code.
Thoughts?
Regards,
Bharath Rupireddy.
Attachments:
v1-0001-Add-pg_strtoupper-and-pg_strtolower-functions.patchapplication/octet-stream; name=v1-0001-Add-pg_strtoupper-and-pg_strtolower-functions.patchDownload
From c5e9d410133f6174ecf2a89ebc6adda588d2359d Mon Sep 17 00:00:00 2001
From: Bharath Rupireddy <bharath.rupireddyforpostgres@gmail.com>
Date: Mon, 2 May 2022 12:35:21 +0000
Subject: [PATCH v1] Add pg_strtoupper and pg_strtolower functions
---
src/backend/utils/adt/datetime.c | 8 ++----
src/backend/utils/misc/tzparser.c | 5 +---
src/bin/psql/tab-complete.c | 9 ++----
src/include/port.h | 2 ++
src/interfaces/ecpg/pgtypeslib/datetime.c | 3 +-
src/port/pgstrcasecmp.c | 34 +++++++++++++++++++++++
src/timezone/pgtz.c | 7 ++---
7 files changed, 45 insertions(+), 23 deletions(-)
diff --git a/src/backend/utils/adt/datetime.c b/src/backend/utils/adt/datetime.c
index 4c12c4d663..94a24c733f 100644
--- a/src/backend/utils/adt/datetime.c
+++ b/src/backend/utils/adt/datetime.c
@@ -1884,13 +1884,11 @@ DetermineTimeZoneAbbrevOffsetInternal(pg_time_t t, const char *abbr, pg_tz *tzp,
int *offset, int *isdst)
{
char upabbr[TZ_STRLEN_MAX + 1];
- unsigned char *p;
long int gmtoff;
/* We need to force the abbrev to upper case */
strlcpy(upabbr, abbr, sizeof(upabbr));
- for (p = (unsigned char *) upabbr; *p; p++)
- *p = pg_toupper(*p);
+ pg_strtoupper(upabbr);
/* Look up the abbrev's meaning at this time in this zone */
if (pg_interpret_timezone_abbrev(upabbr,
@@ -4929,7 +4927,6 @@ pg_timezone_abbrevs(PG_FUNCTION_ARGS)
char buffer[TOKMAXLEN + 1];
int gmtoffset;
bool is_dst;
- unsigned char *p;
struct pg_itm_in itm_in;
Interval *resInterval;
@@ -5018,8 +5015,7 @@ pg_timezone_abbrevs(PG_FUNCTION_ARGS)
* what ParseDateTime() uses.
*/
strlcpy(buffer, tp->token, sizeof(buffer));
- for (p = (unsigned char *) buffer; *p; p++)
- *p = pg_toupper(*p);
+ pg_strtoupper(buffer);
values[0] = CStringGetTextDatum(buffer);
diff --git a/src/backend/utils/misc/tzparser.c b/src/backend/utils/misc/tzparser.c
index a69cb2d268..18b7ca45bc 100644
--- a/src/backend/utils/misc/tzparser.c
+++ b/src/backend/utils/misc/tzparser.c
@@ -50,8 +50,6 @@ static int ParseTzFile(const char *filename, int depth,
static bool
validateTzEntry(tzEntry *tzentry)
{
- unsigned char *p;
-
/*
* Check restrictions imposed by datetktbl storage format (see datetime.c)
*/
@@ -78,8 +76,7 @@ validateTzEntry(tzEntry *tzentry)
/*
* Convert abbrev to lowercase (must match datetime.c's conversion)
*/
- for (p = (unsigned char *) tzentry->abbrev; *p; p++)
- *p = pg_tolower(*p);
+ pg_strtolower(tzentry->abbrev);
return true;
}
diff --git a/src/bin/psql/tab-complete.c b/src/bin/psql/tab-complete.c
index 588c0841fe..3beefb7e05 100644
--- a/src/bin/psql/tab-complete.c
+++ b/src/bin/psql/tab-complete.c
@@ -5587,8 +5587,7 @@ complete_from_files(const char *text, int state)
static char *
pg_strdup_keyword_case(const char *s, const char *ref)
{
- char *ret,
- *p;
+ char *ret;
unsigned char first = ref[0];
ret = pg_strdup(s);
@@ -5598,13 +5597,11 @@ pg_strdup_keyword_case(const char *s, const char *ref)
pset.comp_case == PSQL_COMP_CASE_PRESERVE_UPPER) && islower(first)) ||
(pset.comp_case == PSQL_COMP_CASE_PRESERVE_LOWER && !isalpha(first)))
{
- for (p = ret; *p; p++)
- *p = pg_tolower((unsigned char) *p);
+ pg_strtolower(ret);
}
else
{
- for (p = ret; *p; p++)
- *p = pg_toupper((unsigned char) *p);
+ pg_strtoupper(ret);
}
return ret;
diff --git a/src/include/port.h b/src/include/port.h
index 3d103a2b31..bbfb24b4d7 100644
--- a/src/include/port.h
+++ b/src/include/port.h
@@ -163,7 +163,9 @@ extern void pg_usleep(long microsec);
extern int pg_strcasecmp(const char *s1, const char *s2);
extern int pg_strncasecmp(const char *s1, const char *s2, size_t n);
extern unsigned char pg_toupper(unsigned char ch);
+extern void pg_strtoupper(char *str);
extern unsigned char pg_tolower(unsigned char ch);
+extern void pg_strtolower(char *str);
extern unsigned char pg_ascii_toupper(unsigned char ch);
extern unsigned char pg_ascii_tolower(unsigned char ch);
diff --git a/src/interfaces/ecpg/pgtypeslib/datetime.c b/src/interfaces/ecpg/pgtypeslib/datetime.c
index 1b253747fc..2ed91ed681 100644
--- a/src/interfaces/ecpg/pgtypeslib/datetime.c
+++ b/src/interfaces/ecpg/pgtypeslib/datetime.c
@@ -507,8 +507,7 @@ PGTYPESdate_defmt_asc(date * d, const char *fmt, const char *str)
return -1;
/* convert the whole string to lower case */
- for (i = 0; str_copy[i]; i++)
- str_copy[i] = (char) pg_tolower((unsigned char) str_copy[i]);
+ pg_strtolower(str_copy);
}
/* look for numerical tokens */
diff --git a/src/port/pgstrcasecmp.c b/src/port/pgstrcasecmp.c
index fc7ee91e8f..11549bdbed 100644
--- a/src/port/pgstrcasecmp.c
+++ b/src/port/pgstrcasecmp.c
@@ -111,6 +111,23 @@ pg_toupper(unsigned char ch)
return ch;
}
+/*
+ * Fold a null-terminated string to upper case.
+ */
+void
+pg_strtoupper(char *str)
+{
+ unsigned char *p;
+
+ for (p = (unsigned char *) str; *p; p++)
+ {
+ if (*p >= 'a' && *p <= 'z')
+ *p += 'A' - 'a';
+ else if (IS_HIGHBIT_SET(*p) && islower(*p))
+ *p = toupper(*p);
+ }
+}
+
/*
* Fold a character to lower case.
*
@@ -128,6 +145,23 @@ pg_tolower(unsigned char ch)
return ch;
}
+/*
+ * Fold a null-terminated string to upper case.
+ */
+void
+pg_strtolower(char *str)
+{
+ unsigned char *p;
+
+ for (p = (unsigned char *) str; *p; p++)
+ {
+ if (*p >= 'A' && *p <= 'Z')
+ *p += 'a' - 'A';
+ else if (IS_HIGHBIT_SET(*p) && isupper(*p))
+ *p = tolower(*p);
+ }
+}
+
/*
* Fold a character to upper case, following C/POSIX locale rules.
*/
diff --git a/src/timezone/pgtz.c b/src/timezone/pgtz.c
index 3c278dd0f3..9820b21509 100644
--- a/src/timezone/pgtz.c
+++ b/src/timezone/pgtz.c
@@ -237,7 +237,6 @@ pg_tzset(const char *name)
struct state tzstate;
char uppername[TZ_STRLEN_MAX + 1];
char canonname[TZ_STRLEN_MAX + 1];
- char *p;
if (strlen(name) > TZ_STRLEN_MAX)
return NULL; /* not going to fit */
@@ -252,10 +251,8 @@ pg_tzset(const char *name)
* can get consistently upcased results from tzparse() in case the name is
* a POSIX-style timezone spec.)
*/
- p = uppername;
- while (*name)
- *p++ = pg_toupper((unsigned char) *name++);
- *p = '\0';
+ strlcpy(uppername, name, sizeof(uppername));
+ pg_strtoupper(uppername);
tzp = (pg_tz_cache *) hash_search(timezone_cache,
uppername,
--
2.25.1
On Mon, May 2, 2022 at 6:21 PM Bharath Rupireddy
<bharath.rupireddyforpostgres@gmail.com> wrote:
Hi,
I came across pg_toupper and pg_tolower functions, converting a single
character, are being used in loops to convert an entire
null-terminated string. The cost of calling these character-based
conversion functions (even though small) can be avoided if we have two
new functions pg_strtoupper and pg_strtolower.
Have we measured the saving in cost? Let's say for a million character
long string?
Attaching a patch with these new two functions and their usage in most
of the possible places in the code.
Converting pg_toupper and pg_tolower to "inline" might save cost
similarly and also avoid code duplication?
--
Best Wishes,
Ashutosh Bapat
On Mon, May 2, 2022 at 6:43 PM Ashutosh Bapat
<ashutosh.bapat.oss@gmail.com> wrote:
On Mon, May 2, 2022 at 6:21 PM Bharath Rupireddy
<bharath.rupireddyforpostgres@gmail.com> wrote:Hi,
I came across pg_toupper and pg_tolower functions, converting a single
character, are being used in loops to convert an entire
null-terminated string. The cost of calling these character-based
conversion functions (even though small) can be avoided if we have two
new functions pg_strtoupper and pg_strtolower.Have we measured the saving in cost? Let's say for a million character
long string?
I didn't spend time on figuring out the use-cases hitting all the code
areas, even if I do so, the function call cost savings might not
impress most of the time and the argument of saving function call cost
then becomes pointless.
Attaching a patch with these new two functions and their usage in most
of the possible places in the code.Converting pg_toupper and pg_tolower to "inline" might save cost
similarly and also avoid code duplication?
I think most of the modern compilers do inline small functions. But,
inlining isn't always good as it increases the size of the code. With
the proposed helper functions, the code looks cleaner (at least IMO,
others may have different opinions though).
Regards,
Bharath Rupireddy.
On 2022-May-02, Bharath Rupireddy wrote:
Hi,
I came across pg_toupper and pg_tolower functions, converting a single
character, are being used in loops to convert an entire
null-terminated string. The cost of calling these character-based
conversion functions (even though small) can be avoided if we have two
new functions pg_strtoupper and pg_strtolower.
Currently, pg_toupper/pg_tolower are used in very limited situations.
Are they really always safe enough to run in arbitrary situations,
enough to create this new layer on top of them? Reading the comment on
pg_tolower, "the whole thing is a bit bogus for multibyte charsets", I
worry that we might create security holes, either now or in future
callsites that use these new functions.
Consider that in the Turkish locale you lowercase an I (single-byte
ASCII character) with a dotless-i (two bytes). So overwriting the input
string is not a great solution.
--
Álvaro Herrera PostgreSQL Developer — https://www.EnterpriseDB.com/
"Nunca se desea ardientemente lo que solo se desea por razón" (F. Alexandre)
Alvaro Herrera <alvherre@alvh.no-ip.org> writes:
Currently, pg_toupper/pg_tolower are used in very limited situations.
Are they really always safe enough to run in arbitrary situations,
enough to create this new layer on top of them?
They are not, and we should absolutely not be encouraging additional uses
of them. The existing multi-character str_toupper/str_tolower functions
should be used instead. (Perhaps those should be relocated to someplace
more prominent?)
Reading the comment on
pg_tolower, "the whole thing is a bit bogus for multibyte charsets", I
worry that we might create security holes, either now or in future
callsites that use these new functions.
I doubt that they are security holes, but they do give unexpected
answers in some locales.
regards, tom lane