[PATCH] Add roman support for to_number function

Started by Hunaid Sohailover 1 year ago31 messages

hunaidpgml@gmail.com

over 1 year ago

1 attachment(s)

Hi,

While looking at formatting.c file, I noticed a TODO about "add support for
roman number to standard number conversion" (
https://github.com/postgres/postgres/blob/master/src/backend/utils/adt/formatting.c#L52
)

I have attached the patch that adds support for this and converts roman
numerals to standard numbers (1 to 3999) while also checking if roman
numerals are valid or not.
I have also added a new error code: ERRCODE_INVALID_ROMAN_NUMERAL in case
of invalid numerals.

A few examples:

postgres=# SELECT to_number('MC', 'RN');
to_number
-----------
1100
(1 row)

postgres=# SELECT to_number('XIV', 'RN');
to_number
-----------
14
(1 row)

postgres=# SELECT to_number('MMMCMXCIX', 'RN');
to_number
-----------
3999
(1 row)

postgres=# SELECT to_number('MCCD', 'RN');
ERROR: invalid roman numeral

I would appreciate your feedback on the following cases:
- Is it okay for the function to handle Roman numerals in a
case-insensitive way? (e.g., 'XIV', 'xiv', and 'Xiv' are all seen as 14).
- How should we handle Roman numerals with leading or trailing spaces, like
' XIV' or 'MC '? Should we trim the spaces, or would it be better to throw
an error in such cases?

I have tested the changes and would appreciate any suggestions for
improvement. I will update the docs and regressions in the next version of
patch.

Regards,
Hunaid Sohail

Attachments:

v1-0001-Add-RN-rn-support-for-to_number-function.patchapplication/octet-stream; name=v1-0001-Add-RN-rn-support-for-to_number-function.patchDownload

From 7afb573fe81522b61a3e486b9f1868f3e24198e8 Mon Sep 17 00:00:00 2001
From: Hunaid Sohail <hunaid2000@gmail.com>
Date: Fri, 30 Aug 2024 11:52:48 +0500
Subject: [PATCH v1] Add RN/rn support for to_number function

---
 src/backend/utils/adt/formatting.c | 126 ++++++++++++++++++++++++++++-
 src/backend/utils/errcodes.txt     |   1 +
 2 files changed, 124 insertions(+), 3 deletions(-)

diff --git a/src/backend/utils/adt/formatting.c b/src/backend/utils/adt/formatting.c
index 68069fcfd3..1dc3d435ee 100644
--- a/src/backend/utils/adt/formatting.c
+++ b/src/backend/utils/adt/formatting.c
@@ -270,6 +270,29 @@ static const char *const rm100[] = {"C", "CC", "CCC", "CD", "D", "DC", "DCC", "D
 static const char *const numTH[] = {"ST", "ND", "RD", "TH", NULL};
 static const char *const numth[] = {"st", "nd", "rd", "th", NULL};
 
+/* ----------
+ * MACRO: Check if the current and next characters
+ * form a valid subtraction combination for roman numerals
+ * ----------
+ */
+#define IS_VALID_SUB_COMB(curr, next) \
+	(((curr) == 'I' && ((next) == 'V' || (next) == 'X')) || \
+	 ((curr) == 'X' && ((next) == 'L' || (next) == 'C')) || \
+	 ((curr) == 'C' && ((next) == 'D' || (next) == 'M')))
+
+/* ----------
+ * MACRO: Roman number value
+ * ----------
+ */
+#define ROMAN_VAL(r) \
+	((r) == 'I' ? 1 : \
+	(r) == 'V' ? 5 : \
+	(r) == 'X' ? 10 : \
+	(r) == 'L' ? 50 : \
+	(r) == 'C' ? 100 : \
+	(r) == 'D' ? 500 : \
+	(r) == 'M' ? 1000 : 0)
+
 /* ----------
  * Flags & Options:
  * ----------
@@ -1074,6 +1097,7 @@ static bool do_to_timestamp(text *date_txt, text *fmt, Oid collid, bool std,
 static char *fill_str(char *str, int c, int max);
 static FormatNode *NUM_cache(int len, NUMDesc *Num, text *pars_str, bool *shouldFree);
 static char *int_to_roman(int number);
+static int roman_to_int(char* s, int len);
 static void NUM_prepare_locale(NUMProc *Np);
 static char *get_last_relevant_decnum(char *num);
 static void NUM_numpart_from_char(NUMProc *Np, int id, int input_len);
@@ -5236,6 +5260,91 @@ int_to_roman(int number)
 }
 
 
+static int
+roman_to_int(char* s, int len) {
+	int repeatCount = 1;
+	int vCount = 0, lCount = 0, dCount = 0;
+	bool subtractionEncountered = false;
+	char lastSubtractedChar = 0;
+	int total = 0;
+
+	if (len == 0 || len > 15)
+		return 0;
+
+	for (int i = 0; i < len; ++i)
+	{
+		char currChar = toupper(s[i]);
+		int currValue = ROMAN_VAL(currChar);
+
+		if (currValue == 0)
+			return 0;
+
+		/* Ensure no character greater than or equal to the subtracted
+		 * character appears after the subtraction.
+		 */
+		if (subtractionEncountered && (currValue >= ROMAN_VAL(lastSubtractedChar)))
+			return 0;
+
+		/* Check for invalid repetitions of characters V, L, or D. */
+		if (currChar == 'V') vCount++;
+		if (currChar == 'L') lCount++;
+		if (currChar == 'D') dCount++;
+		if (vCount > 1 || lCount > 1 || dCount > 1)
+			return 0;
+
+		if (i < len - 1)
+		{
+			char nextChar = toupper(s[i + 1]);
+			int nextValue = ROMAN_VAL(nextChar);
+
+			if (nextValue == 0)
+				return 0;
+
+			/* If the current value is less than the next value,
+			 * handle subtraction. Verify valid subtractive
+			 * combinations and update the total accordingly.
+			 */
+			if (currValue < nextValue)
+			{
+				/* for cases where the same character is repeated
+				 * with subtraction. Like 'MCCM' or 'DCCCD'.
+				 */
+				if (repeatCount > 1)
+					return 0;
+
+				if (!IS_VALID_SUB_COMB(currChar, nextChar))
+					return 0;
+
+				/* Skip the next character as it is part of
+				 * the subtractive combination.
+				 */
+				i++;
+				repeatCount = 1;
+				subtractionEncountered = true;
+				lastSubtractedChar = currChar;
+				total += (nextValue - currValue);
+			}
+			else
+			{
+				/* for same characters, check for repetition */
+				if (currChar == nextChar)
+				{
+					repeatCount++;
+					if (repeatCount > 3)
+						return 0;
+				}
+				else
+					repeatCount = 1;
+				total += currValue;
+			}
+		}
+		/* add the value of the last character */
+		else
+			total += currValue;
+	}
+
+	return total;
+}
 
 /* ----------
  * Locale
@@ -5787,6 +5896,7 @@ NUM_processor(FormatNode *node, NUMDesc *Num, char *inout,
 			   *Np = &_Np;
 	const char *pattern;
 	int			pattern_len;
+	int			roman_result;
 
 	MemSet(Np, 0, sizeof(NUMProc));
 
@@ -5817,9 +5927,19 @@ NUM_processor(FormatNode *node, NUMDesc *Num, char *inout,
 	if (IS_ROMAN(Np->Num))
 	{
 		if (!Np->is_to_char)
-			ereport(ERROR,
-					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-					 errmsg("\"RN\" not supported for input")));
+		{
+			roman_result = roman_to_int(inout, input_len);
+			if (roman_result == 0)
+				ereport(ERROR,
+					(errcode(ERRCODE_INVALID_ROMAN_NUMERAL),
+					 errmsg("invalid roman numeral")));
+			else
+			{
+				Np->Num->pre = sprintf(number, "%d", roman_result);
+				Np->Num->post = 0;
+				return number;
+			}
+		}
 
 		Np->Num->lsign = Np->Num->pre_lsign_num = Np->Num->post =
 			Np->Num->pre = Np->out_pre_spaces = Np->sign = 0;
diff --git a/src/backend/utils/errcodes.txt b/src/backend/utils/errcodes.txt
index b43a24d4bc..90db6c640f 100644
--- a/src/backend/utils/errcodes.txt
+++ b/src/backend/utils/errcodes.txt
@@ -201,6 +201,7 @@ Section: Class 22 - Data Exception
 22P03    E    ERRCODE_INVALID_BINARY_REPRESENTATION                          invalid_binary_representation
 22P04    E    ERRCODE_BAD_COPY_FILE_FORMAT                                   bad_copy_file_format
 22P05    E    ERRCODE_UNTRANSLATABLE_CHARACTER                               untranslatable_character
+22P06    E    ERRCODE_INVALID_ROMAN_NUMERAL                                  invalid_roman_numeral
 2200L    E    ERRCODE_NOT_AN_XML_DOCUMENT                                    not_an_xml_document
 2200M    E    ERRCODE_INVALID_XML_DOCUMENT                                   invalid_xml_document
 2200N    E    ERRCODE_INVALID_XML_CONTENT                                    invalid_xml_content
-- 
2.34.1

Maciek Sakrejda

m.sakrejda@gmail.com

over 1 year ago

In reply to: Hunaid Sohail (#1)

Re: [PATCH] Add roman support for to_number function

Thanks for the contribution.

I took a look at the patch, and it works as advertised. It's too late
for the September commitfest, but I took the liberty of registering
your patch for the November CF [1]https://commitfest.postgresql.org/50/5233/. In the course of that, I found an
older thread proposing this feature seven years ago [2]/messages/by-id/CAGMVOduAJ9wKqJXBYnmFmEetKxapJxrG3afUwpbOZ6n_dWaUnA@mail.gmail.com. That patch
was returned with feedback and (as far as I can tell), was not
followed-up on by the author. You may want to review that thread for
feedback; I won't repeat it here.

On Fri, Aug 30, 2024 at 12:22 AM Hunaid Sohail <hunaidpgml@gmail.com> wrote:

While looking at formatting.c file, I noticed a TODO about "add support for roman number to standard number conversion" (https://github.com/postgres/postgres/blob/master/src/backend/utils/adt/formatting.c#L52)

Your patch should also remove the TODO =)

- Is it okay for the function to handle Roman numerals in a case-insensitive way? (e.g., 'XIV', 'xiv', and 'Xiv' are all seen as 14).

The patch in the thread I linked also took a case-insensitive
approach. I did not see any objections to that, and it seems
reasonable to me as well.

- How should we handle Roman numerals with leading or trailing spaces, like ' XIV' or 'MC '? Should we trim the spaces, or would it be better to throw an error in such cases?

I thought we could reference existing to_number behavior here, but
after playing with it a bit, I'm not really sure what that is:

-- single leading space
maciek=# select to_number(' 1', '9');
to_number
-----------
1
(1 row)

-- two leading spaces
maciek=# select to_number(' 1', '9');
ERROR: invalid input syntax for type numeric: " "
-- two leading spaces and template pattern with a decimal
maciek=# select to_number(' 1', '9D9');
to_number
-----------
1
(1 row)

Separately, I also noticed some unusual Roman representations work
with your patch:

postgres=# select to_number('viv', 'RN');
to_number
-----------
9
(1 row)

Is this expected? In contrast, some somewhat common alternative
representations don't work:

postgres=# select to_number('iiii', 'RN');
ERROR: invalid roman numeral

I know this is expected, but is this the behavior we want? If so, we
probably want to reject the former case, too. If not, maybe that one
is okay, too.

I know I've probably offered more questions than answers, but I hope
finding the old thread here is useful.

Thanks,
Maciek

[1]: https://commitfest.postgresql.org/50/5233/
[2]: /messages/by-id/CAGMVOduAJ9wKqJXBYnmFmEetKxapJxrG3afUwpbOZ6n_dWaUnA@mail.gmail.com

Hunaid Sohail

hunaidpgml@gmail.com

over 1 year ago

In reply to: Maciek Sakrejda (#2)

Re: [PATCH] Add roman support for to_number function

On Mon, Sep 2, 2024 at 11:41 PM Maciek Sakrejda <m.sakrejda@gmail.com>
wrote:

Thanks for the contribution.

I took a look at the patch, and it works as advertised. It's too late
for the September commitfest, but I took the liberty of registering
your patch for the November CF [1]. In the course of that, I found an
older thread proposing this feature seven years ago [2]. That patch
was returned with feedback and (as far as I can tell), was not
followed-up on by the author. You may want to review that thread for
feedback; I won't repeat it here.

I submitted the patch on Aug 30 because I read that new patches should be
submitted in CF with "Open" status.

On Fri, Aug 30, 2024 at 12:22 AM Hunaid Sohail <hunaidpgml@gmail.com>
wrote:

While looking at formatting.c file, I noticed a TODO about "add support

for roman number to standard number conversion" (
https://github.com/postgres/postgres/blob/master/src/backend/utils/adt/formatting.c#L52
)

Your patch should also remove the TODO =)

Noted.

- How should we handle Roman numerals with leading or trailing spaces,
like ' XIV' or 'MC '? Should we trim the spaces, or would it be better to
throw an error in such cases?

I thought we could reference existing to_number behavior here, but
after playing with it a bit, I'm not really sure what that is:

-- single leading space
maciek=# select to_number(' 1', '9');
to_number
-----------
1
(1 row)

-- two leading spaces
maciek=# select to_number(' 1', '9');
ERROR: invalid input syntax for type numeric: " "
-- two leading spaces and template pattern with a decimal
maciek=# select to_number(' 1', '9D9');
to_number
-----------
1
(1 row)

Yes, you are right. I can't understand the behaviour of trailing spaces
too. Trailing spaces are ignored (doesn't matter how many spaces) but
leading spaces are ignored if there is 1 leading space. For more leading
spaces, error is returned.
A few cases of trailing spaces.
postgres=# select to_number(' 1', '9');
ERROR: invalid input syntax for type numeric: " "
postgres=# select to_number('1 ', '9');
to_number
-----------
1
(1 row)

postgres=# select to_number('1 ', '9');
to_number
-----------
1
(1 row)

Separately, I also noticed some unusual Roman representations work

with your patch:

postgres=# select to_number('viv', 'RN');
to_number
-----------
9
(1 row)

Is this expected? In contrast, some somewhat common alternative
representations don't work:

postgres=# select to_number('iiii', 'RN');
ERROR: invalid roman numeral

I know this is expected, but is this the behavior we want? If so, we
probably want to reject the former case, too. If not, maybe that one
is okay, too.

Yes, 'viv' is invalid. Thanks for pointing this out. Also, found a few
other invalid cases like 'lxl' or 'dcd'. I will fix them in the next patch.
Thank you for your feedback.

Regards,
Hunaid Sohail

Maciek Sakrejda

maciek@pganalyze.com

over 1 year ago

In reply to: Hunaid Sohail (#3)

Re: [PATCH] Add roman support for to_number function

On Tue, Sep 3, 2024 at 6:29 AM Hunaid Sohail <hunaidpgml@gmail.com> wrote:

I submitted the patch on Aug 30 because I read that new patches should be submitted in CF with "Open" status.

Oh my bad! I missed that you had submitted it to the September CF:
https://commitfest.postgresql.org/49/5221/

I don't see a way to just delete CF entries, so I've marked the
November one that I created as Withdrawn.

I'll add myself as reviewer to the September CF entry.

Thanks,
Maciek

Hunaid Sohail

hunaidpgml@gmail.com

over 1 year ago

In reply to: Maciek Sakrejda (#4)

1 attachment(s)

Re: [PATCH] Add roman support for to_number function

Hi,

I have attached a new patch v2 with following changes:

- Handled invalid cases like 'viv', 'lxl', and 'dcd'.
- Changed errcode to 22P07 because 22P06 was already taken.
- Removed TODO.
- Added a few positive & negative tests.
- Updated documentation.

Looking forward to your feedback.

Regards,
Hunaid Sohail

On Tue, Sep 3, 2024 at 8:47 PM Maciek Sakrejda <maciek@pganalyze.com> wrote:

Show quoted text

On Tue, Sep 3, 2024 at 6:29 AM Hunaid Sohail <hunaidpgml@gmail.com> wrote:

I submitted the patch on Aug 30 because I read that new patches should

be submitted in CF with "Open" status.

Oh my bad! I missed that you had submitted it to the September CF:
https://commitfest.postgresql.org/49/5221/

I don't see a way to just delete CF entries, so I've marked the
November one that I created as Withdrawn.

I'll add myself as reviewer to the September CF entry.

Thanks,
Maciek

Attachments:

v2-0001-Add-RN-rn-support-for-to_number-function.patchapplication/octet-stream; name=v2-0001-Add-RN-rn-support-for-to_number-function.patchDownload

From 08e75e0b5171dca51d35d2aa4f0cadf02628cbe6 Mon Sep 17 00:00:00 2001
From: Hunaid Sohail <hunaid2000@gmail.com>
Date: Thu, 5 Sep 2024 12:54:42 +0500
Subject: [PATCH v2] Add RN/rn support for to_number function

---
 doc/src/sgml/func.sgml                |  11 ++-
 src/backend/utils/adt/formatting.c    | 135 +++++++++++++++++++++++++-
 src/backend/utils/errcodes.txt        |   1 +
 src/test/regress/expected/numeric.out |  40 ++++++++
 src/test/regress/sql/numeric.sql      |  10 ++
 5 files changed, 192 insertions(+), 5 deletions(-)

diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index 461fc3f437..e8adf28940 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -8628,7 +8628,7 @@ SELECT regexp_match('abc01234xyz', '(?:(.*?)(\d+)(.*)){1,1}');
        </row>
        <row>
         <entry><literal>RN</literal></entry>
-        <entry>Roman numeral (input between 1 and 3999)</entry>
+        <entry>Roman numeral (valid for numbers 1 to 3999)</entry>
        </row>
        <row>
         <entry><literal>TH</literal> or <literal>th</literal></entry>
@@ -8754,6 +8754,15 @@ SELECT regexp_match('abc01234xyz', '(?:(.*?)(\d+)(.*)){1,1}');
        (e.g., <literal>9.99EEEE</literal> is a valid pattern).
       </para>
      </listitem>
+
+    <listitem>
+      <para>
+        In <function>to_number</function>, <literal>RN</literal> pattern converts
+        roman numerals to standard numbers. It is case-insensitive (e.g., <literal>'XIV'</literal>,
+        <literal>'xiv'</literal>, and <literal>'Xiv'</literal> are all seen as <literal>14</literal>).
+        When using <literal>RN</literal>, other format elements are ignored.
+      </para>
+    </listitem>
     </itemizedlist>
    </para>
 
diff --git a/src/backend/utils/adt/formatting.c b/src/backend/utils/adt/formatting.c
index 68069fcfd3..fb12bf91d0 100644
--- a/src/backend/utils/adt/formatting.c
+++ b/src/backend/utils/adt/formatting.c
@@ -49,7 +49,6 @@
  *	- better number building (formatting) / parsing, now it isn't
  *		  ideal code
  *	- use Assert()
- *	- add support for roman number to standard number conversion
  *	- add support for number spelling
  *	- add support for string to string formatting (we must be better
  *	  than Oracle :-),
@@ -270,6 +269,29 @@ static const char *const rm100[] = {"C", "CC", "CCC", "CD", "D", "DC", "DCC", "D
 static const char *const numTH[] = {"ST", "ND", "RD", "TH", NULL};
 static const char *const numth[] = {"st", "nd", "rd", "th", NULL};
 
+/* ----------
+ * MACRO: Check if the current and next characters
+ * form a valid subtraction combination for roman numerals
+ * ----------
+ */
+#define IS_VALID_SUB_COMB(curr, next) \
+	(((curr) == 'I' && ((next) == 'V' || (next) == 'X')) || \
+	 ((curr) == 'X' && ((next) == 'L' || (next) == 'C')) || \
+	 ((curr) == 'C' && ((next) == 'D' || (next) == 'M')))
+
+/* ----------
+ * MACRO: Roman number value
+ * ----------
+ */
+#define ROMAN_VAL(r) \
+	((r) == 'I' ? 1 : \
+	(r) == 'V' ? 5 : \
+	(r) == 'X' ? 10 : \
+	(r) == 'L' ? 50 : \
+	(r) == 'C' ? 100 : \
+	(r) == 'D' ? 500 : \
+	(r) == 'M' ? 1000 : 0)
+
 /* ----------
  * Flags & Options:
  * ----------
@@ -1074,6 +1096,7 @@ static bool do_to_timestamp(text *date_txt, text *fmt, Oid collid, bool std,
 static char *fill_str(char *str, int c, int max);
 static FormatNode *NUM_cache(int len, NUMDesc *Num, text *pars_str, bool *shouldFree);
 static char *int_to_roman(int number);
+static int roman_to_int(char* s, int len);
 static void NUM_prepare_locale(NUMProc *Np);
 static char *get_last_relevant_decnum(char *num);
 static void NUM_numpart_from_char(NUMProc *Np, int id, int input_len);
@@ -5236,6 +5259,99 @@ int_to_roman(int number)
 }
 
 
+static int
+roman_to_int(char* s, int len)
+{
+	int repeatCount = 1;
+	int vCount = 0, lCount = 0, dCount = 0;
+	bool subtractionEncountered = false;
+	char lastSubtractedChar = 0;
+	int total = 0;
+
+	if (len == 0 || len > 15)
+		return 0;
+
+	for (int i = 0; i < len; ++i)
+	{
+		char currChar = toupper(s[i]);
+		int currValue = ROMAN_VAL(currChar);
+
+		if (currValue == 0)
+			return 0;
+
+		/* Ensure no character greater than or equal to the subtracted
+		 * character appears after the subtraction.
+		 */
+		if (subtractionEncountered && (currValue >= ROMAN_VAL(lastSubtractedChar)))
+			return 0;
+
+		/* Check for invalid repetitions of characters V, L, or D. */
+		if (currChar == 'V') vCount++;
+		if (currChar == 'L') lCount++;
+		if (currChar == 'D') dCount++;
+		if (vCount > 1 || lCount > 1 || dCount > 1)
+			return 0;
+
+		if (i < len - 1)
+		{
+			char nextChar = toupper(s[i + 1]);
+			int nextValue = ROMAN_VAL(nextChar);
+
+			if (nextValue == 0)
+				return 0;
+
+			/* If the current value is less than the next value,
+			 * handle subtraction. Verify valid subtractive
+			 * combinations and update the total accordingly.
+			 */
+			if (currValue < nextValue)
+			{
+				/* Check for invalid repetitions of characters V, L, or D. */
+				if (nextChar == 'V') vCount++;
+				if (nextChar == 'L') lCount++;
+				if (nextChar == 'D') dCount++;
+				if (vCount > 1 || lCount > 1 || dCount > 1)
+					return 0;
+
+				/* for cases where the same character is repeated
+				 * with subtraction. Like 'MCCM' or 'DCCCD'.
+				 */
+				if (repeatCount > 1)
+					return 0;
+
+				if (!IS_VALID_SUB_COMB(currChar, nextChar))
+					return 0;
+
+				/* Skip the next character as it is part of
+				 * the subtractive combination.
+				 */
+				i++;
+				repeatCount = 1;
+				subtractionEncountered = true;
+				lastSubtractedChar = currChar;
+				total += (nextValue - currValue);
+			}
+			else
+			{
+				/* for same characters, check for repetition */
+				if (currChar == nextChar)
+				{
+					repeatCount++;
+					if (repeatCount > 3)
+						return 0;
+				}
+				else
+					repeatCount = 1;
+				total += currValue;
+			}
+		}
+		/* add the value of the last character */
+		else
+			total += currValue;
+	}
+
+	return total;
+}
 
 /* ----------
  * Locale
@@ -5787,6 +5903,7 @@ NUM_processor(FormatNode *node, NUMDesc *Num, char *inout,
 			   *Np = &_Np;
 	const char *pattern;
 	int			pattern_len;
+	int			roman_result;
 
 	MemSet(Np, 0, sizeof(NUMProc));
 
@@ -5817,9 +5934,19 @@ NUM_processor(FormatNode *node, NUMDesc *Num, char *inout,
 	if (IS_ROMAN(Np->Num))
 	{
 		if (!Np->is_to_char)
-			ereport(ERROR,
-					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-					 errmsg("\"RN\" not supported for input")));
+		{
+			roman_result = roman_to_int(inout, input_len);
+			if (roman_result == 0)
+				ereport(ERROR,
+					(errcode(ERRCODE_INVALID_ROMAN_NUMERAL),
+					 errmsg("invalid roman numeral")));
+			else
+			{
+				Np->Num->pre = sprintf(number, "%d", roman_result);
+				Np->Num->post = 0;
+				return number;
+			}
+		}
 
 		Np->Num->lsign = Np->Num->pre_lsign_num = Np->Num->post =
 			Np->Num->pre = Np->out_pre_spaces = Np->sign = 0;
diff --git a/src/backend/utils/errcodes.txt b/src/backend/utils/errcodes.txt
index b43a24d4bc..a64188da82 100644
--- a/src/backend/utils/errcodes.txt
+++ b/src/backend/utils/errcodes.txt
@@ -201,6 +201,7 @@ Section: Class 22 - Data Exception
 22P03    E    ERRCODE_INVALID_BINARY_REPRESENTATION                          invalid_binary_representation
 22P04    E    ERRCODE_BAD_COPY_FILE_FORMAT                                   bad_copy_file_format
 22P05    E    ERRCODE_UNTRANSLATABLE_CHARACTER                               untranslatable_character
+22P07    E    ERRCODE_INVALID_ROMAN_NUMERAL                                  invalid_roman_numeral
 2200L    E    ERRCODE_NOT_AN_XML_DOCUMENT                                    not_an_xml_document
 2200M    E    ERRCODE_INVALID_XML_DOCUMENT                                   invalid_xml_document
 2200N    E    ERRCODE_INVALID_XML_CONTENT                                    invalid_xml_content
diff --git a/src/test/regress/expected/numeric.out b/src/test/regress/expected/numeric.out
index f30ac236f5..d20603afc3 100644
--- a/src/test/regress/expected/numeric.out
+++ b/src/test/regress/expected/numeric.out
@@ -2297,6 +2297,46 @@ SELECT to_number('42nd', '99th');
         42
 (1 row)
 
+SELECT to_number('XIV', 'RN');
+ to_number 
+-----------
+        14
+(1 row)
+
+SELECT to_number('DCCCXLV', 'rn');
+ to_number 
+-----------
+       845
+(1 row)
+
+SELECT to_number('mmxxiv', 'RN');
+ to_number 
+-----------
+      2024
+(1 row)
+
+SELECT to_number('MMMCMXCIX', 'RN');
+ to_number 
+-----------
+      3999
+(1 row)
+
+SELECT to_number('CvIiI', 'rn');
+ to_number 
+-----------
+       108
+(1 row)
+
+SELECT to_number('viv', 'RN');
+ERROR:  invalid roman numeral
+SELECT to_number('DCCCD', 'RN');
+ERROR:  invalid roman numeral
+SELECT to_number('XIXL', 'RN');
+ERROR:  invalid roman numeral
+SELECT to_number('MCCM', 'RN');
+ERROR:  invalid roman numeral
+SELECT to_number('', 'RN');
+ERROR:  invalid roman numeral
 RESET lc_numeric;
 --
 -- Input syntax
diff --git a/src/test/regress/sql/numeric.sql b/src/test/regress/sql/numeric.sql
index c86395209a..42331558d1 100644
--- a/src/test/regress/sql/numeric.sql
+++ b/src/test/regress/sql/numeric.sql
@@ -1070,6 +1070,16 @@ SELECT to_number('$1,234.56','L99,999.99');
 SELECT to_number('1234.56','L99,999.99');
 SELECT to_number('1,234.56','L99,999.99');
 SELECT to_number('42nd', '99th');
+SELECT to_number('XIV', 'RN');
+SELECT to_number('DCCCXLV', 'rn');
+SELECT to_number('mmxxiv', 'RN');
+SELECT to_number('MMMCMXCIX', 'RN');
+SELECT to_number('CvIiI', 'rn');
+SELECT to_number('viv', 'RN');
+SELECT to_number('DCCCD', 'RN');
+SELECT to_number('XIXL', 'RN');
+SELECT to_number('MCCM', 'RN');
+SELECT to_number('', 'RN');
 RESET lc_numeric;
 
 --
-- 
2.34.1

Aleksander Alekseev

aleksander@timescale.com

over 1 year ago

In reply to: Hunaid Sohail (#5)

Re: [PATCH] Add roman support for to_number function

Hi,

I have attached a new patch v2 with following changes:

- Handled invalid cases like 'viv', 'lxl', and 'dcd'.
- Changed errcode to 22P07 because 22P06 was already taken.
- Removed TODO.
- Added a few positive & negative tests.
- Updated documentation.

Looking forward to your feedback.

While playing with the patch I noticed that to_char(..., 'RN') doesn't
seem to be test-covered. I suggest adding the following test:

```
WITH rows AS (
SELECT i, to_char(i, 'FMRN') AS roman
FROM generate_series(1, 3999) AS i
) SELECT bool_and(to_number(roman, 'RN') = i) FROM rows;

bool_and
----------
t
```

... in order to fix this while on it. The query takes ~12 ms on my laptop.

--
Best regards,
Aleksander Alekseev

Hunaid Sohail

hunaidpgml@gmail.com

over 1 year ago

In reply to: Aleksander Alekseev (#6)

Re: [PATCH] Add roman support for to_number function

On Thu, Sep 5, 2024 at 2:41 PM Aleksander Alekseev <aleksander@timescale.com>
wrote:

While playing with the patch I noticed that to_char(..., 'RN') doesn't
seem to be test-covered. I suggest adding the following test:

```
WITH rows AS (
SELECT i, to_char(i, 'FMRN') AS roman
FROM generate_series(1, 3999) AS i
) SELECT bool_and(to_number(roman, 'RN') = i) FROM rows;

bool_and
----------
t
```

I also noticed there are no tests for to_char roman format. The test you
provided covers roman format in both to_char and to_number. I will add it.
Thank you.

Regards,
Hunaid Sohail

Hunaid Sohail

hunaidpgml@gmail.com

over 1 year ago

In reply to: Hunaid Sohail (#7)

1 attachment(s)

Re: [PATCH] Add roman support for to_number function

Hi,

On Thu, Sep 5, 2024 at 2:41 PM Aleksander Alekseev <aleksander@timescale.com>

wrote:

While playing with the patch I noticed that to_char(..., 'RN') doesn't
seem to be test-covered. I suggest adding the following test:

```
WITH rows AS (
SELECT i, to_char(i, 'FMRN') AS roman
FROM generate_series(1, 3999) AS i
) SELECT bool_and(to_number(roman, 'RN') = i) FROM rows;

bool_and
----------
t
```

I have added this test in the attached patch (v3). Thank you once again,
Aleksander, for the suggestion.

Regards,
Hunaid Sohail

Attachments:

v3-0001-Add-RN-rn-support-for-to_number-function.patchapplication/octet-stream; name=v3-0001-Add-RN-rn-support-for-to_number-function.patchDownload

From 325f8918b5fc03097ce6397f023738fbbe41546b Mon Sep 17 00:00:00 2001
From: Hunaid Sohail <hunaid2000@gmail.com>
Date: Sat, 7 Sep 2024 14:14:11 +0500
Subject: [PATCH v3] Add RN/rn support for to_number function

---
 doc/src/sgml/func.sgml                |  11 ++-
 src/backend/utils/adt/formatting.c    | 135 +++++++++++++++++++++++++-
 src/backend/utils/errcodes.txt        |   1 +
 src/test/regress/expected/numeric.out |  27 ++++++
 src/test/regress/sql/numeric.sql      |  14 +++
 5 files changed, 183 insertions(+), 5 deletions(-)

diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index 461fc3f437..e8adf28940 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -8628,7 +8628,7 @@ SELECT regexp_match('abc01234xyz', '(?:(.*?)(\d+)(.*)){1,1}');
        </row>
        <row>
         <entry><literal>RN</literal></entry>
-        <entry>Roman numeral (input between 1 and 3999)</entry>
+        <entry>Roman numeral (valid for numbers 1 to 3999)</entry>
        </row>
        <row>
         <entry><literal>TH</literal> or <literal>th</literal></entry>
@@ -8754,6 +8754,15 @@ SELECT regexp_match('abc01234xyz', '(?:(.*?)(\d+)(.*)){1,1}');
        (e.g., <literal>9.99EEEE</literal> is a valid pattern).
       </para>
      </listitem>
+
+    <listitem>
+      <para>
+        In <function>to_number</function>, <literal>RN</literal> pattern converts
+        roman numerals to standard numbers. It is case-insensitive (e.g., <literal>'XIV'</literal>,
+        <literal>'xiv'</literal>, and <literal>'Xiv'</literal> are all seen as <literal>14</literal>).
+        When using <literal>RN</literal>, other format elements are ignored.
+      </para>
+    </listitem>
     </itemizedlist>
    </para>
 
diff --git a/src/backend/utils/adt/formatting.c b/src/backend/utils/adt/formatting.c
index 68069fcfd3..fb12bf91d0 100644
--- a/src/backend/utils/adt/formatting.c
+++ b/src/backend/utils/adt/formatting.c
@@ -49,7 +49,6 @@
  *	- better number building (formatting) / parsing, now it isn't
  *		  ideal code
  *	- use Assert()
- *	- add support for roman number to standard number conversion
  *	- add support for number spelling
  *	- add support for string to string formatting (we must be better
  *	  than Oracle :-),
@@ -270,6 +269,29 @@ static const char *const rm100[] = {"C", "CC", "CCC", "CD", "D", "DC", "DCC", "D
 static const char *const numTH[] = {"ST", "ND", "RD", "TH", NULL};
 static const char *const numth[] = {"st", "nd", "rd", "th", NULL};
 
+/* ----------
+ * MACRO: Check if the current and next characters
+ * form a valid subtraction combination for roman numerals
+ * ----------
+ */
+#define IS_VALID_SUB_COMB(curr, next) \
+	(((curr) == 'I' && ((next) == 'V' || (next) == 'X')) || \
+	 ((curr) == 'X' && ((next) == 'L' || (next) == 'C')) || \
+	 ((curr) == 'C' && ((next) == 'D' || (next) == 'M')))
+
+/* ----------
+ * MACRO: Roman number value
+ * ----------
+ */
+#define ROMAN_VAL(r) \
+	((r) == 'I' ? 1 : \
+	(r) == 'V' ? 5 : \
+	(r) == 'X' ? 10 : \
+	(r) == 'L' ? 50 : \
+	(r) == 'C' ? 100 : \
+	(r) == 'D' ? 500 : \
+	(r) == 'M' ? 1000 : 0)
+
 /* ----------
  * Flags & Options:
  * ----------
@@ -1074,6 +1096,7 @@ static bool do_to_timestamp(text *date_txt, text *fmt, Oid collid, bool std,
 static char *fill_str(char *str, int c, int max);
 static FormatNode *NUM_cache(int len, NUMDesc *Num, text *pars_str, bool *shouldFree);
 static char *int_to_roman(int number);
+static int roman_to_int(char* s, int len);
 static void NUM_prepare_locale(NUMProc *Np);
 static char *get_last_relevant_decnum(char *num);
 static void NUM_numpart_from_char(NUMProc *Np, int id, int input_len);
@@ -5236,6 +5259,99 @@ int_to_roman(int number)
 }
 
 
+static int
+roman_to_int(char* s, int len)
+{
+	int repeatCount = 1;
+	int vCount = 0, lCount = 0, dCount = 0;
+	bool subtractionEncountered = false;
+	char lastSubtractedChar = 0;
+	int total = 0;
+
+	if (len == 0 || len > 15)
+		return 0;
+
+	for (int i = 0; i < len; ++i)
+	{
+		char currChar = toupper(s[i]);
+		int currValue = ROMAN_VAL(currChar);
+
+		if (currValue == 0)
+			return 0;
+
+		/* Ensure no character greater than or equal to the subtracted
+		 * character appears after the subtraction.
+		 */
+		if (subtractionEncountered && (currValue >= ROMAN_VAL(lastSubtractedChar)))
+			return 0;
+
+		/* Check for invalid repetitions of characters V, L, or D. */
+		if (currChar == 'V') vCount++;
+		if (currChar == 'L') lCount++;
+		if (currChar == 'D') dCount++;
+		if (vCount > 1 || lCount > 1 || dCount > 1)
+			return 0;
+
+		if (i < len - 1)
+		{
+			char nextChar = toupper(s[i + 1]);
+			int nextValue = ROMAN_VAL(nextChar);
+
+			if (nextValue == 0)
+				return 0;
+
+			/* If the current value is less than the next value,
+			 * handle subtraction. Verify valid subtractive
+			 * combinations and update the total accordingly.
+			 */
+			if (currValue < nextValue)
+			{
+				/* Check for invalid repetitions of characters V, L, or D. */
+				if (nextChar == 'V') vCount++;
+				if (nextChar == 'L') lCount++;
+				if (nextChar == 'D') dCount++;
+				if (vCount > 1 || lCount > 1 || dCount > 1)
+					return 0;
+
+				/* for cases where the same character is repeated
+				 * with subtraction. Like 'MCCM' or 'DCCCD'.
+				 */
+				if (repeatCount > 1)
+					return 0;
+
+				if (!IS_VALID_SUB_COMB(currChar, nextChar))
+					return 0;
+
+				/* Skip the next character as it is part of
+				 * the subtractive combination.
+				 */
+				i++;
+				repeatCount = 1;
+				subtractionEncountered = true;
+				lastSubtractedChar = currChar;
+				total += (nextValue - currValue);
+			}
+			else
+			{
+				/* for same characters, check for repetition */
+				if (currChar == nextChar)
+				{
+					repeatCount++;
+					if (repeatCount > 3)
+						return 0;
+				}
+				else
+					repeatCount = 1;
+				total += currValue;
+			}
+		}
+		/* add the value of the last character */
+		else
+			total += currValue;
+	}
+
+	return total;
+}
 
 /* ----------
  * Locale
@@ -5787,6 +5903,7 @@ NUM_processor(FormatNode *node, NUMDesc *Num, char *inout,
 			   *Np = &_Np;
 	const char *pattern;
 	int			pattern_len;
+	int			roman_result;
 
 	MemSet(Np, 0, sizeof(NUMProc));
 
@@ -5817,9 +5934,19 @@ NUM_processor(FormatNode *node, NUMDesc *Num, char *inout,
 	if (IS_ROMAN(Np->Num))
 	{
 		if (!Np->is_to_char)
-			ereport(ERROR,
-					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-					 errmsg("\"RN\" not supported for input")));
+		{
+			roman_result = roman_to_int(inout, input_len);
+			if (roman_result == 0)
+				ereport(ERROR,
+					(errcode(ERRCODE_INVALID_ROMAN_NUMERAL),
+					 errmsg("invalid roman numeral")));
+			else
+			{
+				Np->Num->pre = sprintf(number, "%d", roman_result);
+				Np->Num->post = 0;
+				return number;
+			}
+		}
 
 		Np->Num->lsign = Np->Num->pre_lsign_num = Np->Num->post =
 			Np->Num->pre = Np->out_pre_spaces = Np->sign = 0;
diff --git a/src/backend/utils/errcodes.txt b/src/backend/utils/errcodes.txt
index b43a24d4bc..a64188da82 100644
--- a/src/backend/utils/errcodes.txt
+++ b/src/backend/utils/errcodes.txt
@@ -201,6 +201,7 @@ Section: Class 22 - Data Exception
 22P03    E    ERRCODE_INVALID_BINARY_REPRESENTATION                          invalid_binary_representation
 22P04    E    ERRCODE_BAD_COPY_FILE_FORMAT                                   bad_copy_file_format
 22P05    E    ERRCODE_UNTRANSLATABLE_CHARACTER                               untranslatable_character
+22P07    E    ERRCODE_INVALID_ROMAN_NUMERAL                                  invalid_roman_numeral
 2200L    E    ERRCODE_NOT_AN_XML_DOCUMENT                                    not_an_xml_document
 2200M    E    ERRCODE_INVALID_XML_DOCUMENT                                   invalid_xml_document
 2200N    E    ERRCODE_INVALID_XML_CONTENT                                    invalid_xml_content
diff --git a/src/test/regress/expected/numeric.out b/src/test/regress/expected/numeric.out
index f30ac236f5..b29747034a 100644
--- a/src/test/regress/expected/numeric.out
+++ b/src/test/regress/expected/numeric.out
@@ -2297,6 +2297,33 @@ SELECT to_number('42nd', '99th');
         42
 (1 row)
 
+-- Test for correct conversion between numbers and Roman numerals
+WITH rows AS
+  (SELECT i, to_char(i, 'FMRN') AS roman FROM generate_series(1, 3999) AS i)
+SELECT
+  bool_and(to_number(roman, 'RN') = i) as valid
+FROM rows;
+ valid 
+-------
+ t
+(1 row)
+
+SELECT to_number('CvIiI', 'rn');
+ to_number 
+-----------
+       108
+(1 row)
+
+SELECT to_number('viv', 'RN');
+ERROR:  invalid roman numeral
+SELECT to_number('DCCCD', 'RN');
+ERROR:  invalid roman numeral
+SELECT to_number('XIXL', 'RN');
+ERROR:  invalid roman numeral
+SELECT to_number('MCCM', 'RN');
+ERROR:  invalid roman numeral
+SELECT to_number('', 'RN');
+ERROR:  invalid roman numeral
 RESET lc_numeric;
 --
 -- Input syntax
diff --git a/src/test/regress/sql/numeric.sql b/src/test/regress/sql/numeric.sql
index c86395209a..b3a09758ff 100644
--- a/src/test/regress/sql/numeric.sql
+++ b/src/test/regress/sql/numeric.sql
@@ -1070,6 +1070,20 @@ SELECT to_number('$1,234.56','L99,999.99');
 SELECT to_number('1234.56','L99,999.99');
 SELECT to_number('1,234.56','L99,999.99');
 SELECT to_number('42nd', '99th');
+
+-- Test for correct conversion between numbers and Roman numerals
+WITH rows AS
+  (SELECT i, to_char(i, 'FMRN') AS roman FROM generate_series(1, 3999) AS i)
+SELECT
+  bool_and(to_number(roman, 'RN') = i) as valid
+FROM rows;
+
+SELECT to_number('CvIiI', 'rn');
+SELECT to_number('viv', 'RN');
+SELECT to_number('DCCCD', 'RN');
+SELECT to_number('XIXL', 'RN');
+SELECT to_number('MCCM', 'RN');
+SELECT to_number('', 'RN');
 RESET lc_numeric;
 
 --
-- 
2.34.1

Maciek Sakrejda

m.sakrejda@gmail.com

over 1 year ago

In reply to: Hunaid Sohail (#8)

Re: [PATCH] Add roman support for to_number function

The following review has been posted through the commitfest application:
make installcheck-world: tested, passed
Implements feature: tested, failed
Spec compliant: tested, failed
Documentation: tested, passed

Tested again, and the patch looks good. It does not accept leading or trailing whitespace, which seems reasonable, given the unclear behavior of to_number with other format strings. It also rejects less common Roman spellings like "IIII". I don't feel strongly about that one way or the other, but perhaps a test codifying that behavior would be useful to make it clear it's intentional.

I'm marking it RfC.

The new status of this patch is: Ready for Committer

#10

Maciek Sakrejda

m.sakrejda@gmail.com

over 1 year ago

In reply to: Maciek Sakrejda (#9)

Re: [PATCH] Add roman support for to_number function

Sorry, it looks like I failed to accurately log my review in the
review app due to the current broken layout issues [1]/messages/by-id/CAD68Dp1GgTeBiA0YVWXpfPCMC7=nqBCLn6+huOkWURcKRnFw5g@mail.gmail.com. The summary
should be:

make installcheck-world: tested, passed
Implements feature: tested, passed
Spec compliant: not tested (not sure what the spec has to
say about this)
Documentation: tested, passed

[1]: /messages/by-id/CAD68Dp1GgTeBiA0YVWXpfPCMC7=nqBCLn6+huOkWURcKRnFw5g@mail.gmail.com

#11

Tom Lane

tgl@sss.pgh.pa.us

over 1 year ago

In reply to: Maciek Sakrejda (#9)

Re: [PATCH] Add roman support for to_number function

Maciek Sakrejda <m.sakrejda@gmail.com> writes:

Tested again, and the patch looks good. It does not accept leading or trailing whitespace, which seems reasonable, given the unclear behavior of to_number with other format strings. It also rejects less common Roman spellings like "IIII". I don't feel strongly about that one way or the other, but perhaps a test codifying that behavior would be useful to make it clear it's intentional.

Yeah, I don't have a strong feeling about that either, but probably
being strict is better. to_number has a big problem with "garbage in
garbage out" already, and being lax will make that worse.

A few notes from a quick read of the patch:

* roman_to_int() should have a header comment, notably explaining its
result convention. I find it fairly surprising that "0" means an
error --- I realize that Roman notation can't represent zero, but
wouldn't it be better to use -1?

* Do *not* rely on toupper(). There are multiple reasons why not,
but in this context the worst one is that in Turkish locales it's
likely to translate "i" to "İ", on which you will fail. I'd use
pg_ascii_toupper().

* I think roman_to_int() is under-commented internally too.
To start with, where did the magic "15" come from? And why
should we have such a test anyway --- what if the format allows
for trailing stuff after the roman numeral? (That is, I think
you need to fix this so that roman_to_int reports how much data
it ate, instead of assuming that it must read to the end of the
input.) The logic for detecting invalid numeral combinations
feels a little opaque too. Do you have a source for the rules
you're implementing, and if so could you cite it?

* This code will get run through pgindent before commit, so you
might want to revisit whether your comments will still look nice
afterwards. There's not a lot of consistency in them about initial
cap versus lower case or trailing period versus not, too.

* roman_result could be declared where used, no?

* I'm not really convinced that we need a new errcode
ERRCODE_INVALID_ROMAN_NUMERAL rather than using a generic one like
ERRCODE_INVALID_TEXT_REPRESENTATION. However, if we do do it
like that, why did you pick the out-of-sequence value 22P07?

* Might be good to have a few test cases demonstrating what happens
when there's other stuff combined with the RN format spec. Notably,
even if RN itself won't eat whitespace, there had better be a way
to write the format string to allow that.

* Further to Aleksander's point about lack of test coverage for
the to_char direction, I see from
https://coverage.postgresql.org/src/backend/utils/adt/formatting.c.gcov.html
that almost none of the existing roman-number code paths are covered
today. While it's not strictly within the charter of this patch
to improve that, maybe we should try to do so --- at the very
least it'd raise formatting.c's coverage score a few notches.

regards, tom lane

#12

Tom Lane

tgl@sss.pgh.pa.us

over 1 year ago

In reply to: Tom Lane (#11)

Re: [PATCH] Add roman support for to_number function

I wrote:

* Further to Aleksander's point about lack of test coverage for
the to_char direction, I see from
https://coverage.postgresql.org/src/backend/utils/adt/formatting.c.gcov.html
that almost none of the existing roman-number code paths are covered
today. While it's not strictly within the charter of this patch
to improve that, maybe we should try to do so --- at the very
least it'd raise formatting.c's coverage score a few notches.

For the archives' sake: I created a patch and a separate discussion
thread [1]/messages/by-id/2956175.1725831136@sss.pgh.pa.us for that.

regards, tom lane

[1]: /messages/by-id/2956175.1725831136@sss.pgh.pa.us

#13

Hunaid Sohail

hunaidpgml@gmail.com

over 1 year ago

In reply to: Tom Lane (#11)

Re: [PATCH] Add roman support for to_number function

Hi,

On Sun, Sep 8, 2024 at 4:09 AM Tom Lane <tgl@sss.pgh.pa.us> wrote:

A few notes from a quick read of the patch:

* roman_to_int() should have a header comment, notably explaining its
result convention. I find it fairly surprising that "0" means an
error --- I realize that Roman notation can't represent zero, but
wouldn't it be better to use -1?

Noted.

* Do *not* rely on toupper(). There are multiple reasons why not,
but in this context the worst one is that in Turkish locales it's
likely to translate "i" to "İ", on which you will fail. I'd use
pg_ascii_toupper().

Noted.

* I think roman_to_int() is under-commented internally too.
To start with, where did the magic "15" come from? And why
should we have such a test anyway --- what if the format allows
for trailing stuff after the roman numeral? (That is, I think
you need to fix this so that roman_to_int reports how much data
it ate, instead of assuming that it must read to the end of the
input.)

MMMDCCCLXXXVIII is the longest valid standard roman numeral (15
characters). I will add appropriate comment on length check.

I am not sure I am able to understand the latter part. Could you please
elaborate it?
Are you referring to cases like these?
SELECT to_number('CXXIII foo', 'RN foo');
SELECT to_number('CXXIII foo', 'RN');
Please check my comments on Oracle's behaviour below.

The logic for detecting invalid numeral combinations
feels a little opaque too. Do you have a source for the rules
you're implementing, and if so could you cite it?

There are many sources on the internet.
A few sources:
1. https://www.toppr.com/guides/maths/knowing-our-numbers/roman-numerals/
2. https://www.cuemath.com/numbers/roman-numerals/

Note that we are following the standard form:
https://en.wikipedia.org/wiki/Roman_numerals#Standard_form

* This code will get run through pgindent before commit, so you
might want to revisit whether your comments will still look nice
afterwards. There's not a lot of consistency in them about initial
cap versus lower case or trailing period versus not, too.

Noted.

* roman_result could be declared where used, no?

Noted.

* I'm not really convinced that we need a new errcode
ERRCODE_INVALID_ROMAN_NUMERAL rather than using a generic one like
ERRCODE_INVALID_TEXT_REPRESENTATION. However, if we do do it
like that, why did you pick the out-of-sequence value 22P07?

22P07 is not out-of-sequence. 22P01 to 22P06 are already used.
However, I do agree with you that we can use
ERRCODE_INVALID_TEXT_REPRESENTATION. I will change it.

* Might be good to have a few test cases demonstrating what happens
when there's other stuff combined with the RN format spec. Notably,
even if RN itself won't eat whitespace, there had better be a way
to write the format string to allow that.

The current patch (v3) simply ignores other formats with RN except when RN
is used EEEE which returns error.
```
postgres=# SELECT to_number('CXXIII', 'foo RN');
to_number
-----------
123
(1 row)

postgres=# SELECT to_number('CXXIII', 'MIrn');
to_number
-----------
123
(1 row)

postgres=# SELECT to_number('CXXIII', 'EEEErn');
ERROR: "EEEE" must be the last pattern used
```

However, I think that other formats except FM should be rejected when used
with RN in NUMDesc_prepare function. Any opinions?

If we look into Oracle, they are strict in this regard and don't allow any
other format with RN.
1. SELECT to_char(12, 'MIrn') from dual;
2. SELECT to_char(12, 'foo RN') from dual;
results in error:
ORA-01481: invalid number format model

I also found that there is no check when RN is used twice (both in to_char
and to_number) and that results in unexpected output.

```
postgres=# SELECT to_number('CXXIII', 'RNrn');
to_number
-----------
123
(1 row)

postgres=# SELECT to_char(12, 'RNrn');
to_char
--------------------------------
XII xii
(1 row)

postgres=# SELECT to_char(12, 'rnrn');
to_char
--------------------------------
xii xii
(1 row)

postgres=# SELECT to_char(12, 'FMrnrn');
to_char
---------
xiixii
(1 row)
```

* Further to Aleksander's point about lack of test coverage for
the to_char direction, I see from

https://coverage.postgresql.org/src/backend/utils/adt/formatting.c.gcov.html
that almost none of the existing roman-number code paths are covered
today. While it's not strictly within the charter of this patch
to improve that, maybe we should try to do so --- at the very
least it'd raise formatting.c's coverage score a few notches.

I see that you have provided a patch to increase test coverage of to_char
roman format including some other formats. I will try to add more tests for
the to_number roman format.

I will provide the next patch soon.

Regards,
Hunaid Sohail

#14

Hunaid Sohail

hunaidpgml@gmail.com

over 1 year ago

In reply to: Hunaid Sohail (#13)

Re: [PATCH] Add roman support for to_number function

Hi,

I have started working on the next version of the patch and have addressed
the smaller issues identified by Tom. Before proceeding further, I would
like to have opinions on some comments/questions in my previous email.

Regards,
Hunaid Sohail

On Mon, Sep 9, 2024 at 5:45 PM Hunaid Sohail <hunaidpgml@gmail.com> wrote:

Show quoted text

Hi,

On Sun, Sep 8, 2024 at 4:09 AM Tom Lane <tgl@sss.pgh.pa.us> wrote:

A few notes from a quick read of the patch:

* roman_to_int() should have a header comment, notably explaining its
result convention. I find it fairly surprising that "0" means an
error --- I realize that Roman notation can't represent zero, but
wouldn't it be better to use -1?

Noted.

* Do *not* rely on toupper(). There are multiple reasons why not,
but in this context the worst one is that in Turkish locales it's
likely to translate "i" to "İ", on which you will fail. I'd use
pg_ascii_toupper().

Noted.

* I think roman_to_int() is under-commented internally too.
To start with, where did the magic "15" come from? And why
should we have such a test anyway --- what if the format allows
for trailing stuff after the roman numeral? (That is, I think
you need to fix this so that roman_to_int reports how much data
it ate, instead of assuming that it must read to the end of the
input.)

MMMDCCCLXXXVIII is the longest valid standard roman numeral (15
characters). I will add appropriate comment on length check.

I am not sure I am able to understand the latter part. Could you please
elaborate it?
Are you referring to cases like these?
SELECT to_number('CXXIII foo', 'RN foo');
SELECT to_number('CXXIII foo', 'RN');
Please check my comments on Oracle's behaviour below.

The logic for detecting invalid numeral combinations
feels a little opaque too. Do you have a source for the rules
you're implementing, and if so could you cite it?

There are many sources on the internet.
A few sources:
1. https://www.toppr.com/guides/maths/knowing-our-numbers/roman-numerals/
2. https://www.cuemath.com/numbers/roman-numerals/

Note that we are following the standard form:
https://en.wikipedia.org/wiki/Roman_numerals#Standard_form

* This code will get run through pgindent before commit, so you
might want to revisit whether your comments will still look nice
afterwards. There's not a lot of consistency in them about initial
cap versus lower case or trailing period versus not, too.

Noted.

* roman_result could be declared where used, no?

Noted.

* I'm not really convinced that we need a new errcode
ERRCODE_INVALID_ROMAN_NUMERAL rather than using a generic one like
ERRCODE_INVALID_TEXT_REPRESENTATION. However, if we do do it
like that, why did you pick the out-of-sequence value 22P07?

22P07 is not out-of-sequence. 22P01 to 22P06 are already used.
However, I do agree with you that we can use
ERRCODE_INVALID_TEXT_REPRESENTATION. I will change it.

* Might be good to have a few test cases demonstrating what happens
when there's other stuff combined with the RN format spec. Notably,
even if RN itself won't eat whitespace, there had better be a way
to write the format string to allow that.

The current patch (v3) simply ignores other formats with RN except when RN
is used EEEE which returns error.
```
postgres=# SELECT to_number('CXXIII', 'foo RN');
to_number
-----------
123
(1 row)

postgres=# SELECT to_number('CXXIII', 'MIrn');
to_number
-----------
123
(1 row)

postgres=# SELECT to_number('CXXIII', 'EEEErn');
ERROR: "EEEE" must be the last pattern used
```

However, I think that other formats except FM should be rejected when used
with RN in NUMDesc_prepare function. Any opinions?

If we look into Oracle, they are strict in this regard and don't allow any
other format with RN.
1. SELECT to_char(12, 'MIrn') from dual;
2. SELECT to_char(12, 'foo RN') from dual;
results in error:
ORA-01481: invalid number format model

I also found that there is no check when RN is used twice (both in to_char
and to_number) and that results in unexpected output.

```
postgres=# SELECT to_number('CXXIII', 'RNrn');
to_number
-----------
123
(1 row)

postgres=# SELECT to_char(12, 'RNrn');
to_char
--------------------------------
XII xii
(1 row)

postgres=# SELECT to_char(12, 'rnrn');
to_char
--------------------------------
xii xii
(1 row)

postgres=# SELECT to_char(12, 'FMrnrn');
to_char
---------
xiixii
(1 row)
```

* Further to Aleksander's point about lack of test coverage for
the to_char direction, I see from

https://coverage.postgresql.org/src/backend/utils/adt/formatting.c.gcov.html
that almost none of the existing roman-number code paths are covered
today. While it's not strictly within the charter of this patch
to improve that, maybe we should try to do so --- at the very
least it'd raise formatting.c's coverage score a few notches.

I see that you have provided a patch to increase test coverage of to_char
roman format including some other formats. I will try to add more tests for
the to_number roman format.

I will provide the next patch soon.

Regards,
Hunaid Sohail

#15

Hunaid Sohail

hunaidpgml@gmail.com

over 1 year ago

In reply to: Hunaid Sohail (#14)

1 attachment(s)

Re: [PATCH] Add roman support for to_number function

Hi,

I have attached a new patch with following changes:
- Addressed minor issues identified by Tom.
- Rejected other formats with RN and updated the docs.
- Added a few more tests.

Regards,
Hunaid Sohail

Show quoted text

Attachments:

v4-0001-Add-roman-support-for-to_number-function.patchapplication/octet-stream; name=v4-0001-Add-roman-support-for-to_number-function.patchDownload

From e3392f75c3f5f72c5a520706974f38891730ed29 Mon Sep 17 00:00:00 2001
From: Hunaid Sohail <hunaid2000@gmail.com>
Date: Thu, 19 Sep 2024 12:36:08 +0500
Subject: [PATCH v4] Add roman support for to_number function

---
 doc/src/sgml/func.sgml                |  13 ++-
 src/backend/utils/adt/formatting.c    | 154 +++++++++++++++++++++++++-
 src/test/regress/expected/numeric.out |  41 +++++++
 src/test/regress/sql/numeric.sql      |  21 ++++
 4 files changed, 224 insertions(+), 5 deletions(-)

diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index 6f75bd0c7d..6a2afe4dcd 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -8628,7 +8628,7 @@ SELECT regexp_match('abc01234xyz', '(?:(.*?)(\d+)(.*)){1,1}');
        </row>
        <row>
         <entry><literal>RN</literal></entry>
-        <entry>Roman numeral (input between 1 and 3999)</entry>
+        <entry>Roman numeral (valid for numbers 1 to 3999)</entry>
        </row>
        <row>
         <entry><literal>TH</literal> or <literal>th</literal></entry>
@@ -8754,6 +8754,17 @@ SELECT regexp_match('abc01234xyz', '(?:(.*?)(\d+)(.*)){1,1}');
        (e.g., <literal>9.99EEEE</literal> is a valid pattern).
       </para>
      </listitem>
+
+    <listitem>
+      <para>
+        In <function>to_number</function>, <literal>RN</literal> pattern converts
+        roman numerals to standard numbers. It is case-insensitive (e.g., <literal>'XIV'</literal>,
+        <literal>'xiv'</literal>, and <literal>'Xiv'</literal> are all seen as <literal>14</literal>).
+        <literal>RN</literal> cannot be used in combination with any other formatting patterns
+        or modifiers, with the exception of <literal>FM</literal>, which is applicable only in
+        <function>to_char()</function> and is ignored in <function>to_number()</function>.
+      </para>
+    </listitem>
     </itemizedlist>
    </para>
 
diff --git a/src/backend/utils/adt/formatting.c b/src/backend/utils/adt/formatting.c
index 68fa89418f..30188030c1 100644
--- a/src/backend/utils/adt/formatting.c
+++ b/src/backend/utils/adt/formatting.c
@@ -49,7 +49,6 @@
  *	- better number building (formatting) / parsing, now it isn't
  *		  ideal code
  *	- use Assert()
- *	- add support for roman number to standard number conversion
  *	- add support for number spelling
  *	- add support for string to string formatting (we must be better
  *	  than Oracle :-),
@@ -270,6 +269,31 @@ static const char *const rm100[] = {"C", "CC", "CCC", "CD", "D", "DC", "DCC", "D
 static const char *const numTH[] = {"ST", "ND", "RD", "TH", NULL};
 static const char *const numth[] = {"st", "nd", "rd", "th", NULL};
 
+/* ----------
+ * MACRO: Check if the current and next characters
+ * form a valid subtraction combination for roman numerals
+ * ----------
+ */
+#define IS_VALID_SUB_COMB(curr, next) \
+	(((curr) == 'I' && ((next) == 'V' || (next) == 'X')) || \
+	 ((curr) == 'X' && ((next) == 'L' || (next) == 'C')) || \
+	 ((curr) == 'C' && ((next) == 'D' || (next) == 'M')))
+
+/* ----------
+ * MACRO: Roman number value
+ * ----------
+ */
+#define ROMAN_VAL(r) \
+	((r) == 'I' ? 1 : \
+	(r) == 'V' ? 5 : \
+	(r) == 'X' ? 10 : \
+	(r) == 'L' ? 50 : \
+	(r) == 'C' ? 100 : \
+	(r) == 'D' ? 500 : \
+	(r) == 'M' ? 1000 : 0)
+
+#define MAX_ROMAN_LEN	15
+
 /* ----------
  * Flags & Options:
  * ----------
@@ -1074,6 +1098,7 @@ static bool do_to_timestamp(text *date_txt, text *fmt, Oid collid, bool std,
 static char *fill_str(char *str, int c, int max);
 static FormatNode *NUM_cache(int len, NUMDesc *Num, text *pars_str, bool *shouldFree);
 static char *int_to_roman(int number);
+static int roman_to_int(char* s, int len);
 static void NUM_prepare_locale(NUMProc *Np);
 static char *get_last_relevant_decnum(char *num);
 static void NUM_numpart_from_char(NUMProc *Np, int id, int input_len);
@@ -1284,6 +1309,15 @@ NUMDesc_prepare(NUMDesc *num, FormatNode *n)
 
 		case NUM_rn:
 		case NUM_RN:
+			if (IS_ROMAN(num))
+				ereport(ERROR,
+						(errcode(ERRCODE_SYNTAX_ERROR),
+						 errmsg("cannot use \"RN\" twice")));
+			if (IS_BLANK(num) || IS_LSIGN(num) || IS_BRACKET(num) ||
+				IS_MINUS(num) || IS_PLUS(num) || IS_MULTI(num))
+				ereport(ERROR,
+						(errcode(ERRCODE_SYNTAX_ERROR),
+						 errmsg("\"RN\" is incompatible with other formats")));
 			num->flag |= NUM_F_ROMAN;
 			break;
 
@@ -5232,7 +5266,107 @@ int_to_roman(int number)
 	return result;
 }
 
+/* Convert a standard roman numeral to an integer.
+ * Result is an integer between 1 and 3999.
+ *
+ * If input is invalid, return -1.
+ */
+static int
+roman_to_int(char* s, int len)
+{
+	int repeatCount = 1;
+	int vCount = 0, lCount = 0, dCount = 0;
+	bool subtractionEncountered = false;
+	char lastSubtractedChar = 0;
+	int total = 0;
+
+	/* Ensure the input is not too long or empty.
+	 * 'MMMDCCCLXXXVIII' (3888) is the longest valid roman numeral (15 chars).
+	 */
+	if (len == 0 || len > MAX_ROMAN_LEN)
+		return -1;
+
+	for (int i = 0; i < len; ++i)
+	{
+		char currChar = pg_ascii_toupper(s[i]);
+		int currValue = ROMAN_VAL(currChar);
+
+		if (currValue == 0)
+			return -1;
+
+		/* Ensure no character greater than or equal to the subtracted
+		 * character appears after the subtraction.
+		 */
+		if (subtractionEncountered && (currValue >= ROMAN_VAL(lastSubtractedChar)))
+			return -1;
+
+		/* Check for invalid repetitions of characters V, L, or D. */
+		if (currChar == 'V') vCount++;
+		if (currChar == 'L') lCount++;
+		if (currChar == 'D') dCount++;
+		if (vCount > 1 || lCount > 1 || dCount > 1)
+			return -1;
+
+		if (i < len - 1)
+		{
+			char nextChar = pg_ascii_toupper(s[i + 1]);
+			int nextValue = ROMAN_VAL(nextChar);
+
+			if (nextValue == 0)
+				return -1;
+
+			/* If the current value is less than the next value,
+			 * handle subtraction. Verify valid subtractive
+			 * combinations and update the total accordingly.
+			 */
+			if (currValue < nextValue)
+			{
+				/* Check for invalid repetitions of characters V, L, or D. */
+				if (nextChar == 'V') vCount++;
+				if (nextChar == 'L') lCount++;
+				if (nextChar == 'D') dCount++;
+				if (vCount > 1 || lCount > 1 || dCount > 1)
+					return -1;
+
+				/* For cases where the same character is repeated
+				 * with subtraction (e.g. 'MCCM' or 'DCCCD').
+				 */
+				if (repeatCount > 1)
+					return -1;
 
+				if (!IS_VALID_SUB_COMB(currChar, nextChar))
+					return -1;
+
+				/* Skip the next character as it is part of
+				 * the subtractive combination.
+				 */
+				i++;
+				repeatCount = 1;
+				subtractionEncountered = true;
+				lastSubtractedChar = currChar;
+				total += (nextValue - currValue);
+			}
+			else
+			{
+				/* For same characters, check for repetition. */
+				if (currChar == nextChar)
+				{
+					repeatCount++;
+					if (repeatCount > 3)
+						return -1;
+				}
+				else
+					repeatCount = 1;
+				total += currValue;
+			}
+		}
+		/* Add the value of the last character. */
+		else
+			total += currValue;
+	}
+
+	return total;
+}
 
 /* ----------
  * Locale
@@ -5814,9 +5948,21 @@ NUM_processor(FormatNode *node, NUMDesc *Num, char *inout,
 	if (IS_ROMAN(Np->Num))
 	{
 		if (!Np->is_to_char)
-			ereport(ERROR,
-					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-					 errmsg("\"RN\" not supported for input")));
+		{
+			int			roman_result;
+
+			roman_result = roman_to_int(inout, input_len);
+			if (roman_result == -1)
+				ereport(ERROR,
+					(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+					 errmsg("invalid roman numeral")));
+			else
+			{
+				Np->Num->pre = sprintf(number, "%d", roman_result);
+				Np->Num->post = 0;
+				return number;
+			}
+		}
 
 		Np->Num->lsign = Np->Num->pre_lsign_num = Np->Num->post =
 			Np->Num->pre = Np->out_pre_spaces = Np->sign = 0;
diff --git a/src/test/regress/expected/numeric.out b/src/test/regress/expected/numeric.out
index f30ac236f5..91a3aa880c 100644
--- a/src/test/regress/expected/numeric.out
+++ b/src/test/regress/expected/numeric.out
@@ -2297,6 +2297,47 @@ SELECT to_number('42nd', '99th');
         42
 (1 row)
 
+-- Test for correct conversion between numbers and Roman numerals
+WITH rows AS
+  (SELECT i, to_char(i, 'FMRN') AS roman FROM generate_series(1, 3999) AS i)
+SELECT
+  bool_and(to_number(roman, 'RN') = i) as valid
+FROM rows;
+ valid 
+-------
+ t
+(1 row)
+
+SELECT to_number('CvIiI', 'rn');
+ to_number 
+-----------
+       108
+(1 row)
+
+SELECT to_number('viv', 'RN');
+ERROR:  invalid roman numeral
+SELECT to_number('DCCCD', 'RN');
+ERROR:  invalid roman numeral
+SELECT to_number('XIXL', 'RN');
+ERROR:  invalid roman numeral
+SELECT to_number('MCCM', 'RN');
+ERROR:  invalid roman numeral
+SELECT to_number('MMMM', 'RN');
+ERROR:  invalid roman numeral
+SELECT to_number('CM', 'MIRN');
+ERROR:  "RN" is incompatible with other formats
+SELECT to_number('CM', 'RNRN');
+ERROR:  cannot use "RN" twice
+SELECT to_number('  XIV  ', 'RN');
+ERROR:  invalid roman numeral
+SELECT to_number('MMXX  ', 'RN');
+ERROR:  invalid roman numeral
+SELECT to_number('M CC', 'RN');
+ERROR:  invalid roman numeral
+SELECT to_number('', 'RN');
+ERROR:  invalid roman numeral
+SELECT to_number(' ', 'RN');
+ERROR:  invalid roman numeral
 RESET lc_numeric;
 --
 -- Input syntax
diff --git a/src/test/regress/sql/numeric.sql b/src/test/regress/sql/numeric.sql
index c86395209a..97c10ba5f5 100644
--- a/src/test/regress/sql/numeric.sql
+++ b/src/test/regress/sql/numeric.sql
@@ -1070,6 +1070,27 @@ SELECT to_number('$1,234.56','L99,999.99');
 SELECT to_number('1234.56','L99,999.99');
 SELECT to_number('1,234.56','L99,999.99');
 SELECT to_number('42nd', '99th');
+
+-- Test for correct conversion between numbers and Roman numerals
+WITH rows AS
+  (SELECT i, to_char(i, 'FMRN') AS roman FROM generate_series(1, 3999) AS i)
+SELECT
+  bool_and(to_number(roman, 'RN') = i) as valid
+FROM rows;
+
+SELECT to_number('CvIiI', 'rn');
+SELECT to_number('viv', 'RN');
+SELECT to_number('DCCCD', 'RN');
+SELECT to_number('XIXL', 'RN');
+SELECT to_number('MCCM', 'RN');
+SELECT to_number('MMMM', 'RN');
+SELECT to_number('CM', 'MIRN');
+SELECT to_number('CM', 'RNRN');
+SELECT to_number('  XIV  ', 'RN');
+SELECT to_number('MMXX  ', 'RN');
+SELECT to_number('M CC', 'RN');
+SELECT to_number('', 'RN');
+SELECT to_number(' ', 'RN');
 RESET lc_numeric;
 
 --
-- 
2.34.1

#16

Hunaid Sohail

hunaidpgml@gmail.com

about 1 year ago

In reply to: Hunaid Sohail (#15)

1 attachment(s)

Re: [PATCH] Add roman support for to_number function

Hi,

I have attached a rebased patch if someone wants to review in the next CF.
No changes as compared to v4.

Regards,
Hunaid Sohail

Show quoted text

Attachments:

v5-0001-Add-roman-support-for-to_number-function.patchapplication/x-patch; name=v5-0001-Add-roman-support-for-to_number-function.patchDownload

From 951f3e9620f0d50fcdaff095652a07d6304b490c Mon Sep 17 00:00:00 2001
From: Hunaid Sohail <hunaid2000@gmail.com>
Date: Wed, 30 Oct 2024 12:00:47 +0500
Subject: [PATCH v5] Add roman support for to_number function

---
 doc/src/sgml/func.sgml                |  13 ++-
 src/backend/utils/adt/formatting.c    | 154 +++++++++++++++++++++++++-
 src/test/regress/expected/numeric.out |  41 +++++++
 src/test/regress/sql/numeric.sql      |  21 ++++
 4 files changed, 224 insertions(+), 5 deletions(-)

diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index 05f630c6a6..5f91f0401b 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -8628,7 +8628,7 @@ SELECT regexp_match('abc01234xyz', '(?:(.*?)(\d+)(.*)){1,1}');
        </row>
        <row>
         <entry><literal>RN</literal></entry>
-        <entry>Roman numeral (input between 1 and 3999)</entry>
+        <entry>Roman numeral (valid for numbers 1 to 3999)</entry>
        </row>
        <row>
         <entry><literal>TH</literal> or <literal>th</literal></entry>
@@ -8756,6 +8756,17 @@ SELECT regexp_match('abc01234xyz', '(?:(.*?)(\d+)(.*)){1,1}');
        (e.g., <literal>9.99EEEE</literal> is a valid pattern).
       </para>
      </listitem>
+
+    <listitem>
+      <para>
+        In <function>to_number</function>, <literal>RN</literal> pattern converts
+        roman numerals to standard numbers. It is case-insensitive (e.g., <literal>'XIV'</literal>,
+        <literal>'xiv'</literal>, and <literal>'Xiv'</literal> are all seen as <literal>14</literal>).
+        <literal>RN</literal> cannot be used in combination with any other formatting patterns
+        or modifiers, with the exception of <literal>FM</literal>, which is applicable only in
+        <function>to_char()</function> and is ignored in <function>to_number()</function>.
+      </para>
+    </listitem>
     </itemizedlist>
    </para>
 
diff --git a/src/backend/utils/adt/formatting.c b/src/backend/utils/adt/formatting.c
index 85a7dd4561..5e5e763388 100644
--- a/src/backend/utils/adt/formatting.c
+++ b/src/backend/utils/adt/formatting.c
@@ -49,7 +49,6 @@
  *	- better number building (formatting) / parsing, now it isn't
  *		  ideal code
  *	- use Assert()
- *	- add support for roman number to standard number conversion
  *	- add support for number spelling
  *	- add support for string to string formatting (we must be better
  *	  than Oracle :-),
@@ -270,6 +269,31 @@ static const char *const rm100[] = {"C", "CC", "CCC", "CD", "D", "DC", "DCC", "D
 static const char *const numTH[] = {"ST", "ND", "RD", "TH", NULL};
 static const char *const numth[] = {"st", "nd", "rd", "th", NULL};
 
+/* ----------
+ * MACRO: Check if the current and next characters
+ * form a valid subtraction combination for roman numerals
+ * ----------
+ */
+#define IS_VALID_SUB_COMB(curr, next) \
+	(((curr) == 'I' && ((next) == 'V' || (next) == 'X')) || \
+	 ((curr) == 'X' && ((next) == 'L' || (next) == 'C')) || \
+	 ((curr) == 'C' && ((next) == 'D' || (next) == 'M')))
+
+/* ----------
+ * MACRO: Roman number value
+ * ----------
+ */
+#define ROMAN_VAL(r) \
+	((r) == 'I' ? 1 : \
+	(r) == 'V' ? 5 : \
+	(r) == 'X' ? 10 : \
+	(r) == 'L' ? 50 : \
+	(r) == 'C' ? 100 : \
+	(r) == 'D' ? 500 : \
+	(r) == 'M' ? 1000 : 0)
+
+#define MAX_ROMAN_LEN	15
+
 /* ----------
  * Flags & Options:
  * ----------
@@ -1074,6 +1098,7 @@ static bool do_to_timestamp(text *date_txt, text *fmt, Oid collid, bool std,
 static char *fill_str(char *str, int c, int max);
 static FormatNode *NUM_cache(int len, NUMDesc *Num, text *pars_str, bool *shouldFree);
 static char *int_to_roman(int number);
+static int roman_to_int(char* s, int len);
 static void NUM_prepare_locale(NUMProc *Np);
 static char *get_last_relevant_decnum(char *num);
 static void NUM_numpart_from_char(NUMProc *Np, int id, int input_len);
@@ -1284,6 +1309,15 @@ NUMDesc_prepare(NUMDesc *num, FormatNode *n)
 
 		case NUM_rn:
 		case NUM_RN:
+			if (IS_ROMAN(num))
+				ereport(ERROR,
+						(errcode(ERRCODE_SYNTAX_ERROR),
+						 errmsg("cannot use \"RN\" twice")));
+			if (IS_BLANK(num) || IS_LSIGN(num) || IS_BRACKET(num) ||
+				IS_MINUS(num) || IS_PLUS(num) || IS_MULTI(num))
+				ereport(ERROR,
+						(errcode(ERRCODE_SYNTAX_ERROR),
+						 errmsg("\"RN\" is incompatible with other formats")));
 			num->flag |= NUM_F_ROMAN;
 			break;
 
@@ -5247,7 +5281,107 @@ int_to_roman(int number)
 	return result;
 }
 
+/* Convert a standard roman numeral to an integer.
+ * Result is an integer between 1 and 3999.
+ *
+ * If input is invalid, return -1.
+ */
+static int
+roman_to_int(char* s, int len)
+{
+	int repeatCount = 1;
+	int vCount = 0, lCount = 0, dCount = 0;
+	bool subtractionEncountered = false;
+	char lastSubtractedChar = 0;
+	int total = 0;
+
+	/* Ensure the input is not too long or empty.
+	 * 'MMMDCCCLXXXVIII' (3888) is the longest valid roman numeral (15 chars).
+	 */
+	if (len == 0 || len > MAX_ROMAN_LEN)
+		return -1;
+
+	for (int i = 0; i < len; ++i)
+	{
+		char currChar = pg_ascii_toupper(s[i]);
+		int currValue = ROMAN_VAL(currChar);
+
+		if (currValue == 0)
+			return -1;
+
+		/* Ensure no character greater than or equal to the subtracted
+		 * character appears after the subtraction.
+		 */
+		if (subtractionEncountered && (currValue >= ROMAN_VAL(lastSubtractedChar)))
+			return -1;
+
+		/* Check for invalid repetitions of characters V, L, or D. */
+		if (currChar == 'V') vCount++;
+		if (currChar == 'L') lCount++;
+		if (currChar == 'D') dCount++;
+		if (vCount > 1 || lCount > 1 || dCount > 1)
+			return -1;
+
+		if (i < len - 1)
+		{
+			char nextChar = pg_ascii_toupper(s[i + 1]);
+			int nextValue = ROMAN_VAL(nextChar);
+
+			if (nextValue == 0)
+				return -1;
+
+			/* If the current value is less than the next value,
+			 * handle subtraction. Verify valid subtractive
+			 * combinations and update the total accordingly.
+			 */
+			if (currValue < nextValue)
+			{
+				/* Check for invalid repetitions of characters V, L, or D. */
+				if (nextChar == 'V') vCount++;
+				if (nextChar == 'L') lCount++;
+				if (nextChar == 'D') dCount++;
+				if (vCount > 1 || lCount > 1 || dCount > 1)
+					return -1;
+
+				/* For cases where the same character is repeated
+				 * with subtraction (e.g. 'MCCM' or 'DCCCD').
+				 */
+				if (repeatCount > 1)
+					return -1;
 
+				if (!IS_VALID_SUB_COMB(currChar, nextChar))
+					return -1;
+
+				/* Skip the next character as it is part of
+				 * the subtractive combination.
+				 */
+				i++;
+				repeatCount = 1;
+				subtractionEncountered = true;
+				lastSubtractedChar = currChar;
+				total += (nextValue - currValue);
+			}
+			else
+			{
+				/* For same characters, check for repetition. */
+				if (currChar == nextChar)
+				{
+					repeatCount++;
+					if (repeatCount > 3)
+						return -1;
+				}
+				else
+					repeatCount = 1;
+				total += currValue;
+			}
+		}
+		/* Add the value of the last character. */
+		else
+			total += currValue;
+	}
+
+	return total;
+}
 
 /* ----------
  * Locale
@@ -5829,9 +5963,21 @@ NUM_processor(FormatNode *node, NUMDesc *Num, char *inout,
 	if (IS_ROMAN(Np->Num))
 	{
 		if (!Np->is_to_char)
-			ereport(ERROR,
-					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-					 errmsg("\"RN\" not supported for input")));
+		{
+			int			roman_result;
+
+			roman_result = roman_to_int(inout, input_len);
+			if (roman_result == -1)
+				ereport(ERROR,
+					(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+					 errmsg("invalid roman numeral")));
+			else
+			{
+				Np->Num->pre = sprintf(number, "%d", roman_result);
+				Np->Num->post = 0;
+				return number;
+			}
+		}
 
 		Np->Num->lsign = Np->Num->pre_lsign_num = Np->Num->post =
 			Np->Num->pre = Np->out_pre_spaces = Np->sign = 0;
diff --git a/src/test/regress/expected/numeric.out b/src/test/regress/expected/numeric.out
index 0898107ec3..f141f094e6 100644
--- a/src/test/regress/expected/numeric.out
+++ b/src/test/regress/expected/numeric.out
@@ -2384,6 +2384,47 @@ SELECT to_number('123456', '99999V99');
  1234.560000000000000000
 (1 row)
 
+-- Test for correct conversion between numbers and Roman numerals
+WITH rows AS
+  (SELECT i, to_char(i, 'FMRN') AS roman FROM generate_series(1, 3999) AS i)
+SELECT
+  bool_and(to_number(roman, 'RN') = i) as valid
+FROM rows;
+ valid 
+-------
+ t
+(1 row)
+
+SELECT to_number('CvIiI', 'rn');
+ to_number 
+-----------
+       108
+(1 row)
+
+SELECT to_number('viv', 'RN');
+ERROR:  invalid roman numeral
+SELECT to_number('DCCCD', 'RN');
+ERROR:  invalid roman numeral
+SELECT to_number('XIXL', 'RN');
+ERROR:  invalid roman numeral
+SELECT to_number('MCCM', 'RN');
+ERROR:  invalid roman numeral
+SELECT to_number('MMMM', 'RN');
+ERROR:  invalid roman numeral
+SELECT to_number('CM', 'MIRN');
+ERROR:  "RN" is incompatible with other formats
+SELECT to_number('CM', 'RNRN');
+ERROR:  cannot use "RN" twice
+SELECT to_number('  XIV  ', 'RN');
+ERROR:  invalid roman numeral
+SELECT to_number('MMXX  ', 'RN');
+ERROR:  invalid roman numeral
+SELECT to_number('M CC', 'RN');
+ERROR:  invalid roman numeral
+SELECT to_number('', 'RN');
+ERROR:  invalid roman numeral
+SELECT to_number(' ', 'RN');
+ERROR:  invalid roman numeral
 RESET lc_numeric;
 --
 -- Input syntax
diff --git a/src/test/regress/sql/numeric.sql b/src/test/regress/sql/numeric.sql
index 9da12c6b9e..1543c52233 100644
--- a/src/test/regress/sql/numeric.sql
+++ b/src/test/regress/sql/numeric.sql
@@ -1085,6 +1085,27 @@ SELECT to_number('1234.56','L99,999.99');
 SELECT to_number('1,234.56','L99,999.99');
 SELECT to_number('42nd', '99th');
 SELECT to_number('123456', '99999V99');
+
+-- Test for correct conversion between numbers and Roman numerals
+WITH rows AS
+  (SELECT i, to_char(i, 'FMRN') AS roman FROM generate_series(1, 3999) AS i)
+SELECT
+  bool_and(to_number(roman, 'RN') = i) as valid
+FROM rows;
+
+SELECT to_number('CvIiI', 'rn');
+SELECT to_number('viv', 'RN');
+SELECT to_number('DCCCD', 'RN');
+SELECT to_number('XIXL', 'RN');
+SELECT to_number('MCCM', 'RN');
+SELECT to_number('MMMM', 'RN');
+SELECT to_number('CM', 'MIRN');
+SELECT to_number('CM', 'RNRN');
+SELECT to_number('  XIV  ', 'RN');
+SELECT to_number('MMXX  ', 'RN');
+SELECT to_number('M CC', 'RN');
+SELECT to_number('', 'RN');
+SELECT to_number(' ', 'RN');
 RESET lc_numeric;
 
 --
-- 
2.34.1

#17

Tomas Vondra

tomas@vondra.me

about 1 year ago

In reply to: Hunaid Sohail (#16)

Re: [PATCH] Add roman support for to_number function

Hi,

On 10/30/24 08:07, Hunaid Sohail wrote:

Hi,

I have attached a rebased patch if someone wants to review in the next
CF. No changes as compared to v4.

Regards,
Hunaid Sohail

Thanks for a nice patch. I did a quick review today, seems almost there,
I only have a couple minor comments:

1) Template Patterns for Numeric Formatting

Why the wording change? "input between 1 and 3999" sounds fine to me.

2) The docs say "standard numbers" but I'm not sure what that is? We
don't use that term anywhere else, I think. Same for "standard roman
numeral".

3) A couple comments need a bit of formatting. Multi-line comments
should start with an empty line, some lines are a bit too long.

4) I was wondering if the regression tests check all interesting cases,
and it seems none of the tests hit these two cases:

if (vCount > 1 || lCount > 1 || dCount > 1)
return -1;

and

if (!IS_VALID_SUB_COMB(currChar, nextChar))
return -1;

I haven't tried constructing tests to hit those cases, though.

Seems ready to go otherwise.

regards

--
Tomas Vondra

#18

Hunaid Sohail

hunaidpgml@gmail.com

about 1 year ago

In reply to: Tomas Vondra (#17)

Re: [PATCH] Add roman support for to_number function

Hi,

Thanks for reviewing this patch.

On Mon, Dec 9, 2024 at 3:07 AM Tomas Vondra <tomas@vondra.me> wrote:

Thanks for a nice patch. I did a quick review today, seems almost there,
I only have a couple minor comments:

1) Template Patterns for Numeric Formatting

Why the wording change? "input between 1 and 3999" sounds fine to me.

input... seemed to refer to numeric input for to_char roman format. But,
after roman support in to_number function shouldn't we modify it? It is the
reason I changed it to "valid for numbers 1 to 3999".

2) The docs say "standard numbers" but I'm not sure what that is? We

don't use that term anywhere else, I think. Same for "standard roman
numeral".

I will change "standard numbers" to "numbers".
Note that we are following the standard form. There are other forms too.
For example, 4 can be represented as IV (standard) or IIII (non standard).
https://en.wikipedia.org/wiki/Roman_numerals#Standard_form

3) A couple comments need a bit of formatting. Multi-line comments
should start with an empty line, some lines are a bit too long.

I will fix the multi-line formatting.

4) I was wondering if the regression tests check all interesting cases,
and it seems none of the tests hit these two cases:

if (vCount > 1 || lCount > 1 || dCount > 1)
return -1;

SELECT to_number('viv', 'RN') test covers this if statement for the
subtraction part. But, I noticed that no test covers simple counts of V, L,
D.
I will add SELECT to_number('VV', 'RN') for that.

and

if (!IS_VALID_SUB_COMB(currChar, nextChar))
return -1;

Noted. SELECT to_number('IL', 'RN') test can be added.

Regards,
Hunaid Sohail

#19

Hunaid Sohail

hunaidpgml@gmail.com

about 1 year ago

In reply to: Hunaid Sohail (#18)

1 attachment(s)

Re: [PATCH] Add roman support for to_number function

Hi,

On Mon, Dec 9, 2024 at 3:07 AM Tomas Vondra <tomas@vondra.me> wrote:

Thanks for a nice patch. I did a quick review today, seems almost there,
I only have a couple minor comments:

1) Template Patterns for Numeric Formatting

Why the wording change? "input between 1 and 3999" sounds fine to me.

input... seemed to refer to numeric input for to_char roman format. But,
after roman support in to_number function shouldn't we modify it? It is the
reason I changed it to "valid for numbers 1 to 3999".

2) The docs say "standard numbers" but I'm not sure what that is? We

don't use that term anywhere else, I think. Same for "standard roman
numeral".

I will change "standard numbers" to "numbers".
Note that we are following the standard form. There are other forms too.
For example, 4 can be represented as IV (standard) or IIII (non standard).
https://en.wikipedia.org/wiki/Roman_numerals#Standard_form

3) A couple comments need a bit of formatting. Multi-line comments
should start with an empty line, some lines are a bit too long.

I will fix the multi-line formatting.

4) I was wondering if the regression tests check all interesting cases,
and it seems none of the tests hit these two cases:

if (vCount > 1 || lCount > 1 || dCount > 1)
return -1;

SELECT to_number('viv', 'RN') test covers this if statement for the
subtraction part. But, I noticed that no test covers simple counts of V, L,
D.
I will add SELECT to_number('VV', 'RN') for that.

and

if (!IS_VALID_SUB_COMB(currChar, nextChar))
return -1;

Noted. SELECT to_number('IL', 'RN') test can be added.

I’ve attached a new patch that addresses comments 2, 3, and 4 from Tomas.

Looking forward to your feedback.

Regards,
Hunaid Sohail

Attachments:

v6-0001-Add-roman-support-for-to_number-function.patchapplication/x-patch; name=v6-0001-Add-roman-support-for-to_number-function.patchDownload

From 585dc7a04179be9260ddbdf01d8c7caa48c3037d Mon Sep 17 00:00:00 2001
From: Hunaid2000 <hunaid2000@gmail.com>
Date: Fri, 13 Dec 2024 10:19:20 +0500
Subject: [PATCH v6] Add roman support for to_number function

---
 doc/src/sgml/func.sgml                |  13 ++-
 src/backend/utils/adt/formatting.c    | 159 +++++++++++++++++++++++++-
 src/test/regress/expected/numeric.out |  45 ++++++++
 src/test/regress/sql/numeric.sql      |  23 ++++
 4 files changed, 235 insertions(+), 5 deletions(-)

diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index 47370e581a..5a07e741d7 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -8670,7 +8670,7 @@ SELECT regexp_match('abc01234xyz', '(?:(.*?)(\d+)(.*)){1,1}');
        </row>
        <row>
         <entry><literal>RN</literal></entry>
-        <entry>Roman numeral (input between 1 and 3999)</entry>
+        <entry>Roman numeral (valid for numbers 1 to 3999)</entry>
        </row>
        <row>
         <entry><literal>TH</literal> or <literal>th</literal></entry>
@@ -8798,6 +8798,17 @@ SELECT regexp_match('abc01234xyz', '(?:(.*?)(\d+)(.*)){1,1}');
        (e.g., <literal>9.99EEEE</literal> is a valid pattern).
       </para>
      </listitem>
+
+    <listitem>
+      <para>
+        In <function>to_number</function>, <literal>RN</literal> pattern converts roman numerals
+        (standard form) to numbers. It is case-insensitive (e.g., <literal>'XIV'</literal>,
+        <literal>'xiv'</literal>, and <literal>'Xiv'</literal> are all seen as <literal>14</literal>).
+        <literal>RN</literal> cannot be used in combination with any other formatting patterns
+        or modifiers, with the exception of <literal>FM</literal>, which is applicable only in
+        <function>to_char()</function> and is ignored in <function>to_number()</function>.
+      </para>
+    </listitem>
     </itemizedlist>
    </para>
 
diff --git a/src/backend/utils/adt/formatting.c b/src/backend/utils/adt/formatting.c
index 0dcb551511..22de73f2e6 100644
--- a/src/backend/utils/adt/formatting.c
+++ b/src/backend/utils/adt/formatting.c
@@ -49,7 +49,6 @@
  *	- better number building (formatting) / parsing, now it isn't
  *		  ideal code
  *	- use Assert()
- *	- add support for roman number to standard number conversion
  *	- add support for number spelling
  *	- add support for string to string formatting (we must be better
  *	  than Oracle :-),
@@ -271,6 +270,31 @@ static const char *const rm100[] = {"C", "CC", "CCC", "CD", "D", "DC", "DCC", "D
 static const char *const numTH[] = {"ST", "ND", "RD", "TH", NULL};
 static const char *const numth[] = {"st", "nd", "rd", "th", NULL};
 
+/* ----------
+ * MACRO: Check if the current and next characters
+ * form a valid subtraction combination for roman numerals
+ * ----------
+ */
+#define IS_VALID_SUB_COMB(curr, next) \
+	(((curr) == 'I' && ((next) == 'V' || (next) == 'X')) || \
+	 ((curr) == 'X' && ((next) == 'L' || (next) == 'C')) || \
+	 ((curr) == 'C' && ((next) == 'D' || (next) == 'M')))
+
+/* ----------
+ * MACRO: Roman number value
+ * ----------
+ */
+#define ROMAN_VAL(r) \
+	((r) == 'I' ? 1 : \
+	(r) == 'V' ? 5 : \
+	(r) == 'X' ? 10 : \
+	(r) == 'L' ? 50 : \
+	(r) == 'C' ? 100 : \
+	(r) == 'D' ? 500 : \
+	(r) == 'M' ? 1000 : 0)
+
+#define MAX_ROMAN_LEN	15
+
 /* ----------
  * Flags & Options:
  * ----------
@@ -1075,6 +1099,7 @@ static bool do_to_timestamp(text *date_txt, text *fmt, Oid collid, bool std,
 static char *fill_str(char *str, int c, int max);
 static FormatNode *NUM_cache(int len, NUMDesc *Num, text *pars_str, bool *shouldFree);
 static char *int_to_roman(int number);
+static int roman_to_int(char* s, int len);
 static void NUM_prepare_locale(NUMProc *Np);
 static char *get_last_relevant_decnum(char *num);
 static void NUM_numpart_from_char(NUMProc *Np, int id, int input_len);
@@ -1285,6 +1310,15 @@ NUMDesc_prepare(NUMDesc *num, FormatNode *n)
 
 		case NUM_rn:
 		case NUM_RN:
+			if (IS_ROMAN(num))
+				ereport(ERROR,
+						(errcode(ERRCODE_SYNTAX_ERROR),
+						 errmsg("cannot use \"RN\" twice")));
+			if (IS_BLANK(num) || IS_LSIGN(num) || IS_BRACKET(num) ||
+				IS_MINUS(num) || IS_PLUS(num) || IS_MULTI(num))
+				ereport(ERROR,
+						(errcode(ERRCODE_SYNTAX_ERROR),
+						 errmsg("\"RN\" is incompatible with other formats")));
 			num->flag |= NUM_F_ROMAN;
 			break;
 
@@ -5351,7 +5385,114 @@ int_to_roman(int number)
 	return result;
 }
 
+/*
+ * Convert a roman numeral (standard form) to an integer.
+ * Result is an integer between 1 and 3999.
+ *
+ * If input is invalid, return -1.
+ */
+static int
+roman_to_int(char* s, int len)
+{
+	int repeatCount = 1;
+	int vCount = 0, lCount = 0, dCount = 0;
+	bool subtractionEncountered = false;
+	char lastSubtractedChar = 0;
+	int total = 0;
+
+	/*
+	 * Ensure the input is not too long or empty.
+	 * 'MMMDCCCLXXXVIII' (3888) is the longest
+	 * valid roman numeral (15 characters).
+	 */
+	if (len == 0 || len > MAX_ROMAN_LEN)
+		return -1;
+
+	for (int i = 0; i < len; ++i)
+	{
+		char currChar = pg_ascii_toupper(s[i]);
+		int currValue = ROMAN_VAL(currChar);
+
+		if (currValue == 0)
+			return -1;
+
+		/*
+		 * Ensure no character greater than or equal to the
+		 * subtracted character appears after the subtraction.
+		 */
+		if (subtractionEncountered && (currValue >= ROMAN_VAL(lastSubtractedChar)))
+			return -1;
+
+		/* Check for invalid repetitions of characters V, L, or D. */
+		if (currChar == 'V') vCount++;
+		if (currChar == 'L') lCount++;
+		if (currChar == 'D') dCount++;
+		if (vCount > 1 || lCount > 1 || dCount > 1)
+			return -1;
 
+		if (i < len - 1)
+		{
+			char nextChar = pg_ascii_toupper(s[i + 1]);
+			int nextValue = ROMAN_VAL(nextChar);
+
+			if (nextValue == 0)
+				return -1;
+
+			/*
+			 * If the current value is less than the next value,
+			 * handle subtraction. Verify valid subtractive
+			 * combinations and update the total accordingly.
+			 */
+			if (currValue < nextValue)
+			{
+				/* Check for invalid repetitions of characters V, L, or D. */
+				if (nextChar == 'V') vCount++;
+				if (nextChar == 'L') lCount++;
+				if (nextChar == 'D') dCount++;
+				if (vCount > 1 || lCount > 1 || dCount > 1)
+					return -1;
+
+				/*
+				 * For cases where same character is repeated
+				 * with subtraction (e.g. 'MCCM' or 'DCCCD').
+				 */
+				if (repeatCount > 1)
+					return -1;
+
+				if (!IS_VALID_SUB_COMB(currChar, nextChar))
+					return -1;
+
+				/*
+				 * Skip the next character as it is part of
+				 * the subtractive combination.
+				 */
+				i++;
+				repeatCount = 1;
+				subtractionEncountered = true;
+				lastSubtractedChar = currChar;
+				total += (nextValue - currValue);
+			}
+			else
+			{
+				/* For same characters, check for repetition. */
+				if (currChar == nextChar)
+				{
+					repeatCount++;
+					if (repeatCount > 3)
+						return -1;
+				}
+				else
+					repeatCount = 1;
+				total += currValue;
+			}
+		}
+		/* Add the value of the last character. */
+		else
+			total += currValue;
+	}
+
+	return total;
+}
 
 /* ----------
  * Locale
@@ -5933,9 +6074,19 @@ NUM_processor(FormatNode *node, NUMDesc *Num, char *inout,
 	if (IS_ROMAN(Np->Num))
 	{
 		if (!Np->is_to_char)
-			ereport(ERROR,
-					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-					 errmsg("\"RN\" not supported for input")));
+		{
+			int roman_result = roman_to_int(inout, input_len);
+			if (roman_result == -1)
+				ereport(ERROR,
+						(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+						 errmsg("invalid roman numeral")));
+			else
+			{
+				Np->Num->pre = sprintf(number, "%d", roman_result);
+				Np->Num->post = 0;
+				return number;
+			}
+		}
 
 		Np->Num->lsign = Np->Num->pre_lsign_num = Np->Num->post =
 			Np->Num->pre = Np->out_pre_spaces = Np->sign = 0;
diff --git a/src/test/regress/expected/numeric.out b/src/test/regress/expected/numeric.out
index 0898107ec3..ac023d1a24 100644
--- a/src/test/regress/expected/numeric.out
+++ b/src/test/regress/expected/numeric.out
@@ -2384,6 +2384,51 @@ SELECT to_number('123456', '99999V99');
  1234.560000000000000000
 (1 row)
 
+-- Test for correct conversion between numbers and Roman numerals
+WITH rows AS
+  (SELECT i, to_char(i, 'FMRN') AS roman FROM generate_series(1, 3999) AS i)
+SELECT
+  bool_and(to_number(roman, 'RN') = i) as valid
+FROM rows;
+ valid 
+-------
+ t
+(1 row)
+
+SELECT to_number('CvIiI', 'rn');
+ to_number 
+-----------
+       108
+(1 row)
+
+SELECT to_number('viv', 'RN');
+ERROR:  invalid roman numeral
+SELECT to_number('DCCCD', 'RN');
+ERROR:  invalid roman numeral
+SELECT to_number('XIXL', 'RN');
+ERROR:  invalid roman numeral
+SELECT to_number('MCCM', 'RN');
+ERROR:  invalid roman numeral
+SELECT to_number('MMMM', 'RN');
+ERROR:  invalid roman numeral
+SELECT to_number('VV', 'RN');
+ERROR:  invalid roman numeral
+SELECT to_number('IL', 'RN');
+ERROR:  invalid roman numeral
+SELECT to_number('CM', 'MIRN');
+ERROR:  "RN" is incompatible with other formats
+SELECT to_number('CM', 'RNRN');
+ERROR:  cannot use "RN" twice
+SELECT to_number('  XIV  ', 'RN');
+ERROR:  invalid roman numeral
+SELECT to_number('MMXX  ', 'RN');
+ERROR:  invalid roman numeral
+SELECT to_number('M CC', 'RN');
+ERROR:  invalid roman numeral
+SELECT to_number('', 'RN');
+ERROR:  invalid roman numeral
+SELECT to_number(' ', 'RN');
+ERROR:  invalid roman numeral
 RESET lc_numeric;
 --
 -- Input syntax
diff --git a/src/test/regress/sql/numeric.sql b/src/test/regress/sql/numeric.sql
index 9da12c6b9e..379a596c31 100644
--- a/src/test/regress/sql/numeric.sql
+++ b/src/test/regress/sql/numeric.sql
@@ -1085,6 +1085,29 @@ SELECT to_number('1234.56','L99,999.99');
 SELECT to_number('1,234.56','L99,999.99');
 SELECT to_number('42nd', '99th');
 SELECT to_number('123456', '99999V99');
+
+-- Test for correct conversion between numbers and Roman numerals
+WITH rows AS
+  (SELECT i, to_char(i, 'FMRN') AS roman FROM generate_series(1, 3999) AS i)
+SELECT
+  bool_and(to_number(roman, 'RN') = i) as valid
+FROM rows;
+
+SELECT to_number('CvIiI', 'rn');
+SELECT to_number('viv', 'RN');
+SELECT to_number('DCCCD', 'RN');
+SELECT to_number('XIXL', 'RN');
+SELECT to_number('MCCM', 'RN');
+SELECT to_number('MMMM', 'RN');
+SELECT to_number('VV', 'RN');
+SELECT to_number('IL', 'RN');
+SELECT to_number('CM', 'MIRN');
+SELECT to_number('CM', 'RNRN');
+SELECT to_number('  XIV  ', 'RN');
+SELECT to_number('MMXX  ', 'RN');
+SELECT to_number('M CC', 'RN');
+SELECT to_number('', 'RN');
+SELECT to_number(' ', 'RN');
 RESET lc_numeric;
 
 --
-- 
2.34.1

#20

Tom Lane

tgl@sss.pgh.pa.us

12 months ago

In reply to: Hunaid Sohail (#19)

1 attachment(s)

Re: [PATCH] Add roman support for to_number function

Hunaid Sohail <hunaidpgml@gmail.com> writes:

I’ve attached a new patch that addresses comments 2, 3, and 4 from Tomas.

I'm still quite unhappy that this doesn't tolerate trailing
whitespace. That's not how other format patterns work, and it makes
it impossible to round-trip the output of to_char(..., 'RN').
I think the core of the problem is that you tried to cram the logic
in where the existing "not implemented" error is thrown. But on
inspection there is nothing sane about that entire "Roman correction"
stanza -- what it does is either useless (zeroing already-zeroed
fields) or in the wrong place (if we don't want to allow other flags
with IS_ROMAN, that should be done via an error in NUMDesc_prepare,
like other similar cases). In the attached I moved the roman_to_int
call into NUM_processor's main loop so it's more like other format
patterns, and fixed it to not eat any more characters than it should.
This might allow putting back some format-combination capabilities,
but other than the whitespace angle I figure we can leave that for
people to request it.

regards, tom lane

Attachments:

v7-0001-Add-roman-support-for-to_number-function.patchtext/x-diff; charset=us-ascii; name=v7-0001-Add-roman-support-for-to_number-function.patchDownload

diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index 47370e581a..5678e7621a 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -8669,8 +8669,8 @@ SELECT regexp_match('abc01234xyz', '(?:(.*?)(\d+)(.*)){1,1}');
         <entry>plus/minus sign in specified position</entry>
        </row>
        <row>
-        <entry><literal>RN</literal></entry>
-        <entry>Roman numeral (input between 1 and 3999)</entry>
+        <entry><literal>RN</literal> or <literal>rn</literal></entry>
+        <entry>Roman numeral (values between 1 and 3999)</entry>
        </row>
        <row>
         <entry><literal>TH</literal> or <literal>th</literal></entry>
@@ -8798,6 +8798,19 @@ SELECT regexp_match('abc01234xyz', '(?:(.*?)(\d+)(.*)){1,1}');
        (e.g., <literal>9.99EEEE</literal> is a valid pattern).
       </para>
      </listitem>
+
+     <listitem>
+      <para>
+       In <function>to_number()</function>, the <literal>RN</literal>
+       pattern converts Roman numerals (in standard form) to numbers.
+       Input is case-insensitive, so <literal>RN</literal>
+       and <literal>rn</literal> are equivalent.  <literal>RN</literal>
+       cannot be used in combination with any other formatting patterns or
+       modifiers except <literal>FM</literal>, which is applicable only
+       in <function>to_char()</function> and is ignored
+       in <function>to_number()</function>.
+      </para>
+     </listitem>
     </itemizedlist>
    </para>
 
diff --git a/src/backend/utils/adt/formatting.c b/src/backend/utils/adt/formatting.c
index 3960235e14..f885ed0448 100644
--- a/src/backend/utils/adt/formatting.c
+++ b/src/backend/utils/adt/formatting.c
@@ -49,7 +49,6 @@
  *	- better number building (formatting) / parsing, now it isn't
  *		  ideal code
  *	- use Assert()
- *	- add support for roman number to standard number conversion
  *	- add support for number spelling
  *	- add support for string to string formatting (we must be better
  *	  than Oracle :-),
@@ -257,13 +256,39 @@ static const char *const rm_months_lower[] =
 {"xii", "xi", "x", "ix", "viii", "vii", "vi", "v", "iv", "iii", "ii", "i", NULL};
 
 /* ----------
- * Roman numbers
+ * Roman numerals
  * ----------
  */
 static const char *const rm1[] = {"I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX", NULL};
 static const char *const rm10[] = {"X", "XX", "XXX", "XL", "L", "LX", "LXX", "LXXX", "XC", NULL};
 static const char *const rm100[] = {"C", "CC", "CCC", "CD", "D", "DC", "DCC", "DCCC", "CM", NULL};
 
+/*
+ * MACRO: Check if the current and next characters form a valid subtraction
+ * combination for roman numerals.
+ */
+#define IS_VALID_SUB_COMB(curr, next) \
+	(((curr) == 'I' && ((next) == 'V' || (next) == 'X')) || \
+	 ((curr) == 'X' && ((next) == 'L' || (next) == 'C')) || \
+	 ((curr) == 'C' && ((next) == 'D' || (next) == 'M')))
+
+/*
+ * MACRO: Roman numeral value, or 0 if character isn't a roman numeral.
+ */
+#define ROMAN_VAL(r) \
+	((r) == 'I' ? 1 : \
+	 (r) == 'V' ? 5 : \
+	 (r) == 'X' ? 10 : \
+	 (r) == 'L' ? 50 : \
+	 (r) == 'C' ? 100 : \
+	 (r) == 'D' ? 500 : \
+	 (r) == 'M' ? 1000 : 0)
+
+/*
+ * 'MMMDCCCLXXXVIII' (3888) is the longest valid roman numeral (15 characters).
+ */
+#define MAX_ROMAN_LEN	15
+
 /* ----------
  * Ordinal postfixes
  * ----------
@@ -1028,6 +1053,15 @@ typedef struct NUMProc
 #define DCH_TIMED	0x02
 #define DCH_ZONED	0x04
 
+/*
+ * These macros are used in NUM_processor() and its subsidiary routines.
+ * OVERLOAD_TEST: true if we've reached end of input string
+ * AMOUNT_TEST(s): true if at least s bytes remain in string
+ */
+#define OVERLOAD_TEST	(Np->inout_p >= Np->inout + input_len)
+#define AMOUNT_TEST(s)	(Np->inout_p <= Np->inout + (input_len - (s)))
+
+
 /* ----------
  * Functions
  * ----------
@@ -1075,6 +1109,7 @@ static bool do_to_timestamp(text *date_txt, text *fmt, Oid collid, bool std,
 static char *fill_str(char *str, int c, int max);
 static FormatNode *NUM_cache(int len, NUMDesc *Num, text *pars_str, bool *shouldFree);
 static char *int_to_roman(int number);
+static int	roman_to_int(NUMProc *Np, int input_len);
 static void NUM_prepare_locale(NUMProc *Np);
 static char *get_last_relevant_decnum(char *num);
 static void NUM_numpart_from_char(NUMProc *Np, int id, int input_len);
@@ -1285,6 +1320,10 @@ NUMDesc_prepare(NUMDesc *num, FormatNode *n)
 
 		case NUM_rn:
 		case NUM_RN:
+			if (IS_ROMAN(num))
+				ereport(ERROR,
+						(errcode(ERRCODE_SYNTAX_ERROR),
+						 errmsg("cannot use \"RN\" twice")));
 			num->flag |= NUM_F_ROMAN;
 			break;
 
@@ -1316,6 +1355,13 @@ NUMDesc_prepare(NUMDesc *num, FormatNode *n)
 			num->flag |= NUM_F_EEEE;
 			break;
 	}
+
+	if (IS_ROMAN(num) &&
+		(num->flag & ~(NUM_F_ROMAN | NUM_F_FILLMODE)) != 0)
+		ereport(ERROR,
+				(errcode(ERRCODE_SYNTAX_ERROR),
+				 errmsg("\"RN\" is incompatible with other formats"),
+				 errdetail("\"RN\" may only be used together with \"FM\".")));
 }
 
 /* ----------
@@ -4956,7 +5002,7 @@ int_to_roman(int number)
 			   *result,
 				numstr[12];
 
-	result = (char *) palloc(16);
+	result = (char *) palloc(MAX_ROMAN_LEN + 1);
 	*result = '\0';
 
 	/*
@@ -4966,7 +5012,7 @@ int_to_roman(int number)
 	 */
 	if (number > 3999 || number < 1)
 	{
-		fill_str(result, '#', 15);
+		fill_str(result, '#', MAX_ROMAN_LEN);
 		return result;
 	}
 
@@ -5000,6 +5046,136 @@ int_to_roman(int number)
 	return result;
 }
 
+/*
+ * Convert a roman numeral (standard form) to an integer.
+ * Result is an integer between 1 and 3999.
+ * Np->inout_p is advanced past the characters consumed.
+ *
+ * If input is invalid, return -1.
+ */
+static int
+roman_to_int(NUMProc *Np, int input_len)
+{
+	int			result = 0;
+	int			len;
+	char		romanChars[MAX_ROMAN_LEN];
+	int			romanValues[MAX_ROMAN_LEN];
+	int			repeatCount = 1;
+	int			vCount = 0,
+				lCount = 0,
+				dCount = 0;
+	bool		subtractionEncountered = false;
+	int			lastSubtractedValue = 0;
+
+	/*
+	 * Collect and decode valid roman numerals, consuming at most
+	 * MAX_ROMAN_LEN characters.  We do this in a separate loop to avoid
+	 * repeated decoding and because the main loop needs to know when it's at
+	 * the last numeral.
+	 */
+	for (len = 0; len < MAX_ROMAN_LEN && !OVERLOAD_TEST; len++)
+	{
+		char		currChar = pg_ascii_toupper(*Np->inout_p);
+		int			currValue = ROMAN_VAL(currChar);
+
+		if (currValue == 0)
+			break;				/* Not a valid roman numeral. */
+		romanChars[len] = currChar;
+		romanValues[len] = currValue;
+		Np->inout_p++;
+	}
+
+	if (len == 0)
+		return -1;				/* No valid roman numerals. */
+
+	/* Check for valid combinations and compute the represented value. */
+	for (int i = 0; i < len; i++)
+	{
+		char		currChar = romanChars[i];
+		int			currValue = romanValues[i];
+
+		/*
+		 * Ensure no character greater than or equal to the subtracted
+		 * character appears after a subtraction.
+		 */
+		if (subtractionEncountered && currValue >= lastSubtractedValue)
+			return -1;
+
+		/* Check for invalid repetitions of characters V, L, or D. */
+		if (currChar == 'V')
+			vCount++;
+		if (currChar == 'L')
+			lCount++;
+		if (currChar == 'D')
+			dCount++;
+		if (vCount > 1 || lCount > 1 || dCount > 1)
+			return -1;
+
+		if (i < len - 1)
+		{
+			char		nextChar = romanChars[i + 1];
+			int			nextValue = romanValues[i + 1];
+
+			/*
+			 * If the current value is less than the next value, handle
+			 * subtraction. Verify valid subtractive combinations and update
+			 * the result accordingly.
+			 */
+			if (currValue < nextValue)
+			{
+				/* Check for invalid repetitions of characters V, L, or D. */
+				if (nextChar == 'V')
+					vCount++;
+				if (nextChar == 'L')
+					lCount++;
+				if (nextChar == 'D')
+					dCount++;
+				if (vCount > 1 || lCount > 1 || dCount > 1)
+					return -1;
+
+				/*
+				 * For cases where same character is repeated with subtraction
+				 * (e.g. 'MCCM' or 'DCCCD').
+				 */
+				if (repeatCount > 1)
+					return -1;
+
+				if (!IS_VALID_SUB_COMB(currChar, nextChar))
+					return -1;
+
+				/*
+				 * Skip the next character as it is part of the subtractive
+				 * combination.
+				 */
+				i++;
+				repeatCount = 1;
+				subtractionEncountered = true;
+				lastSubtractedValue = currValue;
+				result += (nextValue - currValue);
+			}
+			else
+			{
+				/* For same characters, check for repetition. */
+				if (currChar == nextChar)
+				{
+					repeatCount++;
+					if (repeatCount > 3)
+						return -1;
+				}
+				else
+					repeatCount = 1;
+				result += currValue;
+			}
+		}
+		else
+		{
+			/* Add the value of the last character. */
+			result += currValue;
+		}
+	}
+
+	return result;
+}
 
 
 /* ----------
@@ -5112,14 +5288,6 @@ get_last_relevant_decnum(char *num)
 	return result;
 }
 
-/*
- * These macros are used in NUM_processor() and its subsidiary routines.
- * OVERLOAD_TEST: true if we've reached end of input string
- * AMOUNT_TEST(s): true if at least s bytes remain in string
- */
-#define OVERLOAD_TEST	(Np->inout_p >= Np->inout + input_len)
-#define AMOUNT_TEST(s)	(Np->inout_p <= Np->inout + (input_len - (s)))
-
 /* ----------
  * Number extraction for TO_NUMBER()
  * ----------
@@ -5576,29 +5744,6 @@ NUM_processor(FormatNode *node, NUMDesc *Num, char *inout,
 		return strcpy(inout, number);
 	}
 
-	/*
-	 * Roman correction
-	 */
-	if (IS_ROMAN(Np->Num))
-	{
-		if (!Np->is_to_char)
-			ereport(ERROR,
-					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-					 errmsg("\"RN\" not supported for input")));
-
-		Np->Num->lsign = Np->Num->pre_lsign_num = Np->Num->post =
-			Np->Num->pre = Np->out_pre_spaces = Np->sign = 0;
-
-		if (IS_FILLMODE(Np->Num))
-		{
-			Np->Num->flag = 0;
-			Np->Num->flag |= NUM_F_FILLMODE;
-		}
-		else
-			Np->Num->flag = 0;
-		Np->Num->flag |= NUM_F_ROMAN;
-	}
-
 	/*
 	 * Sign
 	 */
@@ -5849,28 +5994,34 @@ NUM_processor(FormatNode *node, NUMDesc *Num, char *inout,
 					break;
 
 				case NUM_RN:
-					if (IS_FILLMODE(Np->Num))
-					{
-						strcpy(Np->inout_p, Np->number_p);
-						Np->inout_p += strlen(Np->inout_p) - 1;
-					}
-					else
-					{
-						sprintf(Np->inout_p, "%15s", Np->number_p);
-						Np->inout_p += strlen(Np->inout_p) - 1;
-					}
-					break;
-
 				case NUM_rn:
-					if (IS_FILLMODE(Np->Num))
+					if (Np->is_to_char)
 					{
-						strcpy(Np->inout_p, asc_tolower_z(Np->number_p));
+						const char *number_p;
+
+						if (n->key->id == NUM_rn)
+							number_p = asc_tolower_z(Np->number_p);
+						else
+							number_p = Np->number_p;
+						if (IS_FILLMODE(Np->Num))
+							strcpy(Np->inout_p, number_p);
+						else
+							sprintf(Np->inout_p, "%15s", number_p);
 						Np->inout_p += strlen(Np->inout_p) - 1;
 					}
 					else
 					{
-						sprintf(Np->inout_p, "%15s", asc_tolower_z(Np->number_p));
-						Np->inout_p += strlen(Np->inout_p) - 1;
+						int			roman_result = roman_to_int(Np, input_len);
+
+						if (roman_result < 0)
+							ereport(ERROR,
+									(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+									 errmsg("invalid Roman numeral")));
+						Np->Num->pre = sprintf(Np->number_p, "%d",
+											   roman_result);
+						Np->number_p += Np->Num->pre;
+						Np->Num->post = 0;
+						continue;	/* roman_to_int ate all the chars */
 					}
 					break;
 
diff --git a/src/test/regress/expected/numeric.out b/src/test/regress/expected/numeric.out
index 0898107ec3..dc2b930ee4 100644
--- a/src/test/regress/expected/numeric.out
+++ b/src/test/regress/expected/numeric.out
@@ -2384,6 +2384,66 @@ SELECT to_number('123456', '99999V99');
  1234.560000000000000000
 (1 row)
 
+-- Test for correct conversion between numbers and Roman numerals
+WITH rows AS
+  (SELECT i, to_char(i, 'FMRN') AS roman FROM generate_series(1, 3999) AS i)
+SELECT
+  bool_and(to_number(roman, 'RN') = i) as valid
+FROM rows;
+ valid 
+-------
+ t
+(1 row)
+
+-- Some additional tests for RN input
+SELECT to_number('CvIiI', 'rn');
+ to_number 
+-----------
+       108
+(1 row)
+
+SELECT to_number('MMXX  ', 'RN');
+ to_number 
+-----------
+      2020
+(1 row)
+
+SELECT to_number('  XIV  ', '  RN');
+ to_number 
+-----------
+        14
+(1 row)
+
+SELECT to_number('M CC', 'RN');
+ to_number 
+-----------
+      1000
+(1 row)
+
+-- error cases
+SELECT to_number('viv', 'RN');
+ERROR:  invalid Roman numeral
+SELECT to_number('DCCCD', 'RN');
+ERROR:  invalid Roman numeral
+SELECT to_number('XIXL', 'RN');
+ERROR:  invalid Roman numeral
+SELECT to_number('MCCM', 'RN');
+ERROR:  invalid Roman numeral
+SELECT to_number('MMMM', 'RN');
+ERROR:  invalid Roman numeral
+SELECT to_number('VV', 'RN');
+ERROR:  invalid Roman numeral
+SELECT to_number('IL', 'RN');
+ERROR:  invalid Roman numeral
+SELECT to_number('CM', 'MIRN');
+ERROR:  "RN" is incompatible with other formats
+DETAIL:  "RN" may only be used together with "FM".
+SELECT to_number('CM', 'RNRN');
+ERROR:  cannot use "RN" twice
+SELECT to_number('', 'RN');
+ERROR:  invalid input syntax for type numeric: " "
+SELECT to_number(' ', 'RN');
+ERROR:  invalid Roman numeral
 RESET lc_numeric;
 --
 -- Input syntax
diff --git a/src/test/regress/sql/numeric.sql b/src/test/regress/sql/numeric.sql
index 9da12c6b9e..8b7864d54a 100644
--- a/src/test/regress/sql/numeric.sql
+++ b/src/test/regress/sql/numeric.sql
@@ -1085,6 +1085,32 @@ SELECT to_number('1234.56','L99,999.99');
 SELECT to_number('1,234.56','L99,999.99');
 SELECT to_number('42nd', '99th');
 SELECT to_number('123456', '99999V99');
+
+-- Test for correct conversion between numbers and Roman numerals
+WITH rows AS
+  (SELECT i, to_char(i, 'FMRN') AS roman FROM generate_series(1, 3999) AS i)
+SELECT
+  bool_and(to_number(roman, 'RN') = i) as valid
+FROM rows;
+
+-- Some additional tests for RN input
+SELECT to_number('CvIiI', 'rn');
+SELECT to_number('MMXX  ', 'RN');
+SELECT to_number('  XIV  ', '  RN');
+SELECT to_number('M CC', 'RN');
+-- error cases
+SELECT to_number('viv', 'RN');
+SELECT to_number('DCCCD', 'RN');
+SELECT to_number('XIXL', 'RN');
+SELECT to_number('MCCM', 'RN');
+SELECT to_number('MMMM', 'RN');
+SELECT to_number('VV', 'RN');
+SELECT to_number('IL', 'RN');
+SELECT to_number('CM', 'MIRN');
+SELECT to_number('CM', 'RNRN');
+SELECT to_number('', 'RN');
+SELECT to_number(' ', 'RN');
+
 RESET lc_numeric;
 
 --

#21

Hunaid Sohail

hunaidpgml@gmail.com

12 months ago

In reply to: Tom Lane (#20)

Re: [PATCH] Add roman support for to_number function

Hi Tom,

On Sat, Jan 18, 2025 at 5:27 AM Tom Lane <tgl@sss.pgh.pa.us> wrote:

Hunaid Sohail <hunaidpgml@gmail.com> writes:

I’ve attached a new patch that addresses comments 2, 3, and 4 from Tomas.

I'm still quite unhappy that this doesn't tolerate trailing
whitespace. That's not how other format patterns work, and it makes
it impossible to round-trip the output of to_char(..., 'RN').
I think the core of the problem is that you tried to cram the logic
in where the existing "not implemented" error is thrown. But on
inspection there is nothing sane about that entire "Roman correction"
stanza -- what it does is either useless (zeroing already-zeroed
fields) or in the wrong place (if we don't want to allow other flags
with IS_ROMAN, that should be done via an error in NUMDesc_prepare,
like other similar cases). In the attached I moved the roman_to_int
call into NUM_processor's main loop so it's more like other format
patterns, and fixed it to not eat any more characters than it should.
This might allow putting back some format-combination capabilities,
but other than the whitespace angle I figure we can leave that for
people to request it.

Thank you for the tweaks to the patch. Overall, it looks great.

Initially, when I looked at the test case:
SELECT to_number('M CC', 'RN');

I was confused about whether it should be 'MCC'. After further inspection,
I realized that the output is 1000 for 'M'. The format of the input can be a
bit unclear. Perhaps we could add a note in the documentation
or issue a warning in roman_to_int function when input is truncated,
to clarify this behavior.

+       bool            truncated = false;
+
+       /*
+        * Collect and decode valid roman numerals, consuming at most
+        * MAX_ROMAN_LEN characters.  We do this in a separate loop to avoid
+        * repeated decoding and because the main loop needs to know when
it's at
+        * the last numeral.
+        */
+       for (len = 0; len < MAX_ROMAN_LEN && !OVERLOAD_TEST; len++)
+       {
+               char            currChar = pg_ascii_toupper(*Np->inout_p);
+               int                     currValue = ROMAN_VAL(currChar);
+
+               if (currValue == 0)
+               {
+                       truncated = true;
+                       break;                          /* Not a valid
roman numeral. */
+               }
+               romanChars[len] = currChar;
+               romanValues[len] = currValue;
+               Np->inout_p++;
+       }
+
+       if (len == 0)
+               return -1;                              /* No valid roman
numerals. */
+
+       /* Check for truncation. */
+       if (truncated)
+       {
+               ereport(WARNING,
+
(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+                                errmsg("Input truncated to \"%.*s\"",
+                                               len, romanChars)));
+       }

Output after change:

postgres=# SELECT to_number('MMXX ', 'RN');
WARNING: Input truncated to "MMXX"
to_number
-----------
2020
(1 row)

postgres=# SELECT to_number(' XIV ', ' RN');
WARNING: Input truncated to "XIV"
to_number
-----------
14
(1 row)

postgres=# SELECT to_number('M CC', 'RN');
WARNING: Input truncated to "M"
to_number
-----------
1000
(1 row)

Also, some modifications to the test cases will be required
if we go with these changes.

Regards,
Hunaid Sohail

#22

Tom Lane

tgl@sss.pgh.pa.us

12 months ago

In reply to: Hunaid Sohail (#21)

Re: [PATCH] Add roman support for to_number function

Hunaid Sohail <hunaidpgml@gmail.com> writes:

On Sat, Jan 18, 2025 at 5:27 AM Tom Lane <tgl@sss.pgh.pa.us> wrote:
Initially, when I looked at the test case:
SELECT to_number('M CC', 'RN');

I was confused about whether it should be 'MCC'. After further inspection,
I realized that the output is 1000 for 'M'. The format of the input can be a
bit unclear. Perhaps we could add a note in the documentation
or issue a warning in roman_to_int function when input is truncated,
to clarify this behavior.

I don't think a warning is a great idea at all. There is no other
place in formatting.c that issues warnings, and this doesn't seem
like the place to start. As an example,

regression=# select to_number('123garbage', '999999');
to_number
-----------
123
(1 row)

There's certainly a case that could be made that to_number should
be stricter about trailing garbage, but it's probably a couple
decades too late to change now. In any case it seems completely
inappropriate to me for RN to adopt that policy all by itself.

A different way that we could potentially look at this example is
that RN should be willing to skip embedded spaces, leading to 'M CC'
producing 1200 not 1000. There is a little bit of support for that
idea in the existing behavior

regression=# select to_number('123 456', '999999');
to_number
-----------
123456
(1 row)

However, when you poke at that a bit closer, it's not a license
for unlimited whitespace:

regression=# select to_number('123 456', '999999');
to_number
-----------
12345
(1 row)

I've not tracked down the exact cause of that, but I think it
has to do with the fact that NUM_numpart_from_char() is willing
to eat a single leading space per call, even if it's not the
first call. The original author's reasoning for that is lost
to history thanks to the lack of comments, but I believe that
the idea was not "eat random whitespace" but "consume a space
that was emitted as a substitute for a plus sign". The fact
that it can happen once per '9' code feels more like a bug than
something intentional. I'm not proposing that we do something
about that, at least not today, but it doesn't seem like
something we want to model RN's behavior on either. So I'm
not in favor of allowing embedded spaces.

However ... in poking at this, I noticed that I had been
mis-understanding what to_char(..., 'RN') does: it fills
to 15 characters with *leading* spaces not trailing spaces
as I'd been thinking:

regression=# select to_char(433, '|RN|');
to_char
-------------------
| CDXXXIII|
(1 row)

So on the argument that to_number's RN should be willing to
consume what to_char's RN produces, we're both wrong and
roman_to_int should be willing to eat leading spaces.
What do you think?

regards, tom lane

#23

Tom Lane

tgl@sss.pgh.pa.us

12 months ago

In reply to: Tom Lane (#22)

Re: [PATCH] Add roman support for to_number function

I wrote:

However, when you poke at that a bit closer, it's not a license
for unlimited whitespace:

regression=# select to_number('123 456', '999999');
to_number
-----------
12345
(1 row)

I've not tracked down the exact cause of that, but I think it
has to do with the fact that NUM_numpart_from_char() is willing
to eat a single leading space per call, even if it's not the
first call.

I tried simply removing that bit:

diff --git a/src/backend/utils/adt/formatting.c b/src/backend/utils/adt/formatting.c
index 3960235e14..366430863f 100644
--- a/src/backend/utils/adt/formatting.c
+++ b/src/backend/utils/adt/formatting.c
@@ -5134,12 +5134,6 @@ NUM_numpart_from_char(NUMProc *Np, int id, int input_len)
 		 (id == NUM_0 || id == NUM_9) ? "NUM_0/9" : id == NUM_DEC ? "NUM_DEC" : "???");
 #endif

- if (OVERLOAD_TEST)
- return;
-
- if (*Np->inout_p == ' ')
- Np->inout_p++;
-
if (OVERLOAD_TEST)
return;

All our regression tests still pass, and this example seems to behave
more sanely:

regression=# select to_number('123 456', '999999');
to_number
-----------
12345
(1 row)

regression=# select to_number('123 456', '999999');
to_number
-----------
1234
(1 row)

That might or might not be a behavior you like, but it's at least
possible to see where it's coming from: it's eating the digits that
appear within a six-character window, and not any more.

Not sure if anyone would thank us for making this change though.
I can't avoid the suspicion that somebody somewhere is depending
on the current behavior.

Anyway, this is getting off-topic for the current thread.

regards, tom lane

#24

Maciek Sakrejda

m.sakrejda@gmail.com

12 months ago

In reply to: Tom Lane (#23)

Re: [PATCH] Add roman support for to_number function

V7 passes check-world here. But, just for kicks, I generated all
possible 7-character sequences of Roman digits [1]with rd(d) as (VALUES('i'),('v'),('x'),('l'),('c'),('d'),('m')), rn as (select d1.d || d2.d || d3.d || d4.d || d5.d || d6.d || d7.d as n from rd d1, rd d2, rd d3, rd d4, rd d5, rd d6, rd d7) select 'select to_number(' || rn.n || ', ''RN'');' from rn; to confirm whether
everything either parsed cleanly or errored cleanly. Reviewing the
output, I noticed that to_number accepts some dubiously-formatted
values:

postgres=# select to_number('mmmdcm', 'RN');
to_number
-----------
4400
(1 row)

I'm assuming this was not intended, since the function comment notes
the result will be in the 1-3999 range. (And to_char fails to parse
this, failing the round-trip test.)

Thanks,
Maciek

[1]: with rd(d) as (VALUES('i'),('v'),('x'),('l'),('c'),('d'),('m')), rn as (select d1.d || d2.d || d3.d || d4.d || d5.d || d6.d || d7.d as n from rd d1, rd d2, rd d3, rd d4, rd d5, rd d6, rd d7) select 'select to_number(' || rn.n || ', ''RN'');' from rn;
rn as (select d1.d || d2.d || d3.d || d4.d || d5.d || d6.d || d7.d as
n from rd d1, rd d2, rd d3, rd d4, rd d5, rd d6, rd d7) select 'select
to_number(' || rn.n || ', ''RN'');' from rn;

#25

Tom Lane

tgl@sss.pgh.pa.us

12 months ago

In reply to: Maciek Sakrejda (#24)

Re: [PATCH] Add roman support for to_number function

Maciek Sakrejda <m.sakrejda@gmail.com> writes:

V7 passes check-world here. But, just for kicks, I generated all
possible 7-character sequences of Roman digits [1] to confirm whether
everything either parsed cleanly or errored cleanly. Reviewing the
output, I noticed that to_number accepts some dubiously-formatted
values:
postgres=# select to_number('mmmdcm', 'RN');
to_number
-----------
4400
(1 row)

Ugh. This makes more urgent my question about where roman_to_int's
algorithm came from, because there's evidently something not right
about it.

regards, tom lane

#26

Hunaid Sohail

hunaidpgml@gmail.com

12 months ago

In reply to: Tom Lane (#22)

Re: [PATCH] Add roman support for to_number function

Hi,

On Mon, Jan 20, 2025 at 9:25 PM Tom Lane <tgl@sss.pgh.pa.us> wrote:

I've not tracked down the exact cause of that, but I think it
has to do with the fact that NUM_numpart_from_char() is willing
to eat a single leading space per call, even if it's not the
first call. The original author's reasoning for that is lost
to history thanks to the lack of comments, but I believe that
the idea was not "eat random whitespace" but "consume a space
that was emitted as a substitute for a plus sign". The fact
that it can happen once per '9' code feels more like a bug than
something intentional. I'm not proposing that we do something
about that, at least not today, but it doesn't seem like
something we want to model RN's behavior on either. So I'm
not in favor of allowing embedded spaces.

Agreed. We should not allow trailing and embedded spaces.
Since, trailing spaces and become embedded spaces like
'MCC ' or 'M CC'.
It means cases like these should be invalid:
SELECT to_number('MMXX ', 'RN');
SELECT to_number(' XIV ', ' RN');
SELECT to_number('M CC', 'RN');

So on the argument that to_number's RN should be willing to
consume what to_char's RN produces, we're both wrong and
roman_to_int should be willing to eat leading spaces.
What do you think?

Agreed. Your changes in v7 consumes leading spaces if leading space
is present in format (' RN'). But we need it to work on cases like:
SELECT to_number(' XIV', 'RN');
So, I can add this logic in my next patch.

Regards,
Hunaid Sohail

#27

Hunaid Sohail

hunaidpgml@gmail.com

12 months ago

In reply to: Maciek Sakrejda (#24)

Re: [PATCH] Add roman support for to_number function

Hi,

On Tue, Jan 21, 2025 at 5:19 AM Maciek Sakrejda <m.sakrejda@gmail.com>
wrote:

V7 passes check-world here. But, just for kicks, I generated all
possible 7-character sequences of Roman digits [1] to confirm whether
everything either parsed cleanly or errored cleanly. Reviewing the
output, I noticed that to_number accepts some dubiously-formatted
values:

postgres=# select to_number('mmmdcm', 'RN');
to_number
-----------
4400
(1 row)

I'm assuming this was not intended, since the function comment notes
the result will be in the 1-3999 range. (And to_char fails to parse
this, failing the round-trip test.)

Thanks,
Maciek

[1]: with rd(d) as (VALUES('i'),('v'),('x'),('l'),('c'),('d'),('m')),
rn as (select d1.d || d2.d || d3.d || d4.d || d5.d || d6.d || d7.d as
n from rd d1, rd d2, rd d3, rd d4, rd d5, rd d6, rd d7) select 'select
to_number(' || rn.n || ', ''RN'');' from rn;

Thanks for checking more cases. I feel that I found the source of
the issue. It turns out I missed the rule that V, L, and D cannot
appear before a larger numeral.

As a result, cases like the following are also invalid:

SELECT to_number('VIX', 'RN');
SELECT to_number('LXC', 'RN');
SELECT to_number('DCM', 'RN');
select to_number('MMDCM', 'RN');
select to_number('CLXC', 'RN');

I just need to add an if statement to handle these cases correctly.
I will include the fix in the next patch, and also add these cases
to the regression tests.

Regards,
Hunaid Sohail

#28

Hunaid Sohail

hunaidpgml@gmail.com

12 months ago

In reply to: Hunaid Sohail (#26)

Re: [PATCH] Add roman support for to_number function

On Tue, Jan 21, 2025 at 1:35 PM Hunaid Sohail <hunaidpgml@gmail.com> wrote:

Agreed. Your changes in v7 consumes leading spaces if leading space
is present in format (' RN'). But we need it to work on cases like:
SELECT to_number(' XIV', 'RN');
So, I can add this logic in my next patch.

So, I was playing with leading spaces in v7 and found something
interesting.

postgres=# --both input and format both 2 spaces
postgres=# SELECT to_number(' XIV', ' RN');
to_number
-----------
14
(1 row)

postgres=# --input 1 space, format 2 spaces
postgres=# --Input is read from "IV" ignoring "X"
postgres=# SELECT to_number(' XIV', ' RN');
to_number
-----------
4
(1 row)

postgres=# --should work after leading spaces are handled
postgres=# SELECT to_number(' XIV', ' RN');
ERROR: invalid Roman numeral
postgres=# select to_number('123456', ' 999999');
to_number
-----------
23456
(1 row)

postgres=# select to_number('123456', ' 999999');
to_number
-----------
3456
(1 row)

The leading spaces are consumed in the RN (from the main loop
in Num_processor), and this behavior seems consistent with how
simple numbers are handled. The Roman numeral parsing
appears to start from where "RN" is in the format after
leading spaces are consumed.

Please review the first two examples of Roman numerals.
I'd appreciate your opinions on whether this behavior should be considered
valid.

Regards,
Hunaid Sohail

#29

Tom Lane

tgl@sss.pgh.pa.us

12 months ago

In reply to: Hunaid Sohail (#28)

Re: [PATCH] Add roman support for to_number function

Hunaid Sohail <hunaidpgml@gmail.com> writes:

The leading spaces are consumed in the RN (from the main loop
in Num_processor), and this behavior seems consistent with how
simple numbers are handled. The Roman numeral parsing
appears to start from where "RN" is in the format after
leading spaces are consumed.

Yes, a space in the format string should consume a character
from the input (whether it's a space or not).

regression=# select to_number('x123', '999');
to_number
-----------
12
(1 row)

regression=# select to_number('x123', ' 999');
to_number
-----------
123
(1 row)

I used to think that a space in the format would consume any number of
spaces from the input, but that doesn't seem to be true --- I think
I was misled by that perhaps-bug in NUM_numpart_from_char. For
example:

regression=# select to_number(' 123', ' 999');
to_number
-----------
12
(1 row)

One of the three input spaces is eaten by the format space, and
one is taken as the sign, and then the 9's match ' 12'. (I don't
think we should be expecting a sign in RN though.) However,
as we add more input spaces:

regression=# select to_number(' 123', ' 999');
to_number
-----------
12
(1 row)

regression=# select to_number(' 123', ' 999');
to_number
-----------
1
(1 row)

This is bizarrely inconsistent, and I think what's causing it
is the extra space-consumption in NUM_numpart_from_char.

regards, tom lane

#30

Hunaid Sohail

hunaidpgml@gmail.com

12 months ago

In reply to: Tom Lane (#29)

1 attachment(s)

Re: [PATCH] Add roman support for to_number function

Hi,

I have attached a new patch v8 with the following changes:
1. Fixed cases reported by Maciek.
2. Handles leading spaces in input string.

I have also added a few more negative tests after Maciek reported
a few cases.

I have also updated the format in the round trip test:
...to_char(i, 'FMRN') ... to
...to_char(i, 'RN') ...
which verifies that leading spaces are now handled in input
string with simple 'RN' when used with to_number.

Looking forward to your feedback.

Regards,
Hunaid Sohail

Attachments:

v8-0001-Add-roman-support-for-to_number-function.patchapplication/octet-stream; name=v8-0001-Add-roman-support-for-to_number-function.patchDownload

From 52928ce26f6875ed08a8a9dfea776c1e561aaa5e Mon Sep 17 00:00:00 2001
From: Hunaid2000 <hunaid2000@gmail.com>
Date: Wed, 22 Jan 2025 18:31:59 +0500
Subject: [PATCH v8] Add roman support for to_number function

---
 doc/src/sgml/func.sgml                |  17 +-
 src/backend/utils/adt/formatting.c    | 266 +++++++++++++++++++++-----
 src/test/regress/expected/numeric.out |  68 +++++++
 src/test/regress/sql/numeric.sql      |  32 ++++
 4 files changed, 330 insertions(+), 53 deletions(-)

diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index 47370e581a..5678e7621a 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -8669,8 +8669,8 @@ SELECT regexp_match('abc01234xyz', '(?:(.*?)(\d+)(.*)){1,1}');
         <entry>plus/minus sign in specified position</entry>
        </row>
        <row>
-        <entry><literal>RN</literal></entry>
-        <entry>Roman numeral (input between 1 and 3999)</entry>
+        <entry><literal>RN</literal> or <literal>rn</literal></entry>
+        <entry>Roman numeral (values between 1 and 3999)</entry>
        </row>
        <row>
         <entry><literal>TH</literal> or <literal>th</literal></entry>
@@ -8798,6 +8798,19 @@ SELECT regexp_match('abc01234xyz', '(?:(.*?)(\d+)(.*)){1,1}');
        (e.g., <literal>9.99EEEE</literal> is a valid pattern).
       </para>
      </listitem>
+
+     <listitem>
+      <para>
+       In <function>to_number()</function>, the <literal>RN</literal>
+       pattern converts Roman numerals (in standard form) to numbers.
+       Input is case-insensitive, so <literal>RN</literal>
+       and <literal>rn</literal> are equivalent.  <literal>RN</literal>
+       cannot be used in combination with any other formatting patterns or
+       modifiers except <literal>FM</literal>, which is applicable only
+       in <function>to_char()</function> and is ignored
+       in <function>to_number()</function>.
+      </para>
+     </listitem>
     </itemizedlist>
    </para>
 
diff --git a/src/backend/utils/adt/formatting.c b/src/backend/utils/adt/formatting.c
index 3960235e14..e33d3fd3cd 100644
--- a/src/backend/utils/adt/formatting.c
+++ b/src/backend/utils/adt/formatting.c
@@ -49,7 +49,6 @@
  *	- better number building (formatting) / parsing, now it isn't
  *		  ideal code
  *	- use Assert()
- *	- add support for roman number to standard number conversion
  *	- add support for number spelling
  *	- add support for string to string formatting (we must be better
  *	  than Oracle :-),
@@ -257,13 +256,39 @@ static const char *const rm_months_lower[] =
 {"xii", "xi", "x", "ix", "viii", "vii", "vi", "v", "iv", "iii", "ii", "i", NULL};
 
 /* ----------
- * Roman numbers
+ * Roman numerals
  * ----------
  */
 static const char *const rm1[] = {"I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX", NULL};
 static const char *const rm10[] = {"X", "XX", "XXX", "XL", "L", "LX", "LXX", "LXXX", "XC", NULL};
 static const char *const rm100[] = {"C", "CC", "CCC", "CD", "D", "DC", "DCC", "DCCC", "CM", NULL};
 
+/*
+ * MACRO: Check if the current and next characters form a valid subtraction
+ * combination for roman numerals.
+ */
+#define IS_VALID_SUB_COMB(curr, next) \
+	(((curr) == 'I' && ((next) == 'V' || (next) == 'X')) || \
+	 ((curr) == 'X' && ((next) == 'L' || (next) == 'C')) || \
+	 ((curr) == 'C' && ((next) == 'D' || (next) == 'M')))
+
+/*
+ * MACRO: Roman numeral value, or 0 if character isn't a roman numeral.
+ */
+#define ROMAN_VAL(r) \
+	((r) == 'I' ? 1 : \
+	 (r) == 'V' ? 5 : \
+	 (r) == 'X' ? 10 : \
+	 (r) == 'L' ? 50 : \
+	 (r) == 'C' ? 100 : \
+	 (r) == 'D' ? 500 : \
+	 (r) == 'M' ? 1000 : 0)
+
+/*
+ * 'MMMDCCCLXXXVIII' (3888) is the longest valid roman numeral (15 characters).
+ */
+#define MAX_ROMAN_LEN	15
+
 /* ----------
  * Ordinal postfixes
  * ----------
@@ -1028,6 +1053,15 @@ typedef struct NUMProc
 #define DCH_TIMED	0x02
 #define DCH_ZONED	0x04
 
+/*
+ * These macros are used in NUM_processor() and its subsidiary routines.
+ * OVERLOAD_TEST: true if we've reached end of input string
+ * AMOUNT_TEST(s): true if at least s bytes remain in string
+ */
+#define OVERLOAD_TEST	(Np->inout_p >= Np->inout + input_len)
+#define AMOUNT_TEST(s)	(Np->inout_p <= Np->inout + (input_len - (s)))
+
+
 /* ----------
  * Functions
  * ----------
@@ -1075,6 +1109,7 @@ static bool do_to_timestamp(text *date_txt, text *fmt, Oid collid, bool std,
 static char *fill_str(char *str, int c, int max);
 static FormatNode *NUM_cache(int len, NUMDesc *Num, text *pars_str, bool *shouldFree);
 static char *int_to_roman(int number);
+static int	roman_to_int(NUMProc *Np, int input_len);
 static void NUM_prepare_locale(NUMProc *Np);
 static char *get_last_relevant_decnum(char *num);
 static void NUM_numpart_from_char(NUMProc *Np, int id, int input_len);
@@ -1285,6 +1320,10 @@ NUMDesc_prepare(NUMDesc *num, FormatNode *n)
 
 		case NUM_rn:
 		case NUM_RN:
+			if (IS_ROMAN(num))
+				ereport(ERROR,
+						(errcode(ERRCODE_SYNTAX_ERROR),
+						 errmsg("cannot use \"RN\" twice")));
 			num->flag |= NUM_F_ROMAN;
 			break;
 
@@ -1316,6 +1355,13 @@ NUMDesc_prepare(NUMDesc *num, FormatNode *n)
 			num->flag |= NUM_F_EEEE;
 			break;
 	}
+
+	if (IS_ROMAN(num) &&
+		(num->flag & ~(NUM_F_ROMAN | NUM_F_FILLMODE)) != 0)
+		ereport(ERROR,
+				(errcode(ERRCODE_SYNTAX_ERROR),
+				 errmsg("\"RN\" is incompatible with other formats"),
+				 errdetail("\"RN\" may only be used together with \"FM\".")));
 }
 
 /* ----------
@@ -4956,7 +5002,7 @@ int_to_roman(int number)
 			   *result,
 				numstr[12];
 
-	result = (char *) palloc(16);
+	result = (char *) palloc(MAX_ROMAN_LEN + 1);
 	*result = '\0';
 
 	/*
@@ -4966,7 +5012,7 @@ int_to_roman(int number)
 	 */
 	if (number > 3999 || number < 1)
 	{
-		fill_str(result, '#', 15);
+		fill_str(result, '#', MAX_ROMAN_LEN);
 		return result;
 	}
 
@@ -5000,6 +5046,149 @@ int_to_roman(int number)
 	return result;
 }
 
+/*
+ * Convert a roman numeral (standard form) to an integer.
+ * Result is an integer between 1 and 3999.
+ * Np->inout_p is advanced past the characters consumed.
+ *
+ * If input is invalid, return -1.
+ */
+static int
+roman_to_int(NUMProc *Np, int input_len)
+{
+	int			result = 0;
+	int			len;
+	char		romanChars[MAX_ROMAN_LEN];
+	int			romanValues[MAX_ROMAN_LEN];
+	int			repeatCount = 1;
+	int			vCount = 0,
+				lCount = 0,
+				dCount = 0;
+	bool		subtractionEncountered = false,
+				firstNumeralFound = false;
+	int			lastSubtractedValue = 0;
+
+	/*
+	 * Collect and decode valid roman numerals, consuming at most
+	 * MAX_ROMAN_LEN characters. We do this in a separate loop to avoid
+	 * repeated decoding and because the main loop needs to know when it's at
+	 * the last numeral.
+	 */
+	for (len = 0; len < MAX_ROMAN_LEN && !OVERLOAD_TEST; Np->inout_p++)
+	{
+		char		currChar = pg_ascii_toupper(*Np->inout_p);
+		int			currValue = ROMAN_VAL(currChar);
+
+		if (currValue == 0)
+		{
+			if (firstNumeralFound)
+				return -1;		/* Not a valid roman numeral. */
+
+			continue;			/* Skip leading whitespace. */
+		}
+		romanChars[len] = currChar;
+		romanValues[len] = currValue;
+		len++;
+		firstNumeralFound = true;
+	}
+
+	if (len == 0)
+		return -1;				/* No valid roman numerals. */
+
+	/* Check for valid combinations and compute the represented value. */
+	for (int i = 0; i < len; i++)
+	{
+		char		currChar = romanChars[i];
+		int			currValue = romanValues[i];
+
+		/*
+		 * Ensure no character greater than or equal to the subtracted
+		 * character appears after a subtraction.
+		 */
+		if (subtractionEncountered && currValue >= lastSubtractedValue)
+			return -1;
+
+		/* Check for invalid repetitions of characters V, L, or D. */
+		if (currChar == 'V')
+			vCount++;
+		if (currChar == 'L')
+			lCount++;
+		if (currChar == 'D')
+			dCount++;
+		if (vCount > 1 || lCount > 1 || dCount > 1)
+			return -1;
+
+		if (i < len - 1)
+		{
+			char		nextChar = romanChars[i + 1];
+			int			nextValue = romanValues[i + 1];
+
+			/*
+			 * If the current value is less than the next value, handle
+			 * subtraction. Verify valid subtractive combinations and update
+			 * the result accordingly.
+			 */
+			if (currValue < nextValue)
+			{
+				/* Check for invalid repetitions of characters V, L, or D. */
+				if (nextChar == 'V')
+					vCount++;
+				if (nextChar == 'L')
+					lCount++;
+				if (nextChar == 'D')
+					dCount++;
+				if (vCount > 1 || lCount > 1 || dCount > 1)
+					return -1;
+
+				/* V, L, and D should not appear before a larger numeral. */
+				if ((vCount && nextValue > ROMAN_VAL('V')) ||
+					(lCount && nextValue > ROMAN_VAL('L')) ||
+					(dCount && nextValue > ROMAN_VAL('D')))
+					return -1;
+
+				/*
+				 * For cases where same character is repeated with subtraction
+				 * (e.g. 'MCCM' or 'DCCCD').
+				 */
+				if (repeatCount > 1)
+					return -1;
+
+				if (!IS_VALID_SUB_COMB(currChar, nextChar))
+					return -1;
+
+				/*
+				 * Skip the next character as it is part of the subtractive
+				 * combination.
+				 */
+				i++;
+				repeatCount = 1;
+				subtractionEncountered = true;
+				lastSubtractedValue = currValue;
+				result += (nextValue - currValue);
+			}
+			else
+			{
+				/* For same characters, check for repetition. */
+				if (currChar == nextChar)
+				{
+					repeatCount++;
+					if (repeatCount > 3)
+						return -1;
+				}
+				else
+					repeatCount = 1;
+				result += currValue;
+			}
+		}
+		else
+		{
+			/* Add the value of the last character. */
+			result += currValue;
+		}
+	}
+
+	return result;
+}
 
 
 /* ----------
@@ -5112,14 +5301,6 @@ get_last_relevant_decnum(char *num)
 	return result;
 }
 
-/*
- * These macros are used in NUM_processor() and its subsidiary routines.
- * OVERLOAD_TEST: true if we've reached end of input string
- * AMOUNT_TEST(s): true if at least s bytes remain in string
- */
-#define OVERLOAD_TEST	(Np->inout_p >= Np->inout + input_len)
-#define AMOUNT_TEST(s)	(Np->inout_p <= Np->inout + (input_len - (s)))
-
 /* ----------
  * Number extraction for TO_NUMBER()
  * ----------
@@ -5576,29 +5757,6 @@ NUM_processor(FormatNode *node, NUMDesc *Num, char *inout,
 		return strcpy(inout, number);
 	}
 
-	/*
-	 * Roman correction
-	 */
-	if (IS_ROMAN(Np->Num))
-	{
-		if (!Np->is_to_char)
-			ereport(ERROR,
-					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-					 errmsg("\"RN\" not supported for input")));
-
-		Np->Num->lsign = Np->Num->pre_lsign_num = Np->Num->post =
-			Np->Num->pre = Np->out_pre_spaces = Np->sign = 0;
-
-		if (IS_FILLMODE(Np->Num))
-		{
-			Np->Num->flag = 0;
-			Np->Num->flag |= NUM_F_FILLMODE;
-		}
-		else
-			Np->Num->flag = 0;
-		Np->Num->flag |= NUM_F_ROMAN;
-	}
-
 	/*
 	 * Sign
 	 */
@@ -5849,28 +6007,34 @@ NUM_processor(FormatNode *node, NUMDesc *Num, char *inout,
 					break;
 
 				case NUM_RN:
-					if (IS_FILLMODE(Np->Num))
-					{
-						strcpy(Np->inout_p, Np->number_p);
-						Np->inout_p += strlen(Np->inout_p) - 1;
-					}
-					else
-					{
-						sprintf(Np->inout_p, "%15s", Np->number_p);
-						Np->inout_p += strlen(Np->inout_p) - 1;
-					}
-					break;
-
 				case NUM_rn:
-					if (IS_FILLMODE(Np->Num))
+					if (Np->is_to_char)
 					{
-						strcpy(Np->inout_p, asc_tolower_z(Np->number_p));
+						const char *number_p;
+
+						if (n->key->id == NUM_rn)
+							number_p = asc_tolower_z(Np->number_p);
+						else
+							number_p = Np->number_p;
+						if (IS_FILLMODE(Np->Num))
+							strcpy(Np->inout_p, number_p);
+						else
+							sprintf(Np->inout_p, "%15s", number_p);
 						Np->inout_p += strlen(Np->inout_p) - 1;
 					}
 					else
 					{
-						sprintf(Np->inout_p, "%15s", asc_tolower_z(Np->number_p));
-						Np->inout_p += strlen(Np->inout_p) - 1;
+						int			roman_result = roman_to_int(Np, input_len);
+
+						if (roman_result < 0)
+							ereport(ERROR,
+									(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+									 errmsg("invalid Roman numeral")));
+						Np->Num->pre = sprintf(Np->number_p, "%d",
+											   roman_result);
+						Np->number_p += Np->Num->pre;
+						Np->Num->post = 0;
+						continue;	/* roman_to_int ate all the chars */
 					}
 					break;
 
diff --git a/src/test/regress/expected/numeric.out b/src/test/regress/expected/numeric.out
index 0898107ec3..46f8521711 100644
--- a/src/test/regress/expected/numeric.out
+++ b/src/test/regress/expected/numeric.out
@@ -2384,6 +2384,74 @@ SELECT to_number('123456', '99999V99');
  1234.560000000000000000
 (1 row)
 
+-- Test for correct conversion between numbers and Roman numerals
+WITH rows AS
+  (SELECT i, to_char(i, 'RN') AS roman FROM generate_series(1, 3999) AS i)
+SELECT
+  bool_and(to_number(roman, 'RN') = i) as valid
+FROM rows;
+ valid 
+-------
+ t
+(1 row)
+
+-- Some additional tests for RN input
+SELECT to_number('CvIiI', 'rn');
+ to_number 
+-----------
+       108
+(1 row)
+
+SELECT to_number('  MMXX', 'RN');
+ to_number 
+-----------
+      2020
+(1 row)
+
+SELECT to_number('  XIV', '  RN');
+ to_number 
+-----------
+        14
+(1 row)
+
+-- error cases
+SELECT to_number('viv', 'RN');
+ERROR:  invalid Roman numeral
+SELECT to_number('DCCCD', 'RN');
+ERROR:  invalid Roman numeral
+SELECT to_number('XIXL', 'RN');
+ERROR:  invalid Roman numeral
+SELECT to_number('MCCM', 'RN');
+ERROR:  invalid Roman numeral
+SELECT to_number('MMMM', 'RN');
+ERROR:  invalid Roman numeral
+SELECT to_number('VV', 'RN');
+ERROR:  invalid Roman numeral
+SELECT to_number('IL', 'RN');
+ERROR:  invalid Roman numeral
+SELECT to_number('VIX', 'RN');
+ERROR:  invalid Roman numeral
+SELECT to_number('LXC', 'RN');
+ERROR:  invalid Roman numeral
+SELECT to_number('DCM', 'RN');
+ERROR:  invalid Roman numeral
+select to_number('MMMDCM', 'RN');
+ERROR:  invalid Roman numeral
+select to_number('CLXC', 'RN');
+ERROR:  invalid Roman numeral
+SELECT to_number('  XIV  ', '  RN');
+ERROR:  invalid Roman numeral
+SELECT to_number('M CC', 'RN');
+ERROR:  invalid Roman numeral
+SELECT to_number('CM', 'MIRN');
+ERROR:  "RN" is incompatible with other formats
+DETAIL:  "RN" may only be used together with "FM".
+SELECT to_number('CM', 'RNRN');
+ERROR:  cannot use "RN" twice
+SELECT to_number('', 'RN');
+ERROR:  invalid input syntax for type numeric: " "
+SELECT to_number(' ', 'RN');
+ERROR:  invalid Roman numeral
 RESET lc_numeric;
 --
 -- Input syntax
diff --git a/src/test/regress/sql/numeric.sql b/src/test/regress/sql/numeric.sql
index 9da12c6b9e..f954bd3cda 100644
--- a/src/test/regress/sql/numeric.sql
+++ b/src/test/regress/sql/numeric.sql
@@ -1085,6 +1085,38 @@ SELECT to_number('1234.56','L99,999.99');
 SELECT to_number('1,234.56','L99,999.99');
 SELECT to_number('42nd', '99th');
 SELECT to_number('123456', '99999V99');
+
+-- Test for correct conversion between numbers and Roman numerals
+WITH rows AS
+  (SELECT i, to_char(i, 'RN') AS roman FROM generate_series(1, 3999) AS i)
+SELECT
+  bool_and(to_number(roman, 'RN') = i) as valid
+FROM rows;
+
+-- Some additional tests for RN input
+SELECT to_number('CvIiI', 'rn');
+SELECT to_number('  MMXX', 'RN');
+SELECT to_number('  XIV', '  RN');
+-- error cases
+SELECT to_number('viv', 'RN');
+SELECT to_number('DCCCD', 'RN');
+SELECT to_number('XIXL', 'RN');
+SELECT to_number('MCCM', 'RN');
+SELECT to_number('MMMM', 'RN');
+SELECT to_number('VV', 'RN');
+SELECT to_number('IL', 'RN');
+SELECT to_number('VIX', 'RN');
+SELECT to_number('LXC', 'RN');
+SELECT to_number('DCM', 'RN');
+select to_number('MMMDCM', 'RN');
+select to_number('CLXC', 'RN');
+SELECT to_number('  XIV  ', '  RN');
+SELECT to_number('M CC', 'RN');
+SELECT to_number('CM', 'MIRN');
+SELECT to_number('CM', 'RNRN');
+SELECT to_number('', 'RN');
+SELECT to_number(' ', 'RN');
+
 RESET lc_numeric;
 
 --
-- 
2.34.1

#31

Tom Lane

tgl@sss.pgh.pa.us

12 months ago

In reply to: Hunaid Sohail (#30)

Re: [PATCH] Add roman support for to_number function

Hunaid Sohail <hunaidpgml@gmail.com> writes:

I have attached a new patch v8 with the following changes:
1. Fixed cases reported by Maciek.
2. Handles leading spaces in input string.

I pushed this after fooling with it a bit more:

* The way you did the leading-space skip would actually skip
anything whatsoever until it found a valid roman character.
I don't think that's what's wanted here. We just want to
skip whitespace --- the idea more or less is to consume
leading whitespace that to_char('RN') might have emitted.

* You'd also re-introduced failure on trailing whitespace
(or indeed trailing anything), which I still don't agree with.
It's not appropriate for RN to dictate that nothing can follow it.
There's a separate discussion to be had about whether to_number
is too lax about trailing garbage, but the policy should not be
different for RN than it is for other format codes.

* I made some other cosmetic changes such as rearranging the
error checks into an order that made more sense to me.

Thanks for submitting this patch!

regards, tom lane