Jsonpath ** vs lax mode
Hi!
We have a bug report which says that jsonpath ** operator behaves strangely
in the lax mode [1].
Naturally, the result of this query looks counter-intuitive.
# select jsonb_path_query_array('[{"a": 1, "b": [{"a": 2}]}]', 'lax
$.**.a');
jsonb_path_query_array
------------------------
[1, 1, 2, 2]
(1 row)
But actually, everything works as designed. ** operator reports both
objects and wrapping arrays, while object key accessor automatically
unwraps arrays.
# select x, jsonb_path_query_array(x, '$.a') from jsonb_path_query('[{"a":
1, "b": [{"a": 2}]}]', 'lax $.**') x;
x | jsonb_path_query_array
-----------------------------+------------------------
[{"a": 1, "b": [{"a": 2}]}] | [1]
{"a": 1, "b": [{"a": 2}]} | [1]
1 | []
[{"a": 2}] | [2]
{"a": 2} | [2]
2 | []
(6 rows)
At first sight, we may just say that lax mode just sucks and
counter-intuitive results are expected. But at the second sight, the lax
mode is used by default and current behavior may look too surprising.
My proposal is to make everything after the ** operator use strict mode
(patch attached). I think this shouldn't be backpatched, just applied to
the v14. Other suggestions?
Links
1.
/messages/by-id/16828-2b0229babfad2d8c@postgresql.org
------
Regards,
Alexander Korotkov
Attachments:
jsonpath_double_star_strict_mode.patchapplication/octet-stream; name=jsonpath_double_star_strict_mode.patchDownload
commit a3bd6a3c34569cd12be7259947bbef2f52cc99bc
Author: Alexander Korotkov <akorotkov@postgresql.org>
Date: Wed Jan 20 20:03:05 2021 +0300
Initial.
Reported-by:
Bug:
Discussion:
Author:
Reviewed-by:
Tested-by:
Backpatch-through:
diff --git a/src/backend/utils/adt/jsonpath_exec.c b/src/backend/utils/adt/jsonpath_exec.c
index 4d185c27b47..5d7b0752bc0 100644
--- a/src/backend/utils/adt/jsonpath_exec.c
+++ b/src/backend/utils/adt/jsonpath_exec.c
@@ -198,7 +198,8 @@ static JsonPathBool executeNestedBoolItem(JsonPathExecContext *cxt,
static JsonPathExecResult executeAnyItem(JsonPathExecContext *cxt,
JsonPathItem *jsp, JsonbContainer *jbc, JsonValueList *found,
uint32 level, uint32 first, uint32 last,
- bool ignoreStructuralErrors, bool unwrapNext);
+ bool ignoreStructuralErrors, bool unwrapNext,
+ bool strictMode);
static JsonPathBool executePredicate(JsonPathExecContext *cxt,
JsonPathItem *pred, JsonPathItem *larg, JsonPathItem *rarg,
JsonbValue *jb, bool unwrapRightArg,
@@ -860,7 +861,7 @@ executeItemOptUnwrapTarget(JsonPathExecContext *cxt, JsonPathItem *jsp,
return executeAnyItem
(cxt, hasNext ? &elem : NULL,
jb->val.binary.data, found, 1, 1, 1,
- false, jspAutoUnwrap(cxt));
+ false, jspAutoUnwrap(cxt), false);
}
else if (unwrap && JsonbType(jb) == jbvArray)
return executeItemUnwrapTargetArray(cxt, jsp, jb, found, false);
@@ -926,12 +927,16 @@ executeItemOptUnwrapTarget(JsonPathExecContext *cxt, JsonPathItem *jsp,
if (jsp->content.anybounds.first == 0)
{
bool savedIgnoreStructuralErrors;
+ bool savedLaxMode;
savedIgnoreStructuralErrors = cxt->ignoreStructuralErrors;
+ savedLaxMode = cxt->laxMode;
cxt->ignoreStructuralErrors = true;
+ cxt->laxMode = false;
res = executeNextItem(cxt, jsp, &elem,
jb, found, true);
cxt->ignoreStructuralErrors = savedIgnoreStructuralErrors;
+ cxt->laxMode = savedLaxMode;
if (res == jperOk && !found)
break;
@@ -944,7 +949,7 @@ executeItemOptUnwrapTarget(JsonPathExecContext *cxt, JsonPathItem *jsp,
1,
jsp->content.anybounds.first,
jsp->content.anybounds.last,
- true, jspAutoUnwrap(cxt));
+ true, false, true);
break;
}
@@ -1128,7 +1133,7 @@ executeItemUnwrapTargetArray(JsonPathExecContext *cxt, JsonPathItem *jsp,
return executeAnyItem
(cxt, jsp, jb->val.binary.data, found, 1, 1, 1,
- false, unwrapElements);
+ false, unwrapElements, false);
}
/*
@@ -1375,7 +1380,7 @@ executeNestedBoolItem(JsonPathExecContext *cxt, JsonPathItem *jsp,
static JsonPathExecResult
executeAnyItem(JsonPathExecContext *cxt, JsonPathItem *jsp, JsonbContainer *jbc,
JsonValueList *found, uint32 level, uint32 first, uint32 last,
- bool ignoreStructuralErrors, bool unwrapNext)
+ bool ignoreStructuralErrors, bool unwrapNext, bool strictMode)
{
JsonPathExecResult res = jperNotFound;
JsonbIterator *it;
@@ -1410,17 +1415,18 @@ executeAnyItem(JsonPathExecContext *cxt, JsonPathItem *jsp, JsonbContainer *jbc,
/* check expression */
if (jsp)
{
- if (ignoreStructuralErrors)
- {
- bool savedIgnoreStructuralErrors;
+ bool savedIgnoreStructuralErrors;
+ bool savedLaxMode;
- savedIgnoreStructuralErrors = cxt->ignoreStructuralErrors;
+ savedIgnoreStructuralErrors = cxt->ignoreStructuralErrors;
+ savedLaxMode = cxt->laxMode;
+ if (ignoreStructuralErrors)
cxt->ignoreStructuralErrors = true;
- res = executeItemOptUnwrapTarget(cxt, jsp, &v, found, unwrapNext);
- cxt->ignoreStructuralErrors = savedIgnoreStructuralErrors;
- }
- else
- res = executeItemOptUnwrapTarget(cxt, jsp, &v, found, unwrapNext);
+ if (strictMode)
+ cxt->laxMode = false;
+ res = executeItemOptUnwrapTarget(cxt, jsp, &v, found, unwrapNext);
+ cxt->ignoreStructuralErrors = savedIgnoreStructuralErrors;
+ cxt->laxMode = savedLaxMode;
if (jperIsError(res))
break;
@@ -1439,7 +1445,7 @@ executeAnyItem(JsonPathExecContext *cxt, JsonPathItem *jsp, JsonbContainer *jbc,
res = executeAnyItem
(cxt, jsp, v.val.binary.data, found,
level + 1, first, last,
- ignoreStructuralErrors, unwrapNext);
+ ignoreStructuralErrors, unwrapNext, strictMode);
if (jperIsError(res))
break;
diff --git a/src/test/regress/expected/jsonb_jsonpath.out b/src/test/regress/expected/jsonb_jsonpath.out
index 508ddd797ed..d0dfa4f076b 100644
--- a/src/test/regress/expected/jsonb_jsonpath.out
+++ b/src/test/regress/expected/jsonb_jsonpath.out
@@ -707,6 +707,20 @@ select jsonb_path_query('{"a": {"c": {"b": 1}}}', 'lax $.**{2 to 3}.b ? (@ > 0)'
1
(1 row)
+select jsonb_path_query('[{"a": 1, "b": [{"a": 2}]}]', 'lax $.**.a');
+ jsonb_path_query
+------------------
+ 1
+ 2
+(2 rows)
+
+select jsonb_path_query('[{"a": 1, "b": [{"a": 2}]}]', 'strict $.**.a');
+ jsonb_path_query
+------------------
+ 1
+ 2
+(2 rows)
+
select jsonb '{"a": {"b": 1}}' @? '$.**.b ? ( @ > 0)';
?column?
----------
diff --git a/src/test/regress/sql/jsonb_jsonpath.sql b/src/test/regress/sql/jsonb_jsonpath.sql
index 60f73cb0590..d8d89d73739 100644
--- a/src/test/regress/sql/jsonb_jsonpath.sql
+++ b/src/test/regress/sql/jsonb_jsonpath.sql
@@ -136,6 +136,8 @@ select jsonb_path_query('{"a": {"c": {"b": 1}}}', 'lax $.**{0 to last}.b ? (@ >
select jsonb_path_query('{"a": {"c": {"b": 1}}}', 'lax $.**{1 to last}.b ? (@ > 0)');
select jsonb_path_query('{"a": {"c": {"b": 1}}}', 'lax $.**{1 to 2}.b ? (@ > 0)');
select jsonb_path_query('{"a": {"c": {"b": 1}}}', 'lax $.**{2 to 3}.b ? (@ > 0)');
+select jsonb_path_query('[{"a": 1, "b": [{"a": 2}]}]', 'lax $.**.a');
+select jsonb_path_query('[{"a": 1, "b": [{"a": 2}]}]', 'strict $.**.a');
select jsonb '{"a": {"b": 1}}' @? '$.**.b ? ( @ > 0)';
select jsonb '{"a": {"b": 1}}' @? '$.**{0}.b ? ( @ > 0)';
On 2021-Jan-20, Alexander Korotkov wrote:
My proposal is to make everything after the ** operator use strict mode
(patch attached). I think this shouldn't be backpatched, just applied to
the v14. Other suggestions?
I think changing the mode midway through the operation is strange. What
do you think of requiring for ** that mode is strict? That is, if ** is
used and the mode is lax, an error is thrown.
Thanks
--
�lvaro Herrera Valdivia, Chile
Hi, Alvaro!
Thank you for your feedback.
On Wed, Jan 20, 2021 at 9:16 PM Alvaro Herrera <alvherre@alvh.no-ip.org> wrote:
On 2021-Jan-20, Alexander Korotkov wrote:
My proposal is to make everything after the ** operator use strict mode
(patch attached). I think this shouldn't be backpatched, just applied to
the v14. Other suggestions?I think changing the mode midway through the operation is strange. What
do you think of requiring for ** that mode is strict? That is, if ** is
used and the mode is lax, an error is thrown.
Yes, changing mode in midway is a bit strange.
Requiring strict mode for ** is a solution, but probably too restrictive...
What do you think about making just subsequent accessor after ** not
to unwrap arrays. That would be a bit tricky to implement, but
probably that would better satisfy the user needs.
------
Regards,
Alexander Korotkov
Alexander Korotkov schrieb am 20.01.2021 um 18:13:
We have a bug report which says that jsonpath ** operator behaves strangely in the lax mode [1].
That report was from me ;)
Thanks for looking into it.
At first sight, we may just say that lax mode just sucks and
counter-intuitive results are expected. But at the second sight, the
lax mode is used by default and current behavior may look too
surprising.
I personally would be fine with the manual stating that the Postgres extension
to the JSONPath processing that allows a recursive lookup using ** requires strict
mode to work properly.
It should probably be documented in chapter 9.16.2 "The SQL/JSON Path Language",
maybe with a little warning in the description of jsonb_path_query** and in
chapter 8.14.16 as well (or at least that's were I would expect such a warning)
Regards
Thomas
On 2021-Jan-21, Alexander Korotkov wrote:
Requiring strict mode for ** is a solution, but probably too restrictive...
What do you think about making just subsequent accessor after ** not
to unwrap arrays. That would be a bit tricky to implement, but
probably that would better satisfy the user needs.
Hmm, why is it too restrictive? If the user needs to further drill into
the JSON, can't they chain json_path_query calls, specifying (or
defaulting to) lax mode for the part doesn't include the ** expression?
--
�lvaro Herrera Valdivia, Chile
On Thu, Jan 21, 2021 at 12:38 PM Thomas Kellerer <shammat@gmx.net> wrote:
Alexander Korotkov schrieb am 20.01.2021 um 18:13:
We have a bug report which says that jsonpath ** operator behaves strangely in the lax mode [1].
That report was from me ;)
Thanks for looking into it.
At first sight, we may just say that lax mode just sucks and
counter-intuitive results are expected. But at the second sight, the
lax mode is used by default and current behavior may look too
surprising.I personally would be fine with the manual stating that the Postgres extension
to the JSONPath processing that allows a recursive lookup using ** requires strict
mode to work properly.It should probably be documented in chapter 9.16.2 "The SQL/JSON Path Language",
maybe with a little warning in the description of jsonb_path_query** and in
chapter 8.14.16 as well (or at least that's were I would expect such a warning)
Thank you for reporting :)
Yeah, documenting the current behavior is something "must have". If
even we find the appropriate behavior change, I don't think it would
be backpatchable. But we need to backpatch the documentation for
sure. So, let's start by fixing the docs.
------
Regards,
Alexander Korotkov
On Thu, Jan 21, 2021 at 4:35 PM Alvaro Herrera <alvherre@alvh.no-ip.org> wrote:
On 2021-Jan-21, Alexander Korotkov wrote:
Requiring strict mode for ** is a solution, but probably too restrictive...
What do you think about making just subsequent accessor after ** not
to unwrap arrays. That would be a bit tricky to implement, but
probably that would better satisfy the user needs.Hmm, why is it too restrictive? If the user needs to further drill into
the JSON, can't they chain json_path_query calls, specifying (or
defaulting to) lax mode for the part doesn't include the ** expression?
For sure, there are some walkarounds. But I don't think all the
lax-mode queries involving ** are affected. So, it might happen that
we force users to use strict-mode or chain call even if it's not
necessary. I'm tending to just fix the doc and wait if there are mode
complaints :)
------
Regards,
Alexander Korotkov
On Mon, Jan 25, 2021 at 6:33 PM Alexander Korotkov <aekorotkov@gmail.com> wrote:
On Thu, Jan 21, 2021 at 4:35 PM Alvaro Herrera <alvherre@alvh.no-ip.org> wrote:
On 2021-Jan-21, Alexander Korotkov wrote:
Requiring strict mode for ** is a solution, but probably too restrictive...
What do you think about making just subsequent accessor after ** not
to unwrap arrays. That would be a bit tricky to implement, but
probably that would better satisfy the user needs.Hmm, why is it too restrictive? If the user needs to further drill into
the JSON, can't they chain json_path_query calls, specifying (or
defaulting to) lax mode for the part doesn't include the ** expression?For sure, there are some walkarounds. But I don't think all the
lax-mode queries involving ** are affected. So, it might happen that
we force users to use strict-mode or chain call even if it's not
necessary. I'm tending to just fix the doc and wait if there are mode
complaints :)
The patch, which clarifies this situation in the docs is attached.
I'm going to push it if no objections.
------
Regards,
Alexander Korotkov
Attachments:
jsonpath-double-star-lax-docs.patchapplication/octet-stream; name=jsonpath-double-star-lax-docs.patchDownload
diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index fd0370a1b43..7a748392cbc 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -16236,6 +16236,24 @@ strict $.track.segments[*].location
</programlisting>
</para>
+ <para>
+ Note that <literal>.**</literal> accessor could lead to surprising results
+ when using the lax mode. For instance, the following query selects every
+ <literal>HR</literal> value twice.
+<programlisting>
+lax $.**.HR
+</programlisting>
+ This happens because <literal>.**</literal> accessor selects both
+ <literal>segments</literal> array and each of its elements, while
+ <literal>.HR</literal> accessor automatically unwraps arrays when using
+ the lax mode. To evade surprising results, we recommend the usage of
+ <literal>.**</literal> accessor in the strict mode. The following query
+ selects each <literal>HR</literal> value once.
+<programlisting>
+strict $.**.HR
+</programlisting>
+ </para>
+
</sect3>
<sect3 id="functions-sqljson-path-operators">
Alexander Korotkov <aekorotkov@gmail.com> writes:
The patch, which clarifies this situation in the docs is attached.
I'm going to push it if no objections.
+1, but the English in this seems a bit shaky. Perhaps more
like the attached?
regards, tom lane
Attachments:
jsonpath-double-star-lax-docs-2.patchtext/x-diff; charset=us-ascii; name=jsonpath-double-star-lax-docs-2.patchDownload
diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index 4342c8e74f..de1b1b6deb 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -16277,6 +16277,24 @@ strict $.track.segments[*].location
</programlisting>
</para>
+ <para>
+ The <literal>.**</literal> accessor can lead to surprising results
+ when using the lax mode. For instance, the following query selects every
+ <literal>HR</literal> value twice:
+<programlisting>
+lax $.**.HR
+</programlisting>
+ This happens because the <literal>.**</literal> accessor selects both
+ the <literal>segments</literal> array and each of its elements, while
+ the <literal>.HR</literal> accessor automatically unwraps arrays when
+ using the lax mode. To avoid surprising results, we recommend using
+ the <literal>.**</literal> accessor only in the strict mode. The
+ following query selects each <literal>HR</literal> value just once:
+<programlisting>
+strict $.**.HR
+</programlisting>
+ </para>
+
</sect3>
<sect3 id="functions-sqljson-path-operators">