summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: John Naylor, 2022-07-01 10:28:20 +0000
committer: John Naylor, 2022-07-11 04:11:36 +0000
commit: 3838fa269c15706df2b85ce2d6af8aacd5611655 (patch)
tree: 7c9c473754716f3a4fa661db6a133b9dac6dc309
parent: a6434b951558baad8372dc4b83bf87606dac9cda (diff)
Build de-escaped JSON strings in larger chunks during lexing
During COPY BINARY with large JSONB blobs, it was found that half the time was spent parsing JSON, with much of that spent in separate appendStringInfoChar() calls for each input byte.

Add a lookahead loop to json_lex_string() to allow batching multiple bytes via appendBinaryStringInfo(). Also use this same logic when de-escaping is not done, to avoid code duplication.

Report and proof of concept patch by Jelte Fennema, reworked by Andres Freund and John Naylor.

Discussion: https://www.postgresql.org/message-id/CAGECzQQuXbies_nKgSiYifZUjBk6nOf2%3DTSXqRjj2BhUh8CTeA%40mail.gmail.com
Discussion: https://www.postgresql.org/message-id/flat/PR3PR83MB0476F098CBCF68AF7A1CA89FF7B49@PR3PR83MB0476.EURPRD83.prod.outlook.com
-rw-r--r-- src/common/jsonapi.c 58
1 file changed, 39 insertions, 19 deletions
diff --git a/src/common/jsonapi.c b/src/common/jsonapi.c
index eeedc0645a..694417bb38 100644
--- a/src/common/jsonapi.c
+++ b/src/common/jsonapi.c
@@ -686,15 +686,6 @@ json_lex_string(JsonLexContext *lex)
lex->token_terminator = s;
return JSON_INVALID_TOKEN;
}
- else if (*s == '"')
- break;
- else if ((unsigned char) *s < 32)
- {
- /* Per RFC4627, these characters MUST be escaped. */
- /* Since *s isn't printable, exclude it from the context string */
- lex->token_terminator = s;
- return JSON_ESCAPING_REQUIRED;
- }
else if (*s == '\\')
{
/* OK, we have an escape character. */
@@ -849,22 +840,51 @@ json_lex_string(JsonLexContext *lex)
return JSON_ESCAPING_INVALID;
}
}
- else if (lex->strval != NULL)
+ else
{
+ char *p;
+
if (hi_surrogate != -1)
return JSON_UNICODE_LOW_SURROGATE;
- appendStringInfoChar(lex->strval, *s);
- }
- }
+ /*
+ * Skip to the first byte that requires special handling, so we
+ * can batch calls to appendBinaryStringInfo.
+ */
+ for (p = s; p < end; p++)
+ {
+ if (*p == '\\' || *p == '"')
+ break;
+ else if ((unsigned char) *p < 32)
+ {
+ /* Per RFC4627, these characters MUST be escaped. */
+ /*
+ * Since *p isn't printable, exclude it from the context
+ * string
+ */
+ lex->token_terminator = p;
+ return JSON_ESCAPING_REQUIRED;
+ }
+ }
- if (hi_surrogate != -1)
- return JSON_UNICODE_LOW_SURROGATE;
+ if (lex->strval != NULL)
+ appendBinaryStringInfo(lex->strval, s, p - s);
- /* Hooray, we found the end of the string! */
- lex->prev_token_terminator = lex->token_terminator;
- lex->token_terminator = s + 1;
- return JSON_SUCCESS;
+ if (*p == '"')
+ {
+ /* Hooray, we found the end of the string! */
+ lex->prev_token_terminator = lex->token_terminator;
+ lex->token_terminator = p + 1;
+ return JSON_SUCCESS;
+ }
+
+ /*
+ * s will be incremented at the top of the loop, so set it to just
+ * behind our lookahead position
+ */
+ s = p - 1;
+ }
+ }
}
/*