/*-------------------------------------------------------------------------
 *
 * json.c
 *	  Core JSON manipulation routines used by JSON data type support.
 *
 * Copyright (c) 2010, PostgreSQL Global Development Group
 * Written by Joey Adams.
 *
 *-------------------------------------------------------------------------
 */

#include "json.h"
#include "util.h"

/* isdigit() is used by the number validator in this file. */
#include <ctype.h>

#include "mb/pg_wchar.h"

/* True for node types that carry a child list (arrays and objects). */
#define is_internal(node) ((node)->type == JSON_ARRAY || (node)->type == JSON_OBJECT)

/* We can't use isspace() because it also accepts \v and \f, which
   aren't legal whitespace characters in strict JSON. */
#define is_whitespace(c) ((c)==' ' || (c)=='\t' || (c)=='\n' || (c)=='\r')

/*
 * Advance *sp past any run of JSON whitespace (space, tab, LF, CR).
 */
static void
skip_whitespace(const char **sp)
{
	const char *s = *sp;

	while (is_whitespace(*s))
		s++;

	*sp = s;
}

/*
 * Return the closing bracket character that matches an array or object node.
 *
 * Any other node type trips an assertion; in non-assert builds '\0' is
 * returned instead.
 */
static char
end_parenthesis(JSON * node)
{
	Assert(node != NULL);

	switch (node->type)
	{
		case JSON_ARRAY:
			return ']';
		case JSON_OBJECT:
			return '}';
		default:
			Assert(false);
			return '\0';
	}
}

/*
 * Reads exactly 4 hex characters (capital or lowercase).
 * Writes the result to *out .
 * Returns true on success, false if any input chars are not [0-9A-Fa-f] .
 *
 * Reading stops at the first non-hex character, so a NUL terminator inside
 * the 4-character window is never read past.  On failure *out holds the
 * partial value accumulated before the offending character.
 */
static bool
read_hex16(const char *in, unsigned int *out)
{
	unsigned int i;
	unsigned int tmp;
	char		c;

	*out = 0;

	for (i = 0; i < 4; i++)
	{
		c = *in++;
		if (c >= '0' && c <= '9')
			tmp = c - '0';
		else if (c >= 'A' && c <= 'F')
			tmp = c - 'A' + 10;
		else if (c >= 'a' && c <= 'f')
			tmp = c - 'a' + 10;
		else
			return false;

		*out <<= 4;
		*out += tmp;
	}

	return true;
}

/*
 * Encodes a 16-bit number into hexadecimal,
 * writing exactly 4 hex chars.
*/ static void write_hex16(char *out, unsigned int val) { const char *hex = "0123456789ABCDEF"; *out++ = hex[(val >> 12) & 0xF]; *out++ = hex[(val >> 8) & 0xF]; *out++ = hex[(val >> 4) & 0xF]; *out++ = hex[val & 0xF]; } /*********** JSON creation, manipulation, and deletion **********/ JSON * json_mknode(json_type type) { JSON *node = palloc(sizeof(*node)); memset(node, 0, sizeof(*node)); node->type = type; return node; } JSON * json_mkbool(bool v_bool) { JSON *node = json_mknode(JSON_BOOL); node->v.v_bool = v_bool; return node; } JSON * json_mkstring(const char *str, size_t length) { JSON *node = json_mknode(JSON_STRING); if (str) { node->v.string.str = pnstrdup(str, length); node->v.string.length = length; } return node; } JSON * json_mknumber(const char *number, size_t length) { JSON *node = json_mknode(JSON_NUMBER); if (number) node->v.number = pnstrdup(number, length); return node; } /* * Indicate that the node's value has changed, * marking ancestors as necessary. * * Call json_touch_value so that json_encode(..., JSONOPT_ORIG) * will encode the new value rather than using original text. */ void json_touch_value(JSON * node) { while (node && node->orig.value.start) { node->orig.value.start = NULL; node = node->parent; } } /* * Add child to parent, but don't clear orig pointers of ancestors. * * This is used by json_decode to ensure that original text segments * are preserved while building the JSON tree. */ static void json_append_notouch(JSON * parent, JSON * child) { Assert(parent->type == JSON_ARRAY || parent->type == JSON_OBJECT); Assert(child->parent == NULL); parent->v.children.count++; child->parent = parent; child->prev = parent->v.children.tail; child->next = NULL; if (parent->v.children.tail) { parent->v.children.tail->next = child; parent->v.children.tail = child; } else { parent->v.children.head = parent->v.children.tail = child; } } /* * json_append * Add child to parent, putting it at the end of its child list. 
*
 * Child must not already have another parent.
 */
void
json_append(JSON * parent, JSON * child)
{
	/* Link first, then invalidate the cached original text of the parent. */
	json_append_notouch(parent, child);
	json_touch_value(parent);
}

/*
 * json_remove
 *		Remove node from its parent, but do not delete it.
 *
 * A node without a parent is left untouched.
 */
void
json_remove(JSON * node)
{
	JSON	   *owner = node->parent;
	JSON	   *before;
	JSON	   *after;

	if (owner == NULL)
		return;

	Assert(owner->type == JSON_ARRAY || owner->type == JSON_OBJECT);
	Assert(owner->v.children.count > 0);

	before = node->prev;
	after = node->next;

	/* Bridge the gap on the left side (or move the list head). */
	if (before == NULL)
		owner->v.children.head = after;
	else
		before->next = after;

	/* Bridge the gap on the right side (or move the list tail). */
	if (after == NULL)
		owner->v.children.tail = before;
	else
		after->prev = before;

	owner->v.children.count--;

	/* Fully detach the node. */
	node->next = NULL;
	node->prev = NULL;
	node->parent = NULL;

	json_touch_value(owner);
}

/*
 * Update the value of a node, preserving position and key information.
 *
 * Note well: If replacement is an array or object with children, the parent
 * pointers of those children will be incorrect
 * (they'll still refer to their original parent).
 *
 * Untrustworthy parent pointers is the price to pay for
 * being able to copy JSON values by reference.
*/ void json_replace_value(JSON * node, JSON * replacement) { node->type = replacement->type; node->v = replacement->v; node->orig.value = replacement->orig.value; if (node->parent) json_touch_value(node->parent); } const char * json_get_string(JSON * node, size_t *length_out) { Assert(node->type == JSON_STRING); if (length_out) *length_out = node->v.string.length; return node->v.string.str; } void json_set_string(JSON * node, const char *str, size_t length) { Assert(node->type == JSON_STRING); if (node->v.string.str) pfree(node->v.string.str); if (str) { node->v.string.str = pnstrdup(str, length); node->v.string.length = length; } else { node->v.string.str = NULL; node->v.string.length = 0; } json_touch_value(node); } const char * json_get_number(JSON * node) { Assert(node->type == JSON_NUMBER); return node->v.number; } void json_set_number(JSON * node, const char *number, size_t length) { Assert(node->type == JSON_NUMBER); if (node->v.number) pfree(node->v.number); if (number) node->v.number = pnstrdup(number, length); else node->v.number = NULL; json_touch_value(node); } /* Non-recursively free a node */ static void free_node(JSON * node) { if (node->type == JSON_STRING) { if (node->v.string.str) pfree(node->v.string.str); } else if (node->type == JSON_NUMBER) { if (node->v.number) pfree(node->v.number); } if (node->key) pfree(node->key); pfree(node); } /* * Free a JSON node and all its descendants. * * Do not use this function if you have performed json_replace_value on * a descendant, as this function relies on each node's ->parent field * being trustworthy. */ static void json_delete(JSON * node) { JSON *parent, *next; if (node == NULL) return; /* Remove node from parent (if it has one). 
*/ json_remove(node); descend: while (is_internal(node) && node->v.children.head) node = node->v.children.head; advance: parent = node->parent; next = node->next; free_node(node); node = next; if (node != NULL) { goto descend; } else { node = parent; if (node != NULL) goto advance; else return; } } /*********************** Parsing and validation **********************/ static JSON *decode_leaf(const char **sp); static JSON *decode_number(const char **sp); /* * json_decode_string has a different signature than its friends * because it's also used to parse object member keys. * It's also useful outside of json.c, such as in jsonpath.c . */ char *json_decode_string(const char **sp, size_t *length, bool strict); /* * json_validate * Make sure the given UTF-8 string is valid JSON. * * TODO: Consider making a dedicated function for this so we don't have to * convert to UTF-8, build a JSON node, then free both * whenever we need to validate (such as in json_in and json_recv). */ bool json_validate(const char *str) { JSON *node = json_decode(str); if (node == NULL) return false; json_delete(node); return true; } /* * json_validate_server_encoded * Variant of json_validate that takes a server-encoded string * rather than a UTF-8 string. * * Note that a dedicated json_validate (described in the TODO above) * would be able to handle both encodings natively, since both are * ASCII-compatible. */ bool json_validate_server_encoded(const char *str) { char *str_utf8 = server_to_utf8(str, strlen(str)); bool result = json_validate(str_utf8); if (str_utf8 != str) pfree(str_utf8); return result; } /* * json_decode * Convert a JSON-encoded string to a JSON node. * @str must be valid UTF-8. 
*/
JSON *
json_decode(const char *str)
{
	/*
	 * The parser is a goto-driven state machine with the states 'item',
	 * 'comma_endp', 'endp', 'end' and 'failed'.  'parent' is the container
	 * currently being filled (NULL while parsing the top-level value).
	 */
	JSON	   *root = NULL,
			   *parent = NULL,
			   *node = NULL;
	const char *s = str;
	char	   *key;
	size_t		key_length;
	struct json_orig orig;
	bool		expect_endp;

	/* A NULL input is reported as a parse failure. */
	if (str == NULL)
		return NULL;

	Assert(utf8_validate(str, strlen(str)));

	expect_endp = false;
	goto item;

item:							/* Expect a value (set expect_endp before goto
								 * item; ) */
	key = NULL;
	key_length = 0;
	memset(&orig, 0, sizeof(orig));

	/* Both spans provisionally start here; left_space is reset after a key. */
	orig.key_left_space.start = s;
	orig.left_space.start = s;

	skip_whitespace(&s);

	/*
	 * expect_endp is true only directly after an opening bracket/brace, so
	 * this is what allows empty containers while still rejecting a closing
	 * bracket right after a comma.
	 */
	if (expect_endp)
	{
		if (*s == ']' || *s == '}')
			goto endp;
	}

	if (parent != NULL && parent->type == JSON_OBJECT)
	{
		/* Parse member key string. */
		orig.key_left_space.end = s;
		orig.key.start = s;

		key = json_decode_string(&s, &key_length, true);
		if (key == NULL)
			goto failed;

		orig.key.end = s;
		orig.key_right_space.start = s;

		/* Eat the " : " */
		skip_whitespace(&s);
		if (*s != ':')
			goto failed;

		orig.key_right_space.end = s;
		s++;
		orig.left_space.start = s;

		skip_whitespace(&s);
	}

	/*
	 * The way orig.value and company are initialized is a bit funky. If this
	 * node has children, we have to finish parsing the node's children before
	 * we know where it ends. Hence, initialization of orig.value_end and
	 * after will be deferred if this node has children.
	 */

	orig.left_space.end = s;
	orig.value.start = s;

	node = decode_leaf(&s);
	if (node == NULL)
	{
		/* Not a leaf: it must open an array or object, else it is invalid. */
		if (*s == '[')
			node = json_mknode(JSON_ARRAY);
		else if (*s == '{')
			node = json_mknode(JSON_OBJECT);
		else
			goto failed;
		s++;

		/*
		 * orig.value.end and later are dangling (actually NULL) for now, but
		 * will be initialized when we get to state 'endp' .
		 */
	}
	else
	{
		orig.value.end = s;
		orig.right_space.start = s;

		skip_whitespace(&s);

		orig.right_space.end = s;
	}

	node->key = key;
	node->key_length = key_length;

	/*
	 * The key now belongs to the node. This prevents a double free on
	 * failure (see the failed: label).
	 */
	key = NULL;

	node->orig = orig;

	/* Attach the node at once so that json_delete(root) can reach it. */
	if (parent != NULL)
		json_append_notouch(parent, node);
	else
		root = node;

	if (is_internal(node))
	{
		/*
		 * "push" node onto the "stack". Nodes point up to their parents,
		 * which is why this function doesn't need a "stack" per se.
		 */
		parent = node;

		expect_endp = true;
		goto item;
	}

	/* A leaf inside a container needs ',' or a closer; a top-level leaf ends. */
	if (parent != NULL)
		goto comma_endp;
	else
		goto end;

comma_endp:					/* Expect a comma or end bracket/brace */
	if (*s == ',')
	{
		s++;

		/* After a comma another value is mandatory (no trailing commas). */
		expect_endp = false;
		goto item;
	}
	if (*s == ']' || *s == '}')
		goto endp;

	goto failed;

endp:							/* Handle an end bracket/brace */
	/* The closer must match the type of the container being closed. */
	if (*s != end_parenthesis(parent))
		goto failed;
	s++;

	/* "pop" a node from the "stack" */
	node = parent;
	parent = parent->parent;

	/*
	 * The other pointers were set when we started parsing this node in the
	 * 'item' state.
	 */
	node->orig.value.end = s;
	node->orig.right_space.start = s;

	skip_whitespace(&s);

	node->orig.right_space.end = s;

	if (parent != NULL)
		goto comma_endp;
	else
		goto end;

end:							/* Expect end of text */
	if (*s != '\0')
		goto failed;
	/* This state is only entered with parent == NULL, so node is the root. */
	return node;

failed:						/* Handle failure */
	/* key is non-NULL only if it was decoded but not yet handed to a node. */
	if (key != NULL)
		pfree(key);
	json_delete(root);
	return NULL;
}

/*
 * Decode and skip a node that does not have children.
 * Whitespace is not skipped first (it is done in the primary decode loop).
 *
 * Returns NULL if next character is '[', '{', or invalid.
*/ static JSON * decode_leaf(const char **sp) { char c = **sp; if (c == '"') { size_t length; char *str = json_decode_string(sp, &length, true); if (str != NULL) { JSON *node = json_mknode(JSON_STRING); node->v.string.str = str; node->v.string.length = length; return node; } return NULL; } if ((c >= '0' && c <= '9') || c == '-') return decode_number(sp); if (strncmp(*sp, "true", 4) == 0) { (*sp) += 4; return json_mkbool(true); } if (strncmp(*sp, "false", 5) == 0) { (*sp) += 5; return json_mkbool(false); } if (strncmp(*sp, "null", 4) == 0) { (*sp) += 4; return json_mknode(JSON_NULL); } return NULL; } /* * The JSON spec says that a number shall follow this precise pattern * (spaces and quotes added for readability): * '-'? (0 | [1-9][0-9]*) ('.' [0-9]+)? ([Ee] [+-]? [0-9]+)? * * However, some JSON parsers are more liberal. For instance, PHP accepts * '.5' and '1.'. JSON.parse accepts '+3'. * * This function takes the strict approach. */ static bool validate_number(const char **sp) { const char *s = *sp; /* '-'? */ if (*s == '-') s++; /* (0 | [1-9][0-9]*) */ if (*s == '0') { s++; } else { if (!isdigit(*s)) return false; do s++; while (isdigit(*s)); } /* ('.' [0-9]+)? */ if (*s == '.') { s++; if (!isdigit(*s)) return false; do s++; while (isdigit(*s)); } /* ([Ee] [+-]? [0-9]+)? */ if (*s == 'E' || *s == 'e') { s++; if (*s == '+' || *s == '-') s++; if (!isdigit(*s)) return false; do s++; while (isdigit(*s)); } *sp = s; return true; } static JSON * decode_number(const char **sp) { const char *start, *end; start = *sp; if (!validate_number(sp)) return NULL; end = *sp; return json_mknumber(start, end - start); } /* * json_decode_string * If you're interested in the decoding JSON in general, see json_decode. * * Decodes a JSON string literal (e.g. "\"hello\""). * * If strict is true, string must be double-quoted, * as is required by the JSON RFC. * Otherwise (e.g. if parsing something JSON-like, such as JSONPath), * the string may be single- or double-quoted. 
* * Also, no whitespace skipping is done, so the caller should only * call this function when it expects **sp to be either " or ' * * On success, returns the decoded string, passes the decoded string's * length through *length (which must not be NULL), and advances *sp to point * to the end of string literal (after the closing quote character). * * On failure (parse error), returns NULL and * leaves *length and *sp untouched. */ char * json_decode_string(const char **sp, size_t *length, bool strict) { const char *s = *sp; StringInfoData ret; char buf[4]; int len; char quote; Assert(length != NULL); initStringInfo(&ret); quote = *s++; if (strict) { if (quote != '"') return NULL; } else { if (quote != '"' && quote != '\'') return NULL; } while (*s != '\0' && *s != quote) { unsigned char c = *s++; unsigned int uc; unsigned int lc; if (c == '\\') { c = *s++; switch (c) { case '\\': case '/': break; case 'b': c = '\b'; break; case 'f': c = '\f'; break; case 'n': c = '\n'; break; case 'r': c = '\r'; break; case 't': c = '\t'; break; case 'u': if (!read_hex16(s, &uc)) goto failed; s += 4; if (uc >= 0xD800 && uc <= 0xDFFF) { /* Handle UTF-16 surrogate pair. */ if (uc >= 0xDC00) goto failed; /* Second surrogate not * preceded by first * surrogate. */ if (s[0] != '\\' || s[1] != 'u' || !read_hex16(s + 2, &lc) || !(lc >= 0xDC00 && lc <= 0xDFFF)) goto failed; /* First surrogate not * followed by second * surrogate. */ s += 6; uc = 0x10000 + (((uc & 0x3FF) << 10) | (lc & 0x3FF)); } unicode_to_utf8(uc, (unsigned char *) buf); len = pg_utf_mblen((unsigned char *) buf); Assert(len > 0); appendBinaryStringInfo(&ret, buf, len); continue; /* Continue the enclosing while loop to skip * the str_append below. */ default: /* Invalid escape */ if (c == quote) break; if (!strict && (c == '"' || c == '\'')) break; goto failed; /* Invalid escape */ } } else if (c <= 0x1F) { /* Control characters not allowed in string literals. 
*/ goto failed; } appendStringInfoChar(&ret, c); } if (*s++ != quote) goto failed; *length = ret.len; *sp = s; return ret.data; failed: pfree(ret.data); return NULL; } /* * json_text_type * Determines the type of a JSON string without fully decoding it. * Expects the given string to be valid JSON. * Might return JSON_INVALID if something is wrong with the input. */ json_type json_text_type(const char *str, size_t nbytes) { const char *s = str; const char *e = str + nbytes; char c; /* Skip whitespace characters. */ while (s < e && is_whitespace(*s)) s++; /* Get first non-white character, making sure it's in bounds. */ if (s >= e) return JSON_INVALID; c = *s; switch (c) { case 'n': return JSON_NULL; case '"': return JSON_STRING; case 't': case 'f': return JSON_BOOL; case '{': return JSON_OBJECT; case '[': return JSON_ARRAY; default: if (c == '-' || (c >= '0' && c <= '9')) return JSON_NUMBER; return JSON_INVALID; } } /****************************** Encoding *****************************/ /* * encode_string * Variant of json_encode_string that writes its output to a StringInfo. */ static void encode_string(StringInfo out, const char *string, size_t length, char quote, bool escape_unicode) { const char *s = string; const char *e = s + length; Assert(quote != '\\'); if (escape_unicode) Assert(utf8_validate(string, length)); appendStringInfoChar(out, quote); while (s < e) { unsigned char c = *s++; unsigned char endchar; switch (c) { case '\\': endchar = '\\'; break; case '\b': endchar = 'b'; break; case '\f': endchar = 'f'; break; case '\n': endchar = 'n'; break; case '\r': endchar = 'r'; break; case '\t': endchar = 't'; break; default: { if (c == quote) { endchar = quote; break; } if (c < 0x1F || (c >= 0x80 && escape_unicode)) { /* Encode using \u.... 
*/ pg_wchar uc; unsigned int lc; char txt[13]; s--; uc = utf8_decode_char(&s); txt[0] = '\\'; txt[1] = 'u'; txt[6] = '\\'; txt[7] = 'u'; if (uc <= 0xFFFF) { write_hex16(txt + 2, uc); txt[6] = '\0'; } else { uc -= 0x10000; lc = uc & 0x3FF; uc = uc >> 10; uc |= 0xD800; lc |= 0xDC00; write_hex16(txt + 2, uc); write_hex16(txt + 8, lc); txt[12] = '\0'; } appendStringInfoString(out, txt); continue; /* Skip backslash-encoding code below. */ } endchar = '\0'; } } appendStringInfoChar(out, endchar ? '\\' : c); if (endchar != '\0') appendStringInfoChar(out, endchar); } appendStringInfoChar(out, quote); } static bool encode_number(StringInfo out, const char *string) { const char *s = string; const char *start, *end; if (string == NULL) return false; /* Validate number, trimming whitespace. */ skip_whitespace(&s); start = s; if (!validate_number(&s)) return false; end = s; skip_whitespace(&s); if (*s != '\0') return false; /* Append number to out */ appendBinaryStringInfo(out, start, end - start); return true; } typedef struct { StringInfoData str; bool use_orig; bool escape_unicode; bool trim; } json_encode_ctx; static bool json_encode_recurse(JSON * node, json_encode_ctx * ctx); /* * json_encode * Encode a JSON node. * * The JSONOPT_ESCAPE_UNICODE option may only be used * if the strings in the JSON tree are UTF-8-encoded. 
*/ char * json_encode(JSON * node, int options) { json_encode_ctx ctx; initStringInfo(&ctx.str); ctx.use_orig = !!(options & JSONOPT_USE_ORIG); ctx.escape_unicode = !!(options & JSONOPT_ESCAPE_UNICODE); ctx.trim = !(options & JSONOPT_NO_TRIM); if (!json_encode_recurse(node, &ctx)) { pfree(ctx.str.data); return NULL; } return ctx.str.data; } static bool json_encode_recurse(JSON * node, json_encode_ctx * ctx) { #define has_orig(field) \ (use_orig && node->orig.field.start) #define push_orig(field) \ appendBinaryStringInfo(&ctx->str, node->orig.field.start, \ node->orig.field.end - node->orig.field.start) bool use_orig = ctx->use_orig; bool trim = ctx->trim; ctx->trim = false; /* Don't trim internal nodes, just the root * node. */ if (!trim && has_orig(left_space)) push_orig(left_space); if (has_orig(value)) { push_orig(value); } else { const char *txt = NULL; JSON *child; switch (node->type) { case JSON_NULL: txt = "null"; break; case JSON_BOOL: if (node->v.v_bool) txt = "true"; else txt = "false"; break; case JSON_STRING: encode_string(&ctx->str, node->v.string.str, node->v.string.length, '"', ctx->escape_unicode); break; case JSON_NUMBER: if (!encode_number(&ctx->str, node->v.number)) return false; break; case JSON_ARRAY: appendStringInfoChar(&ctx->str, '['); json_foreach(child, node) { json_encode_recurse(child, ctx); if (child->next != NULL) appendStringInfoChar(&ctx->str, ','); } appendStringInfoChar(&ctx->str, ']'); break; case JSON_OBJECT: appendStringInfoChar(&ctx->str, '{'); json_foreach(child, node) { /* * Shadows the parent node (assigned to the variable * @node) so we can use our macros on the child node * instead. Hurray for lexical scoping! 
*/ JSON *node = child; if (has_orig(key_left_space)) push_orig(key_left_space); if (has_orig(key)) push_orig(key); else encode_string(&ctx->str, node->key, node->key_length, '"', ctx->escape_unicode); if (has_orig(key_right_space)) push_orig(key_right_space); appendStringInfoChar(&ctx->str, ':'); json_encode_recurse(node, ctx); if (node->next != NULL) appendStringInfoChar(&ctx->str, ','); } appendStringInfoChar(&ctx->str, '}'); break; default: return false; } if (txt != NULL) appendStringInfoString(&ctx->str, txt); } if (!trim && has_orig(right_space)) push_orig(right_space); return true; #undef has_orig #undef push_orig } /* * json_encode_string * If you're interested in encoding JSON in general, see json_encode . * * Encodes a string literal JSON-style using the given quote character. * Note that using anything but '"' as the quote character will result * in invalid JSON. * * If escape_unicode is true, str must be valid UTF-8. * In any case, str may contain null characters (hence the length argument). * * quote must not be a backslash. */ char * json_encode_string(const char *str, size_t length, char quote, bool escape_unicode) { StringInfoData ret; initStringInfo(&ret); encode_string(&ret, str, length, quote, escape_unicode); return ret.data; }