summary | refs | log | tree | commit | diff | stats
path: root/src/xplist.c
diff options
context:
space:
mode:
author: Nikias Bassen, 2016-12-14 03:15:42 +0100
committer: Nikias Bassen, 2016-12-14 03:15:42 +0100
commit: b2b56801b1342837d6d321a56e1f7d9b97015352 (patch)
tree: 259d10803d077c2f4fb65606309a199bcdfb0897 /src/xplist.c
parent: d53d0412e014872b71dd9c91727234de4f08fb2f (diff)
download: libplist-b2b56801b1342837d6d321a56e1f7d9b97015352.tar.gz
download: libplist-b2b56801b1342837d6d321a56e1f7d9b97015352.tar.bz2
xplist: Improve text content parsing, reducing memory usage and unneeded copying
Diffstat (limited to 'src/xplist.c')
-rw-r--r-- src/xplist.c 431
1 files changed, 318 insertions, 113 deletions
diff --git a/src/xplist.c b/src/xplist.c
index 43b0422..9dd43fe 100644
--- a/src/xplist.c
+++ b/src/xplist.c
@@ -460,60 +460,153 @@ static void find_next(parse_ctx ctx, const char *nextchars, int skip_quotes)
}
}
-static char* get_text_content(parse_ctx ctx, const char* tag, int skip_ws, int unescape_entities)
+typedef struct {
+ const char *begin;
+ size_t length;
+ int is_cdata;
+ void *next;
+} text_part_t;
+
+static text_part_t* text_part_init(text_part_t* part, const char *begin, size_t length, int is_cdata)
+{
+ part->begin = begin;
+ part->length = length;
+ part->is_cdata = is_cdata;
+ part->next = NULL;
+ return part;
+}
+
+static void text_parts_free(text_part_t *tp)
+{
+ while (tp) {
+ text_part_t *tmp = tp;
+ tp = tp->next;
+ free(tmp);
+ }
+}
+
+static text_part_t* text_part_append(text_part_t* parts, const char *begin, size_t length, int is_cdata)
+{
+ text_part_t* newpart = malloc(sizeof(text_part_t));
+ assert(newpart);
+ parts->next = text_part_init(newpart, begin, length, is_cdata);
+ return newpart;
+}
+
+static text_part_t* get_text_parts(parse_ctx ctx, const char* tag, int skip_ws, text_part_t *parts)
{
const char *p = NULL;
const char *q = NULL;
int taglen = 0;
- char *str = NULL;
- int i = 0;
+ text_part_t *last = NULL;
if (skip_ws) {
parse_skip_ws(ctx);
}
- p = ctx->pos;
- if (strncmp(ctx->pos, "<![CDATA[", 9) == 0) {
- ctx->pos+=9;
+ do {
p = ctx->pos;
- find_str(ctx, "]]>", 0);
- if (ctx->pos >= ctx->end || strncmp(ctx->pos, "]]>", 3) != 0) {
- PLIST_XML_ERR("EOF while looking for end of CDATA block\n");
+ find_char(ctx, '<', 0);
+ if (*ctx->pos != '<') {
+ PLIST_XML_ERR("EOF while looking for closing tag\n");
+ ctx->err++;
return NULL;
}
q = ctx->pos;
- ctx->pos+=3;
- unescape_entities = 0;
- }
- find_char(ctx, '<', 0);
- if (*ctx->pos != '<') {
- PLIST_XML_ERR("EOF while looking for closing tag\n");
- return NULL;
- }
- if (!q) {
- q = ctx->pos;
- }
- ctx->pos++;
- if (ctx->pos >= ctx->end || *ctx->pos != '/') { PLIST_XML_ERR("EOF or empty tag while parsing '%s'\n",p); return NULL; }
+ ctx->pos++;
+ if (ctx->pos >= ctx->end) {
+ PLIST_XML_ERR("EOF while parsing '%s'\n", p);
+ ctx->err++;
+ return NULL;
+ }
+ if (*ctx->pos == '!') {
+ ctx->pos++;
+ if (*ctx->pos == '-' && *(ctx->pos+1) == '-') {
+ if (last) {
+ last = text_part_append(last, p, q-p, 0);
+ } else if (parts) {
+ last = text_part_init(parts, p, q-p, 0);
+ }
+ ctx->pos += 2;
+ find_str(ctx, "-->", 0);
+ if (ctx->pos >= ctx->end || strncmp(ctx->pos, "-->", 3) != 0) {
+ PLIST_XML_ERR("EOF while looking for end of comment\n");
+ ctx->err++;
+ return NULL;
+ }
+ ctx->pos += 3;
+ } else if (*ctx->pos == '[') {
+ ctx->pos++;
+ if (ctx->pos >= ctx->end - 8) {
+ PLIST_XML_ERR("EOF while parsing <[ tag\n");
+ ctx->err++;
+ return NULL;
+ }
+ if (strncmp(ctx->pos, "CDATA[", 6) == 0) {
+ if (q-p > 0) {
+ if (last) {
+ last = text_part_append(last, p, q-p, 0);
+ } else if (parts) {
+ last = text_part_init(parts, p, q-p, 0);
+ }
+ }
+ ctx->pos+=6;
+ p = ctx->pos;
+ find_str(ctx, "]]>", 0);
+ if (ctx->pos >= ctx->end || strncmp(ctx->pos, "]]>", 3) != 0) {
+ PLIST_XML_ERR("EOF while looking for end of CDATA block\n");
+ ctx->err++;
+ return NULL;
+ }
+ q = ctx->pos;
+ if (last) {
+ last = text_part_append(last, p, q-p, 1);
+ } else if (parts) {
+ last = text_part_init(parts, p, q-p, 1);
+ }
+ ctx->pos += 3;
+ } else {
+ PLIST_XML_ERR("Invalid special tag <[%.6s encountered\n", ctx->pos);
+ ctx->err++;
+ return NULL;
+ }
+ }
+ } else if (*ctx->pos == '/') {
+ break;
+ } else {
+ PLIST_XML_ERR("Invalid tag %.10s inside %s tag\n", ctx->pos, tag);
+ ctx->err++;
+ return NULL;
+ }
+ } while (1);
ctx->pos++;
taglen = strlen(tag);
- if (ctx->pos >= ctx->end-taglen || strncmp(ctx->pos, tag, taglen)) { PLIST_XML_ERR("EOF or end tag mismatch\n"); return NULL;}
+ if (ctx->pos >= ctx->end-taglen || strncmp(ctx->pos, tag, taglen)) {
+ PLIST_XML_ERR("EOF or end tag mismatch\n");
+ ctx->err++;
+ return NULL;
+ }
ctx->pos+=taglen;
- if (ctx->pos >= ctx->end || *ctx->pos != '>') { PLIST_XML_ERR("EOF or no '>' after tag name\n"); return NULL;}
- ctx->pos++;
- int len = q - p;
- if (len < 0) {
- PLIST_XML_ERR("Couldn't find matching '%s' end tag\n", tag);
+ if (ctx->pos >= ctx->end || *ctx->pos != '>') {
+ PLIST_XML_ERR("EOF or no '>' after tag name\n");
+ ctx->err++;
return NULL;
}
- str = malloc(len+1);
- strncpy(str, p, len);
- str[len] = 0;
+ ctx->pos++;
- if (!unescape_entities) {
- return str;
+ if (q-p > 0) {
+ if (last) {
+ last = text_part_append(last, p, q-p, 0);
+ } else if (parts) {
+ last = text_part_init(parts, p, q-p, 0);
+ }
}
+ return parts;
+}
- /* unescape entities */
+static void unescape_entities(char *str, size_t *length)
+{
+ size_t i = 0;
+ size_t len = *length;
while (i < len-1) {
if (str[i] == '&') {
char *entp = str + i + 1;
@@ -541,24 +634,24 @@ static char* get_text_content(parse_ctx ctx, const char* tag, int skip_ws, int u
char* ep = NULL;
if (entlen > 8) {
PLIST_XML_ERR("Invalid numerical character reference encountered, sequence too long: &%.*s;\n", entlen, entp);
- return NULL;
+ return;
}
if (*(entp+1) == 'x' || *(entp+1) == 'X') {
if (entlen < 3) {
PLIST_XML_ERR("Invalid numerical character reference encountered, sequence too short: &%.*s;\n", entlen, entp);
- return NULL;
+ return;
}
val = strtoull(entp+2, &ep, 16);
} else {
if (entlen < 2) {
PLIST_XML_ERR("Invalid numerical character reference encountered, sequence too short: &%.*s;\n", entlen, entp);
- return NULL;
+ return;
}
val = strtoull(entp+1, &ep, 10);
}
if (val == 0 || val > 0x10FFFF || ep-entp != entlen) {
PLIST_XML_ERR("Invalid numerical character reference found: &%.*s;\n", entlen, entp);
- return NULL;
+ return;
}
/* convert to UTF8 */
if (val >= 0x10000) {
@@ -585,30 +678,63 @@ static char* get_text_content(parse_ctx ctx, const char* tag, int skip_ws, int u
}
} else {
PLIST_XML_ERR("Invalid entity encountered: &%.*s;\n", entlen, entp);
- return NULL;
+ return;
}
memmove(entp, str+i+1, len - i);
i -= entlen;
+ len -= (entlen+1);
continue;
}
}
i++;
}
-
- return str;
+ *length = len;
}
-static void skip_text_content(parse_ctx ctx, const char* tag)
+static char* text_parts_get_content(text_part_t *tp, int unesc_entities, size_t *length, int *requires_free)
{
- int taglen;
- find_char(ctx, '<', 1);
- if (*ctx->pos != '<') return;
- ctx->pos++;
- taglen = strlen(tag);
- if (ctx->pos >= ctx->end-taglen || strncmp(ctx->pos, tag, taglen)) return;
- ctx->pos+=taglen;
- if (ctx->pos >= ctx->end || *ctx->pos != '>') return;
- ctx->pos++;
+ char *str = NULL;
+ size_t total_length = 0;
+
+ if (!tp) {
+ return NULL;
+ }
+ char *p;
+ if (requires_free && !tp->next) {
+ if (tp->is_cdata || !unesc_entities) {
+ *requires_free = 0;
+ if (length) {
+ *length = tp->length;
+ }
+ return (char*)tp->begin;
+ }
+ }
+ text_part_t *tmp = tp;
+ while (tp && tp->begin) {
+ total_length += tp->length;
+ tp = tp->next;
+ }
+ str = malloc(total_length + 1);
+ assert(str);
+ p = str;
+ tp = tmp;
+ while (tp && tp->begin) {
+ size_t len = tp->length;
+ strncpy(p, tp->begin, len);
+ if (!tp->is_cdata && unesc_entities) {
+ unescape_entities(p, &len);
+ }
+ p += len;
+ tp = tp->next;
+ }
+ *p = '\0';
+ if (length) {
+ *length = p - str;
+ }
+ if (requires_free) {
+ *requires_free = 1;
+ }
+ return str;
}
static void node_from_xml(parse_ctx ctx, plist_t *plist)
@@ -749,80 +875,120 @@ static void node_from_xml(parse_ctx ctx, plist_t *plist)
data->type = PLIST_ARRAY;
} else if (!strcmp(tag, XPLIST_INT)) {
if (!is_empty) {
- char *str_content = get_text_content(ctx, tag, 1, 0);
- if (!str_content) {
- PLIST_XML_ERR("Couldn't find end tag for '%s'\n", tag);
- ctx->pos = ctx->end;
- ctx->err++;
+ text_part_t first_part = { NULL, 0, 0, NULL };
+ text_part_t *tp = get_text_parts(ctx, tag, 1, &first_part);
+ if (!tp) {
+ PLIST_XML_ERR("Could not parse text content for '%s' node\n", tag);
+ text_parts_free(first_part.next);
free(tag);
free(keyname);
- return;
+ break;
}
- char *str = str_content;
- int is_negative = 0;
- if ((str[0] == '-') || (str[0] == '+')) {
- if (str[0] == '-') {
- is_negative = 1;
+ if (tp->begin) {
+ int requires_free = 0;
+ char *str_content = text_parts_get_content(tp, 0, NULL, &requires_free);
+ if (!str_content) {
+ PLIST_XML_ERR("Could not get text content for '%s' node\n", tag);
+ text_parts_free(first_part.next);
+ ctx->err++;
+ free(tag);
+ free(keyname);
+ break;
}
- str++;
- }
- char* endp = NULL;
- data->intval = strtoull((char*)str, &endp, 0);
- if ((endp != NULL) && (strlen(endp) > 0)) {
- PLIST_XML_ERR("integer parse error: string contains invalid characters: '%s'\n", endp);
- }
- if (is_negative || (data->intval <= INT64_MAX)) {
- int64_t v = data->intval;
- if (is_negative) {
- v = -v;
+ char *str = str_content;
+ int is_negative = 0;
+ if ((str[0] == '-') || (str[0] == '+')) {
+ if (str[0] == '-') {
+ is_negative = 1;
+ }
+ str++;
+ }
+ data->intval = strtoull((char*)str, NULL, 0);
+ if (is_negative || (data->intval <= INT64_MAX)) {
+ int64_t v = data->intval;
+ if (is_negative) {
+ v = -v;
+ }
+ data->intval = (uint64_t)v;
+ data->length = 8;
+ } else {
+ data->length = 16;
+ }
+ if (requires_free) {
+ free(str_content);
}
- data->intval = (uint64_t)v;
- data->length = 8;
} else {
- data->length = 16;
+ is_empty = 1;
}
- free(str_content);
- } else {
+ text_parts_free(tp->next);
+ }
+ if (is_empty) {
data->intval = 0;
data->length = 8;
}
data->type = PLIST_UINT;
} else if (!strcmp(tag, XPLIST_REAL)) {
if (!is_empty) {
- char *strval = get_text_content(ctx, tag, 1, 0);
- if (!strval) {
- PLIST_XML_ERR("Couldn't get text content for '%s' node\n", tag);
- ctx->pos = ctx->end;
- ctx->err++;
+ text_part_t first_part = { NULL, 0, 0, NULL };
+ text_part_t *tp = get_text_parts(ctx, tag, 1, &first_part);
+ if (!tp) {
+ PLIST_XML_ERR("Could not parse text content for '%s' node\n", tag);
+ text_parts_free(first_part.next);
free(tag);
free(keyname);
- return;
+ break;
}
- data->realval = atof((char *) strval);
- free(strval);
+ if (tp->begin) {
+ int requires_free = 0;
+ char *str_content = text_parts_get_content(tp, 0, NULL, &requires_free);
+ if (!str_content) {
+ PLIST_XML_ERR("Could not get text content for '%s' node\n", tag);
+ text_parts_free(first_part.next);
+ ctx->err++;
+ free(tag);
+ free(keyname);
+ break;
+ }
+ data->realval = atof(str_content);
+ if (requires_free) {
+ free(str_content);
+ }
+ }
+ text_parts_free(tp->next);
}
data->type = PLIST_REAL;
data->length = 8;
} else if (!strcmp(tag, XPLIST_TRUE)) {
if (!is_empty) {
- skip_text_content(ctx, tag);
+ get_text_parts(ctx, tag, 1, NULL);
}
data->type = PLIST_BOOLEAN;
data->boolval = 1;
data->length = 1;
} else if (!strcmp(tag, XPLIST_FALSE)) {
if (!is_empty) {
- skip_text_content(ctx, tag);
+ get_text_parts(ctx, tag, 1, NULL);
}
data->type = PLIST_BOOLEAN;
data->boolval = 0;
data->length = 1;
} else if (!strcmp(tag, XPLIST_STRING) || !strcmp(tag, XPLIST_KEY)) {
if (!is_empty) {
- char *str = get_text_content(ctx, tag, 0, 1);
+ text_part_t first_part = { NULL, 0, 0, NULL };
+ text_part_t *tp = get_text_parts(ctx, tag, 0, &first_part);
+ char *str = NULL;
+ size_t length = 0;
+ if (!tp) {
+ PLIST_XML_ERR("Could not parse text content for '%s' node\n", tag);
+ text_parts_free(first_part.next);
+ free(tag);
+ free(keyname);
+ break;
+ }
+ str = text_parts_get_content(tp, 1, &length, NULL);
+ text_parts_free(first_part.next);
if (!str) {
- PLIST_XML_ERR("Couldn't get text content for '%s' node\n", tag);
- ctx->pos = ctx->end;
+ PLIST_XML_ERR("Could not get text content for '%s' node\n", tag);
ctx->err++;
free(tag);
free(keyname);
@@ -836,7 +1002,7 @@ static void node_from_xml(parse_ctx ctx, plist_t *plist)
continue;
} else {
data->strval = str;
- data->length = strlen(str);
+ data->length = length;
}
} else {
data->strval = strdup("");
@@ -845,40 +1011,79 @@ static void node_from_xml(parse_ctx ctx, plist_t *plist)
data->type = PLIST_STRING;
} else if (!strcmp(tag, XPLIST_DATA)) {
if (!is_empty) {
- char *strval = get_text_content(ctx, tag, 1, 0);
- if (!strval) {
- PLIST_XML_ERR("Couldn't get text content for '%s' node\n", tag);
- ctx->pos = ctx->end;
- ctx->err++;
+ text_part_t first_part = { NULL, 0, 0, NULL };
+ text_part_t *tp = get_text_parts(ctx, tag, 1, &first_part);
+ if (!tp) {
+ PLIST_XML_ERR("Could not parse text content for '%s' node\n", tag);
+ text_parts_free(first_part.next);
free(tag);
free(keyname);
- return;
+ break;
}
- size_t size = 0;
- data->buff = base64decode((char*)strval, &size);
- free(strval);
- data->length = size;
+ if (tp->begin) {
+ int requires_free = 0;
+ char *str_content = text_parts_get_content(tp, 0, NULL, &requires_free);
+ if (!str_content) {
+ PLIST_XML_ERR("Could not get text content for '%s' node\n", tag);
+ text_parts_free(first_part.next);
+ ctx->err++;
+ free(tag);
+ free(keyname);
+ break;
+ }
+ size_t size = tp->length;
+ data->buff = base64decode(str_content, &size);
+ data->length = size;
+
+ if (requires_free) {
+ free(str_content);
+ }
+ }
+ text_parts_free(tp->next);
}
data->type = PLIST_DATA;
} else if (!strcmp(tag, XPLIST_DATE)) {
if (!is_empty) {
- char *strval = get_text_content(ctx, tag, 1, 0);
- if (!strval) {
- PLIST_XML_ERR("Couldn't get text content for '%s' node\n", tag);
- ctx->pos = ctx->end;
- ctx->err++;
+ text_part_t first_part = { NULL, 0, 0, NULL };
+ text_part_t *tp = get_text_parts(ctx, tag, 1, &first_part);
+ if (!tp) {
+ PLIST_XML_ERR("Could not parse text content for '%s' node\n", tag);
+ text_parts_free(first_part.next);
free(tag);
free(keyname);
- return;
+ break;
}
Time64_T timev = 0;
- if (strlen((const char*)strval) >= 11) {
- struct TM btime;
- parse_date((const char*)strval, &btime);
- timev = timegm64(&btime);
+ if (tp->begin) {
+ int requires_free = 0;
+ size_t length = 0;
+ char *str_content = text_parts_get_content(tp, 0, &length, &requires_free);
+ if (!str_content) {
+ PLIST_XML_ERR("Could not get text content for '%s' node\n", tag);
+ text_parts_free(first_part.next);
+ ctx->err++;
+ free(tag);
+ free(keyname);
+ break;
+ }
+
+ if ((length >= 11) && (length < 32)) {
+ /* we need to copy here and 0-terminate because sscanf will read the entire string (whole rest of XML data) which can be huge */
+ char strval[32];
+ struct TM btime;
+ strncpy(strval, str_content, length);
+ strval[tp->length] = '\0';
+ parse_date(strval, &btime);
+ timev = timegm64(&btime);
+ } else {
+ PLIST_XML_ERR("Invalid text content in date node\n");
+ }
+ if (requires_free) {
+ free(str_content);
+ }
}
+ text_parts_free(tp->next);
data->realval = (double)(timev - MAC_EPOCH);
- free(strval);
}
data->length = sizeof(double);
data->type = PLIST_DATE;