From b2b56801b1342837d6d321a56e1f7d9b97015352 Mon Sep 17 00:00:00 2001
From: Nikias Bassen
Date: Wed, 14 Dec 2016 03:15:42 +0100
Subject: xplist: Improve text content parsing, reducing memory usage and
 unneeded copying

---
 src/xplist.c | 431 +++++++++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 318 insertions(+), 113 deletions(-)

diff --git a/src/xplist.c b/src/xplist.c
index 43b0422..9dd43fe 100644
--- a/src/xplist.c
+++ b/src/xplist.c
@@ -460,60 +460,153 @@ static void find_next(parse_ctx ctx, const char *nextchars, int skip_quotes)
     }
 }
 
-static char* get_text_content(parse_ctx ctx, const char* tag, int skip_ws, int unescape_entities)
+typedef struct text_part {
+    const char *begin; /* first byte of the span inside the parsed buffer; length-delimited, not NUL-terminated */
+    size_t length; /* number of bytes in the span */
+    int is_cdata; /* non-zero when the span was taken from a CDATA section */
+    struct text_part *next; /* following part of the same node's text, or NULL for the last part */
+} text_part_t;
+
+static text_part_t* text_part_init(text_part_t* part, const char *begin, size_t length, int is_cdata)
+{
+    /* Overwrites every field of the caller-provided part; the result is never linked to a successor. */
+    const text_part_t fresh = { begin, length, is_cdata, NULL };
+
+    *part = fresh;
+    return part;
+}
+
+static void text_parts_free(text_part_t *tp)
+{
+    /* Frees every node of the chain starting at tp; a NULL tp (empty chain) is a no-op. */
+    for (text_part_t *following; tp != NULL; tp = following) {
+        following = tp->next;
+        free(tp);
+    }
+}
+
+static text_part_t* text_part_append(text_part_t* parts, const char *begin, size_t length, int is_cdata)
+{
+    text_part_t* node = malloc(sizeof(text_part_t)); /* heap node that becomes the new tail behind parts */
+    assert(node); /* aborts on allocation failure (compiled out under NDEBUG) */
+    parts->next = node;
+    return text_part_init(node, begin, length, is_cdata);
+}
+/* Records the node's text as a chain of plain/CDATA parts (comments dropped), consumes "</tag>"; returns parts, or NULL on error (ctx->err++) or when parts is NULL. */
+static text_part_t* get_text_parts(parse_ctx ctx, const char* tag, int skip_ws, text_part_t *parts)
 {
     const char *p = NULL;
     const char *q = NULL;
     int taglen = 0;
-    char *str = NULL;
-    int i = 0;
+    text_part_t *last = NULL; /* tail of the chain; stays NULL until the first part is recorded */
 
     if (skip_ws) {
         parse_skip_ws(ctx);
     }
-    p = ctx->pos;
-    if (strncmp(ctx->pos, "<![CDATA[", 9) == 0) {
-        ctx->pos+=9;
+    do { /* one iteration per '<' encountered inside the content */
         p = ctx->pos;
-        find_str(ctx, "]]>", 0);
-        if (ctx->pos >= ctx->end || strncmp(ctx->pos, "]]>", 3) != 0) {
-            PLIST_XML_ERR("EOF while looking for end of CDATA block\n");
+        find_char(ctx, '<', 0);
+        if (ctx->pos >= ctx->end || *ctx->pos != '<') {
+            PLIST_XML_ERR("EOF while looking for closing tag\n");
+            ctx->err++;
             return NULL;
         }
         q = ctx->pos;
-        ctx->pos+=3;
-        unescape_entities = 0;
-    }
-    find_char(ctx, '<', 0);
-    if (*ctx->pos != '<') {
-        PLIST_XML_ERR("EOF while looking for closing tag\n");
-        return NULL;
-    }
-    if (!q) {
-        q = ctx->pos;
-    }
-    ctx->pos++;
-    if (ctx->pos >= ctx->end || *ctx->pos != '/') { PLIST_XML_ERR("EOF or empty tag while parsing '%s'\n",p); return NULL; }
+        ctx->pos++;
+        if (ctx->pos >= ctx->end) {
+            PLIST_XML_ERR("EOF while parsing '%s'\n", p);
+            ctx->err++;
+            return NULL;
+        }
+        if (*ctx->pos == '!') {
+            ctx->pos++;
+            if (ctx->pos < ctx->end-1 && *ctx->pos == '-' && *(ctx->pos+1) == '-') { /* bounds-checked: two bytes are inspected */
+                if (q-p > 0 && last) { /* skip empty spans, matching the CDATA and trailing-text cases */
+                    last = text_part_append(last, p, q-p, 0);
+                } else if (q-p > 0 && parts) {
+                    last = text_part_init(parts, p, q-p, 0);
+                }
+                ctx->pos += 2;
+                find_str(ctx, "-->", 0);
+                if (ctx->pos >= ctx->end || strncmp(ctx->pos, "-->", 3) != 0) {
+                    PLIST_XML_ERR("EOF while looking for end of comment\n");
+                    ctx->err++;
+                    return NULL;
+                }
+                ctx->pos += 3;
+            } else if (ctx->pos < ctx->end && *ctx->pos == '[') {
+                ctx->pos++;
+                if (ctx->pos >= ctx->end - 8) {
+                    PLIST_XML_ERR("EOF while parsing <[ tag\n");
+                    ctx->err++;
+                    return NULL;
+                }
+                if (strncmp(ctx->pos, "CDATA[", 6) == 0) {
+                    if (q-p > 0) {
+                        if (last) {
+                            last = text_part_append(last, p, q-p, 0);
+                        } else if (parts) {
+                            last = text_part_init(parts, p, q-p, 0);
+                        }
+                    }
+                    ctx->pos+=6;
+                    p = ctx->pos;
+                    find_str(ctx, "]]>", 0);
+                    if (ctx->pos >= ctx->end || strncmp(ctx->pos, "]]>", 3) != 0) {
+                        PLIST_XML_ERR("EOF while looking for end of CDATA block\n");
+                        ctx->err++;
+                        return NULL;
+                    }
+                    q = ctx->pos;
+                    if (last) {
+                        last = text_part_append(last, p, q-p, 1);
+                    } else if (parts) {
+                        last = text_part_init(parts, p, q-p, 1);
+                    }
+                    ctx->pos += 3;
+                } else {
+                    PLIST_XML_ERR("Invalid special tag <[%.6s encountered\n", ctx->pos);
+                    ctx->err++;
+                    return NULL;
+                }
+            }
+        } else if (*ctx->pos == '/') { /* start of the closing tag: content scan is complete */
+            break;
+        } else {
+            PLIST_XML_ERR("Invalid tag %.10s inside %s tag\n", ctx->pos, tag);
+            ctx->err++;
+            return NULL;
+        }
+    } while (1);
     ctx->pos++;
     taglen = strlen(tag);
-    if (ctx->pos >= ctx->end-taglen || strncmp(ctx->pos, tag, taglen)) { PLIST_XML_ERR("EOF or end tag mismatch\n"); return NULL;}
+    if (ctx->pos >= ctx->end-taglen || strncmp(ctx->pos, tag, taglen)) {
+        PLIST_XML_ERR("EOF or end tag mismatch\n");
+        ctx->err++;
+        return NULL;
+    }
     ctx->pos+=taglen;
-    if (ctx->pos >= ctx->end || *ctx->pos != '>') { PLIST_XML_ERR("EOF or no '>' after tag name\n"); return NULL;}
-    ctx->pos++;
-    int len = q - p;
-    if (len < 0) {
-        PLIST_XML_ERR("Couldn't find matching '%s' end tag\n", tag);
+    if (ctx->pos >= ctx->end || *ctx->pos != '>') {
+        PLIST_XML_ERR("EOF or no '>' after tag name\n");
+        ctx->err++;
         return NULL;
     }
-    str = malloc(len+1);
-    strncpy(str, p, len);
-    str[len] = 0;
+    ctx->pos++;
 
-    if (!unescape_entities) {
-        return str;
+    if (q-p > 0) { /* text between the last markup and the closing tag */
+        if (last) {
+            last = text_part_append(last, p, q-p, 0);
+        } else if (parts) {
+            last = text_part_init(parts, p, q-p, 0);
+        }
     }
+    return parts;
+}
 
-    /* unescape entities */
+static void unescape_entities(char *str, size_t *length)
+{
+    size_t i = 0;
+    size_t len = *length;
     while (i < len-1) {
         if (str[i] == '&') {
             char *entp = str + i + 1;
@@ -541,24 +634,24 @@ static char* get_text_content(parse_ctx ctx, const char* tag, int skip_ws, int u
                     char* ep = NULL;
                     if (entlen > 8) {
                         PLIST_XML_ERR("Invalid numerical character reference encountered, sequence too long: &%.*s;\n", entlen, entp);
-                        return NULL;
+                        return;
                     }
                     if (*(entp+1) == 'x' || *(entp+1) == 'X') {
                         if (entlen < 3) {
                             PLIST_XML_ERR("Invalid numerical character reference encountered, sequence too short: &%.*s;\n", entlen, entp);
-                            return NULL;
+                            return;
                         }
                         val = strtoull(entp+2, &ep, 16);
                     } else {
                         if (entlen < 2) {
                             PLIST_XML_ERR("Invalid numerical character reference encountered, sequence too short: &%.*s;\n", entlen, entp);
-                            return NULL;
+                            return;
                         }
                         val = strtoull(entp+1, &ep, 10);
                     }
                     if (val == 0 || val > 0x10FFFF || ep-entp != entlen) {
                         PLIST_XML_ERR("Invalid numerical character reference found: &%.*s;\n", entlen, entp);
-                        return NULL;
+                        return;
                     }
                     /* convert to UTF8 */
                     if (val >= 0x10000) {
@@ -585,30 +678,63 @@ static char* get_text_content(parse_ctx ctx, const char* tag, int skip_ws, int u
                     }
                 } else {
                     PLIST_XML_ERR("Invalid entity encountered: &%.*s;\n", entlen, entp);
-                    return NULL;
+                    return;
                 }
                 memmove(entp, str+i+1, len - i);
                 i -= entlen;
+                len -= (entlen+1);
                 continue;
             }
         }
         i++;
     }
-
-    return str;
+    *length = len;
 }
 
-static void skip_text_content(parse_ctx ctx, const char* tag)
+static char* text_parts_get_content(text_part_t *tp, int unesc_entities, size_t *length, int *requires_free)
 {
-    int taglen;
-    find_char(ctx, '<', 1);
-    if (*ctx->pos != '<') return;
-    ctx->pos++;
-    taglen = strlen(tag);
-    if (ctx->pos >= ctx->end-taglen || strncmp(ctx->pos, tag, taglen)) return;
-    ctx->pos+=taglen;
-    if (ctx->pos >= ctx->end || *ctx->pos != '>') return;
-    ctx->pos++;
+    char *str = NULL;
+    size_t total_length = 0;
+    /* Returns the text of the chain tp: either tp->begin itself (single part, no copy) or a malloc'd NUL-terminated concatenation; NULL only when tp is NULL. */
+    if (!tp) {
+        return NULL;
+    }
+    char *p;
+    if (requires_free && !tp->next) { /* no-copy path is only offered to callers that pass requires_free */
+        if (tp->is_cdata || !unesc_entities) {
+            *requires_free = 0; /* result aliases tp->begin and is NOT NUL-terminated; use *length */
+            if (length) {
+                *length = tp->length;
+            }
+            return (char*)tp->begin;
+        }
+    }
+    text_part_t *tmp = tp; /* remember the head for the second pass */
+    while (tp && tp->begin) { /* pass 1: sum the raw lengths to size the buffer */
+        total_length += tp->length;
+        tp = tp->next;
+    }
+    str = malloc(total_length + 1);
+    assert(str);
+    p = str;
+    tp = tmp;
+    while (tp && tp->begin) { /* pass 2: copy each part, unescaping non-CDATA parts when requested */
+        size_t len = tp->length;
+        strncpy(p, tp->begin, len);
+        if (!tp->is_cdata && unesc_entities) {
+            unescape_entities(p, &len); /* may shrink len in place */
+        }
+        p += len;
+        tp = tp->next;
+    }
+    *p = '\0';
+    if (length) {
+        *length = p - str; /* length after unescaping, excluding the terminator */
+    }
+    if (requires_free) {
+        *requires_free = 1; /* caller owns the returned buffer */
+    }
+    return str;
 }
 
 static void node_from_xml(parse_ctx ctx, plist_t *plist)
@@ -749,80 +875,120 @@ static void node_from_xml(parse_ctx ctx, plist_t *plist)
                 data->type = PLIST_ARRAY;
             } else if (!strcmp(tag, XPLIST_INT)) {
                 if (!is_empty) {
-                    char *str_content = get_text_content(ctx, tag, 1, 0);
-                    if (!str_content) {
-                        PLIST_XML_ERR("Couldn't find end tag for '%s'\n", tag);
-                        ctx->pos = ctx->end;
-                        ctx->err++;
+                    text_part_t first_part = { NULL, 0, 0, NULL };
+                    text_part_t *tp = get_text_parts(ctx, tag, 1, &first_part);
+                    if (!tp) {
+                        PLIST_XML_ERR("Could not parse text content for '%s' node\n", tag);
+                        text_parts_free(first_part.next);
                         free(tag);
                         free(keyname);
-                        return;
+                        break;
                     }
-                    char *str = str_content;
-                    int is_negative = 0;
-                    if ((str[0] == '-') || (str[0] == '+')) {
-                        if (str[0] == '-') {
-                            is_negative = 1;
+                    if (tp->begin) {
+                        int requires_free = 0;
+                        char *str_content = text_parts_get_content(tp, 0, NULL, &requires_free);
+                        if (!str_content) {
+                            PLIST_XML_ERR("Could not get text content for '%s' node\n", tag);
+                            text_parts_free(first_part.next);
+                            ctx->err++;
+                            free(tag);
+                            free(keyname);
+                            break;
                         }
-                        str++;
-                    }
-                    char* endp = NULL;
-                    data->intval = strtoull((char*)str, &endp, 0);
-                    if ((endp != NULL) && (strlen(endp) > 0)) {
-                        PLIST_XML_ERR("integer parse error: string contains invalid characters: '%s'\n", endp);
-                    }
-                    if (is_negative || (data->intval <= INT64_MAX)) {
-                        int64_t v = data->intval;
-                        if (is_negative) {
-                            v = -v;
+                        char *str = str_content;
+                        int is_negative = 0;
+                        if ((str[0] == '-') || (str[0] == '+')) {
+                            if (str[0] == '-') {
+                                is_negative = 1;
+                            }
+                            str++;
+                        }
+                        data->intval = strtoull((char*)str, NULL, 0);
+                        if (is_negative || (data->intval <= INT64_MAX)) {
+                            int64_t v = data->intval;
+                            if (is_negative) {
+                                v = -v;
+                            }
+                            data->intval = (uint64_t)v;
+                            data->length = 8;
+                        } else {
+                            data->length = 16;
+                        }
+                        if (requires_free) {
+                            free(str_content);
                         }
-                        data->intval = (uint64_t)v;
-                        data->length = 8;
                     } else {
-                        data->length = 16;
+                        is_empty = 1;
                     }
-                    free(str_content);
-                } else {
+                    text_parts_free(tp->next);
+                }
+                if (is_empty) {
                     data->intval = 0;
                     data->length = 8;
                 }
                 data->type = PLIST_UINT;
             } else if (!strcmp(tag, XPLIST_REAL)) {
                 if (!is_empty) {
-                    char *strval = get_text_content(ctx, tag, 1, 0);
-                    if (!strval) {
-                        PLIST_XML_ERR("Couldn't get text content for '%s' node\n", tag);
-                        ctx->pos = ctx->end;
-                        ctx->err++;
+                    text_part_t first_part = { NULL, 0, 0, NULL };
+                    text_part_t *tp = get_text_parts(ctx, tag, 1, &first_part);
+                    if (!tp) {
+                        PLIST_XML_ERR("Could not parse text content for '%s' node\n", tag);
+                        text_parts_free(first_part.next);
                         free(tag);
                         free(keyname);
-                        return;
+                        break;
                     }
-                    data->realval = atof((char *) strval);
-                    free(strval);
+                    if (tp->begin) {
+                        int requires_free = 0;
+                        char *str_content = text_parts_get_content(tp, 0, NULL, &requires_free);
+                        if (!str_content) {
+                            PLIST_XML_ERR("Could not get text content for '%s' node\n", tag);
+                            text_parts_free(first_part.next);
+                            ctx->err++;
+                            free(tag);
+                            free(keyname);
+                            break;
+                        }
+                        data->realval = atof(str_content);
+                        if (requires_free) {
+                            free(str_content);
+                        }
+                    }
+                    text_parts_free(tp->next);
                 }
                 data->type = PLIST_REAL;
                 data->length = 8;
             } else if (!strcmp(tag, XPLIST_TRUE)) {
                 if (!is_empty) {
-                    skip_text_content(ctx, tag);
+                    get_text_parts(ctx, tag, 1, NULL);
                 }
                 data->type = PLIST_BOOLEAN;
                 data->boolval = 1;
                 data->length = 1;
             } else if (!strcmp(tag, XPLIST_FALSE)) {
                 if (!is_empty) {
-                    skip_text_content(ctx, tag);
+                    get_text_parts(ctx, tag, 1, NULL);
                 }
                 data->type = PLIST_BOOLEAN;
                 data->boolval = 0;
                 data->length = 1;
             } else if (!strcmp(tag, XPLIST_STRING) || !strcmp(tag, XPLIST_KEY)) {
                 if (!is_empty) {
-                    char *str = get_text_content(ctx, tag, 0, 1);
+                    text_part_t first_part = { NULL, 0, 0, NULL };
+                    text_part_t *tp = get_text_parts(ctx, tag, 0, &first_part);
+                    char *str = NULL;
+                    size_t length = 0;
+                    if (!tp) {
+                        PLIST_XML_ERR("Could not parse text content for '%s' node\n", tag);
+                        text_parts_free(first_part.next);
+                        free(tag);
+                        free(keyname);
+                        break;
+                    }
+                    str = text_parts_get_content(tp, 1, &length, NULL);
+                    text_parts_free(first_part.next);
                     if (!str) {
-                        PLIST_XML_ERR("Couldn't get text content for '%s' node\n", tag);
-                        ctx->pos = ctx->end;
+                        PLIST_XML_ERR("Could not get text content for '%s' node\n", tag);
                         ctx->err++;
                         free(tag);
                         free(keyname);
@@ -836,7 +1002,7 @@ static void node_from_xml(parse_ctx ctx, plist_t *plist)
                         continue;
                     } else {
                         data->strval = str;
-                        data->length = strlen(str);
+                        data->length = length;
                     }
                 } else {
                     data->strval = strdup("");
@@ -845,40 +1011,79 @@ static void node_from_xml(parse_ctx ctx, plist_t *plist)
                 data->type = PLIST_STRING;
             } else if (!strcmp(tag, XPLIST_DATA)) {
                 if (!is_empty) {
-                    char *strval = get_text_content(ctx, tag, 1, 0);
-                    if (!strval) {
-                        PLIST_XML_ERR("Couldn't get text content for '%s' node\n", tag);
-                        ctx->pos = ctx->end;
-                        ctx->err++;
+                    text_part_t first_part = { NULL, 0, 0, NULL };
+                    text_part_t *tp = get_text_parts(ctx, tag, 1, &first_part);
+                    if (!tp) {
+                        PLIST_XML_ERR("Could not parse text content for '%s' node\n", tag);
+                        text_parts_free(first_part.next);
                         free(tag);
                         free(keyname);
-                        return;
+                        break;
                     }
-                    size_t size = 0;
-                    data->buff = base64decode((char*)strval, &size);
-                    free(strval);
-                    data->length = size;
+                    if (tp->begin) {
+                        int requires_free = 0;
+                        /* size: total encoded length over ALL parts (tp->length is only the first part). NOTE(review): assumes base64decode takes *size as input length and returns the decoded size in it - confirm */
+                        size_t size = 0;
+                        char *str_content = text_parts_get_content(tp, 0, &size, &requires_free);
+                        if (!str_content) {
+                            PLIST_XML_ERR("Could not get text content for '%s' node\n", tag);
+                            text_parts_free(first_part.next);
+                            ctx->err++;
+                            free(tag);
+                            free(keyname);
+                            break;
+                        }
+                        data->buff = base64decode(str_content, &size);
+                        data->length = size;
+                        if (requires_free) {
+                            free(str_content);
+                        }
+                    }
+                    text_parts_free(tp->next);
                 }
                 data->type = PLIST_DATA;
             } else if (!strcmp(tag, XPLIST_DATE)) {
                 if (!is_empty) {
-                    char *strval = get_text_content(ctx, tag, 1, 0);
-                    if (!strval) {
-                        PLIST_XML_ERR("Couldn't get text content for '%s' node\n", tag);
-                        ctx->pos = ctx->end;
-                        ctx->err++;
+                    text_part_t first_part = { NULL, 0, 0, NULL };
+                    text_part_t *tp = get_text_parts(ctx, tag, 1, &first_part);
+                    if (!tp) {
+                        PLIST_XML_ERR("Could not parse text content for '%s' node\n", tag);
+                        text_parts_free(first_part.next);
                         free(tag);
                         free(keyname);
-                        return;
+                        break;
                     }
                     Time64_T timev = 0;
-                    if (strlen((const char*)strval) >= 11) {
-                        struct TM btime;
-                        parse_date((const char*)strval, &btime);
-                        timev = timegm64(&btime);
+                    if (tp->begin) {
+                        int requires_free = 0;
+                        size_t length = 0;
+                        char *str_content = text_parts_get_content(tp, 0, &length, &requires_free);
+                        if (!str_content) {
+                            PLIST_XML_ERR("Could not get text content for '%s' node\n", tag);
+                            text_parts_free(first_part.next);
+                            ctx->err++;
+                            free(tag);
+                            free(keyname);
+                            break;
+                        }
+                        /* length is the total over all parts; tp->length would only cover the first part */
+                        if ((length >= 11) && (length < 32)) {
+                            /* we need to copy here and 0-terminate because sscanf will read the entire string (whole rest of XML data) which can be huge */
+                            char strval[32];
+                            struct TM btime;
+                            memcpy(strval, str_content, length);
+                            strval[length] = '\0';
+                            parse_date(strval, &btime);
+                            timev = timegm64(&btime);
+                        } else {
+                            PLIST_XML_ERR("Invalid text content in date node\n");
+                        }
+                        if (requires_free) {
+                            free(str_content);
+                        }
+                    }
+                    text_parts_free(tp->next);
                     data->realval = (double)(timev - MAC_EPOCH);
-                    free(strval);
                 }
                 data->length = sizeof(double);
                 data->type = PLIST_DATE;
-- 
cgit v1.1-32-gdbae