diff options
Diffstat (limited to 'guththila')
-rw-r--r-- | guththila/src/guththila_xml_parser.c | 215 |
1 files changed, 154 insertions, 61 deletions
diff --git a/guththila/src/guththila_xml_parser.c b/guththila/src/guththila_xml_parser.c index 7c636a5..7d61254 100644 --- a/guththila/src/guththila_xml_parser.c +++ b/guththila/src/guththila_xml_parser.c @@ -401,87 +401,180 @@ guththila_un_init( return GUTHTHILA_SUCCESS; } -/* +static int +guththila_utf8_bytes_len( + unsigned int c) +{ + if (c < 0x80) + return 1; + else if (c < 0x800) + return 2; + else if (c < 0x10000) + return 3; + else if (c < 0x200000) + return 4; + else if (c < 0x4000000) + return 5; + else + return 6; +} + +/* + * Stores the given unicode char as UTF-8 + */ +static guththila_char_t* +guththila_utf8_bytes( + unsigned int c, + guththila_char_t *start) +{ + int first; + guththila_char_t *end, *p; + + if(c < 0x80) + { + *start = c; + return start + 1; + } + if(c < 0x800) + { + first = 0xc0; + end = start + 2; + } + else if(c < 0x10000) + { + first = 0xe0; + end = start + 3; + } + else if(c < 0x200000) + { + first = 0xf0; + end = start + 4; + } + else if(c < 0x4000000) + { + first = 0xf8; + end = start + 5; + } + else + { + first = 0xfc; + end = start + 6; + } + + p = end; + while(--p > start) + { + *p = (c & 0x3f) | 0x80; + c >>= 6; + } + *start = c | first; + return end; +} + + +/* * Replace the references with the corresponding actual values. */ static void guththila_token_evaluate_references( guththila_token_t * tok) { - size_t size = tok->size; guththila_char_t *start = tok->start; - guththila_char_t *pos = NULL; - size_t i, j; + guththila_char_t *end = start + tok->size; + guththila_char_t *p = start; + guththila_char_t *q = NULL; + guththila_char_t *entity = NULL; + int entity_len = 0; - pos = (guththila_char_t *)memchr(start, '&', size); - if(pos) + while(p < end && *p != '&') { - i = pos - start; + p++; } - else - { - i = size; - } - /*for(i = 0; (i < size) && (start[i] != '&'); i++) - ;*/ - if(i < size) + q = p; + while(p < end) { - j = i; - while(i < size) + /* Copy characters until the next ampersand */ + if(*p != '&') { - if(((i + 3) < size) && (start[i + 1] == 'g') && (start[i + 2] == 't') && (start[i + 3] - == ';')) - { - /* replace first char of sequence with > */ - start[j++] = '>'; - /* skip remainder of sequence */ - i += 4; - } - else if(((i + 3) < size) && (start[i + 1] == 'l') && (start[i + 2] == 't') && (start[i - + 3] == ';')) - { - /* replace first char of sequence with < */ - start[j++] = '<'; - /* skip remainder of sequence */ - i += 4; - } - else if(((i + 4) < size) && (start[i + 1] == 'a') && (start[i + 2] == 'm') && (start[i - + 3] == 'p') && (start[i + 4] == ';')) + *q++ = *p++; + continue; + } + entity = ++p; + /* Find the end of the entity, marked by ';' */ + while(p < end && *p != ';') + { + p++; + } + if(p == end) + { + break; /* Drop unterminated entity */ + } + entity_len = p - entity; + if(entity_len == 2 && entity[1] == 't') + { + if(entity[0] == 'g') + *q++ = '>'; + else if(entity[0] == 'l') + *q++ = '<'; + /* else drop */ + } + else if(entity_len == 3 && entity[0] == 'a' && entity[1] == 'm' && entity[2] == 'p') + { + *q++ = '&'; + } + else if(entity_len == 4 && entity[2] == 'o') + { + if(entity[0] == 'q' && entity[1] == 'u' && entity[3] == 't') { - /* replace first char of sequence with & */ - start[j++] = '&'; - /* skip remainder of sequence */ - i += 5; + *q++ = '"'; } - else if(((i + 5) < size) && (start[i + 1] == 'a') && (start[i + 2] == 'p') && (start[i - + 3] == 'o') && (start[i + 4] == 's') && (start[i + 5] == ';')) + else if(entity[0] == 'a' && entity[1] == 'p' && entity[3] == 's') { - /* replace first char of sequence with ' */ - start[j++] = '\''; - /* skip remainder of sequence */ - i += 6; + *q++ = '\''; } - else if(((i + 5) < size) && (start[i + 1] == 'q') && (start[i + 2] == 'u') && (start[i - + 3] == 'o') && (start[i + 4] == 't') && (start[i + 5] == ';')) - { - /* replace first char of sequence with " */ - start[j++] = '\"'; - /* skip remainder of sequence */ - i += 6; + } + else if(entity_len >= 2 && entity[0] == '#') + { + /* p points to the ';' */ + int c = 0; + guththila_char_t b; + guththila_char_t *digit = entity + 1; + if(*digit == 'x') + { /* &#x...; */ + while(++digit < p) + { + b = *digit; + if(b >= '0' && b <= '9') + c = c << 4 | (b - '0'); + else if(b >= 'A' && b <= 'F') + c = c << 4 | (b - 'A' + 10); + else if(b >= 'a' && b <= 'f') + c = c << 4 | (b - 'a' + 10); + else + break; /* stop and drop */ + } } else - { - /* ampersand does not start a sequence; - skip it and continue scanning */ - /* could insert character reference decoding here */ - start[j++] = start[i++]; + { /* &#...; */ + while(digit < p) + { + b = *digit; + if(b >= '0' && b <= '9') + c = c * 10 + (b - '0'); + else + break; /* stop and drop */ + digit++; + } } - /* copy characters downward until the next ampersand */ - for(; (i < size) && ('&' != (start[j] = start[i])); i++, j++) - ; - } - tok->size = j; + if(digit == p && c != 0) + { /* drop null char or unparsable entity */ + /* Replace the entity with the UTF-8 representation */ + q = guththila_utf8_bytes(c, q); + } + } /* else drop unknown entity */ + p++; /* go over ';' */ } + tok->size = q - start; } /* |