summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--guththila/src/guththila_xml_parser.c215
1 files changed, 154 insertions, 61 deletions
diff --git a/guththila/src/guththila_xml_parser.c b/guththila/src/guththila_xml_parser.c
index 7c636a5..7d61254 100644
--- a/guththila/src/guththila_xml_parser.c
+++ b/guththila/src/guththila_xml_parser.c
@@ -401,87 +401,180 @@ guththila_un_init(
return GUTHTHILA_SUCCESS;
}
-/*
+/*
+ * Tells how many bytes (1 to 6) the multi-byte encoding of the char c
+ * takes, using the limits 0x80, 0x800, 0x10000, 0x200000 and 0x4000000.
+ */
+static int
+guththila_utf8_bytes_len(
+    unsigned int c)
+{
+    /* limits[i] is the smallest value that no longer fits in i + 1 bytes */
+    static const unsigned int limits[] =
+        { 0x80, 0x800, 0x10000, 0x200000, 0x4000000 };
+    int len = 1;
+
+    while(len < 6 && c >= limits[len - 1])
+    {
+        len++;
+    }
+    return len;
+}
+
+/*
+ * Writes the unicode char c at start as a UTF-8 style sequence of one to
+ * six bytes and returns the position right behind the last byte written.
+ * No length is passed in and no bounds are checked here.
+ */
+static guththila_char_t*
+guththila_utf8_bytes(
+    unsigned int c,
+    guththila_char_t *start)
+{
+    int lead;   /* marker bits of the first byte */
+    int tail;   /* number of continuation bytes after the first byte */
+    int i;
+
+    if(c < 0x80)
+    {
+        /* Fits in a single byte, which is stored unchanged */
+        start[0] = c;
+        return start + 1;
+    }
+
+    if(c < 0x800)
+    {
+        lead = 0xc0;
+        tail = 1;
+    }
+    else if(c < 0x10000)
+    {
+        lead = 0xe0;
+        tail = 2;
+    }
+    else if(c < 0x200000)
+    {
+        lead = 0xf0;
+        tail = 3;
+    }
+    else if(c < 0x4000000)
+    {
+        lead = 0xf8;
+        tail = 4;
+    }
+    else
+    {
+        lead = 0xfc;
+        tail = 5;
+    }
+
+    /* Continuation bytes are filled back to front, six bits of c each */
+    for(i = tail; i > 0; i--)
+    {
+        start[i] = (c & 0x3f) | 0x80;
+        c >>= 6;
+    }
+    /* Whatever is left of c is OR-ed into the marker of the first byte */
+    start[0] = c | lead;
+    return start + tail + 1;
+}
+
+
+/*
* Replace the references with the corresponding actual values.
+ *
+ * The text between tok->start and tok->start + tok->size is rewritten in
+ * place and tok->size is set to the length of the result. Handled are the
+ * names gt, lt, amp, quot and apos and the numeric forms &#N; and &#xH;,
+ * which are stored through guththila_utf8_bytes(). A reference that is
+ * unknown, malformed, zero or above 0x7fffffff is dropped; an '&' without
+ * a closing ';' drops the rest of the token.
*/
static void
guththila_token_evaluate_references(
guththila_token_t * tok)
{
- size_t size = tok->size;
guththila_char_t *start = tok->start;
- guththila_char_t *pos = NULL;
- size_t i, j;
+ guththila_char_t *end = start + tok->size;
+ guththila_char_t *p = start; /* read position */
+ guththila_char_t *q = NULL; /* write position */
+ guththila_char_t *entity = NULL; /* first char behind the '&' */
+ int entity_len = 0; /* number of chars between '&' and ';' */
- pos = (guththila_char_t *)memchr(start, '&', size);
- if(pos)
+ while(p < end && *p != '&')
{
- i = pos - start;
+ p++;
}
- else
- {
- i = size;
- }
- /*for(i = 0; (i < size) && (start[i] != '&'); i++)
- ;*/
- if(i < size)
+ q = p; /* nothing in front of the first '&' has to move */
+ while(p < end)
{
- j = i;
- while(i < size)
+ /* Copy characters until the next ampersand */
+ if(*p != '&')
{
- if(((i + 3) < size) && (start[i + 1] == 'g') && (start[i + 2] == 't') && (start[i + 3]
- == ';'))
- {
- /* replace first char of sequence with > */
- start[j++] = '>';
- /* skip remainder of sequence */
- i += 4;
- }
- else if(((i + 3) < size) && (start[i + 1] == 'l') && (start[i + 2] == 't') && (start[i
- + 3] == ';'))
- {
- /* replace first char of sequence with < */
- start[j++] = '<';
- /* skip remainder of sequence */
- i += 4;
- }
- else if(((i + 4) < size) && (start[i + 1] == 'a') && (start[i + 2] == 'm') && (start[i
- + 3] == 'p') && (start[i + 4] == ';'))
+ *q++ = *p++;
+ continue;
+ }
+ entity = ++p;
+ /* Find the end of the entity, marked by ';' */
+ while(p < end && *p != ';')
+ {
+ p++;
+ }
+ if(p == end)
+ {
+ break; /* Drop unterminated entity */
+ }
+ entity_len = p - entity;
+ if(entity_len == 2 && entity[1] == 't')
+ {
+ if(entity[0] == 'g')
+ *q++ = '>';
+ else if(entity[0] == 'l')
+ *q++ = '<';
+ /* else drop */
+ }
+ else if(entity_len == 3 && entity[0] == 'a' && entity[1] == 'm' && entity[2] == 'p')
+ {
+ *q++ = '&';
+ }
+ else if(entity_len == 4 && entity[2] == 'o')
+ {
+ if(entity[0] == 'q' && entity[1] == 'u' && entity[3] == 't')
{
- /* replace first char of sequence with & */
- start[j++] = '&';
- /* skip remainder of sequence */
- i += 5;
+ *q++ = '"';
}
- else if(((i + 5) < size) && (start[i + 1] == 'a') && (start[i + 2] == 'p') && (start[i
- + 3] == 'o') && (start[i + 4] == 's') && (start[i + 5] == ';'))
+ else if(entity[0] == 'a' && entity[1] == 'p' && entity[3] == 's')
{
- /* replace first char of sequence with ' */
- start[j++] = '\'';
- /* skip remainder of sequence */
- i += 6;
+ *q++ = '\'';
}
- else if(((i + 5) < size) && (start[i + 1] == 'q') && (start[i + 2] == 'u') && (start[i
- + 3] == 'o') && (start[i + 4] == 't') && (start[i + 5] == ';'))
- {
- /* replace first char of sequence with " */
- start[j++] = '\"';
- /* skip remainder of sequence */
- i += 6;
+ }
+ else if(entity_len >= 2 && entity[0] == '#')
+ {
+ /* p points to the ';' */
+ unsigned int c = 0; /* unsigned: a long digit run must not overflow a signed int */
+ const unsigned int max_c = 0x7fffffffu; /* largest value the 6-byte form can hold */
+ guththila_char_t b;
+ guththila_char_t *digit = entity + 1;
+ if(*digit == 'x')
+ { /* &#x...; */
+ while(++digit < p)
+ {
+ if(c > (max_c >> 4))
+ break; /* one more digit would not fit: stop and drop */
+ b = *digit;
+ if(b >= '0' && b <= '9')
+ c = c << 4 | (b - '0');
+ else if(b >= 'A' && b <= 'F')
+ c = c << 4 | (b - 'A' + 10);
+ else if(b >= 'a' && b <= 'f')
+ c = c << 4 | (b - 'a' + 10);
+ else
+ break; /* stop and drop */
+ }
}
else
- {
- /* ampersand does not start a sequence;
- skip it and continue scanning */
- /* could insert character reference decoding here */
- start[j++] = start[i++];
+ { /* &#...; */
+ while(digit < p)
+ {
+ if(c > max_c / 10)
+ break; /* one more digit would not fit: stop and drop */
+ b = *digit;
+ if(b >= '0' && b <= '9')
+ c = c * 10 + (b - '0');
+ else
+ break; /* stop and drop */
+ digit++;
+ }
}
- /* copy characters downward until the next ampersand */
- for(; (i < size) && ('&' != (start[j] = start[i])); i++, j++)
- ;
- }
- tok->size = j;
+ if(digit == p && c != 0 && c <= max_c)
+ { /* drop null char, oversized or unparsable entity */
+ /* Replace the entity with the UTF-8 representation */
+ q = guththila_utf8_bytes(c, q);
+ }
+ } /* else drop unknown entity */
+ p++; /* go over ';' */
}
+ tok->size = q - start;
}
/*