summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorGravatar shane2013-10-17 15:02:23 -0400
committerGravatar shane2013-10-17 15:02:23 -0400
commit95d1d0cc91f36ece9da39270cd1ca0536d0cf02d (patch)
tree9d2431f1bfa3c4a562cd006bc22ef2b7280d7b66
parente9995aee384bdbac6c6afc0f4548a080eb4241bc (diff)
downloadlibplist-95d1d0cc91f36ece9da39270cd1ca0536d0cf02d.tar.gz
libplist-95d1d0cc91f36ece9da39270cd1ca0536d0cf02d.tar.bz2
UTF-16 surrogate pair fix
Handle UTF-16 surrogate pair conversion to/from UTF-8
-rw-r--r--src/bplist.c41
1 files changed, 37 insertions, 4 deletions
diff --git a/src/bplist.c b/src/bplist.c
index 5db7096..d825fc8 100644
--- a/src/bplist.c
+++ b/src/bplist.c
@@ -286,15 +286,38 @@ static plist_t parse_string_node(char *bnode, uint64_t size)
static char *plist_utf16_to_utf8(uint16_t *unistr, long len, long *items_read, long *items_written)
{
if (!unistr || (len <= 0)) return NULL;
- char *outbuf = (char*)malloc(3*(len+1));
+ char *outbuf = (char*)malloc(4*(len+1));
int p = 0;
int i = 0;
uint16_t wc;
+ uint32_t w;
+ int read_lead_surrogate = 0;
while (i < len) {
wc = unistr[i++];
- if (wc >= 0x800) {
+ if (wc >= 0xD800 && wc <= 0xDBFF) {
+ if (!read_lead_surrogate) {
+ read_lead_surrogate = 1;
+ w = 0x010000 + ((wc & 0x3FF) << 10);
+ } else {
+ // This is invalid, the next 16 bit char should be a trail surrogate.
+ // Handling error by skipping.
+ read_lead_surrogate = 0;
+ }
+ } else if (wc >= 0xDC00 && wc <= 0xDFFF) {
+ if (read_lead_surrogate) {
+ read_lead_surrogate = 0;
+ w = w | (wc & 0x3FF);
+ outbuf[p++] = (char)(0xF0 + ((w >> 18) & 0x3));
+ outbuf[p++] = (char)(0x80 + ((w >> 12) & 0x3F));
+ outbuf[p++] = (char)(0x80 + ((w >> 6) & 0x3F));
+ outbuf[p++] = (char)(0x80 + (w & 0x3F));
+ } else {
+ // This is invalid. A trail surrogate should always follow a lead surrogate.
+ // Handling error by skipping
+ }
+ } else if (wc >= 0x800) {
outbuf[p++] = (char)(0xE0 + ((wc >> 12) & 0xF));
outbuf[p++] = (char)(0x80 + ((wc >> 6) & 0x3F));
outbuf[p++] = (char)(0x80 + (wc & 0x3F));
@@ -988,19 +1011,29 @@ static int is_ascii_string(char* s, int len)
uint16_t *plist_utf8_to_utf16(char *unistr, long size, long *items_read, long *items_written)
{
- uint16_t *outbuf = (uint16_t*)malloc((size+1)*sizeof(uint16_t));
+ uint16_t *outbuf = (uint16_t*)malloc(((size*2)+1)*sizeof(uint16_t));
int p = 0;
int i = 0;
unsigned char c0;
unsigned char c1;
unsigned char c2;
+ unsigned char c3;
+
+ uint32_t w;
while (i < size) {
c0 = unistr[i];
c1 = (i < size-1) ? unistr[i+1] : 0;
c2 = (i < size-2) ? unistr[i+2] : 0;
- if ((c0 >= 0xE0) && (i < size-2) && (c1 >= 0x80) && (c2 >= 0x80)) {
+ c3 = (i < size-3) ? unistr[i+3] : 0;
+ if ((c0 >= 0xF0) && (i < size-3) && (c1 >= 0x80) && (c2 >= 0x80) && (c3 >= 0x80)) {
+ // 4 byte sequence. Need to generate UTF-16 surrogate pair
+ w = ((((c0 & 7) << 18) + ((c1 & 0x3F) << 12) + ((c2 & 0x3F) << 6) + (c3 & 0x3F)) & 0x0FFFFF) - 0x010000;
+ outbuf[p++] = 0xD800 + (w >> 10);
+ outbuf[p++] = 0xDC00 + (w & 0x3FF);
+ i+=4;
+ } else if ((c0 >= 0xE0) && (i < size-2) && (c1 >= 0x80) && (c2 >= 0x80)) {
// 3 byte sequence
outbuf[p++] = ((c2 & 0x3F) + ((c1 & 3) << 6)) + (((c1 >> 2) & 15) << 8) + ((c0 & 15) << 12);
i+=3;