diff --git a/libntfs-3g/unistr.c b/libntfs-3g/unistr.c index 7f278cd1..71802aa7 100644 --- a/libntfs-3g/unistr.c +++ b/libntfs-3g/unistr.c @@ -61,6 +61,11 @@ #define NOREVBOM 0 /* JPA rejecting U+FFFE and U+FFFF, open to debate */ +#ifndef ALLOW_BROKEN_SURROGATES +/* Erik allowing broken UTF-16 surrogate pairs by default, open to debate. */ +#define ALLOW_BROKEN_SURROGATES 1 +#endif /* !defined(ALLOW_BROKEN_SURROGATES) */ + /* * IMPORTANT * ========= @@ -462,8 +467,22 @@ static int utf16_to_utf8_size(const ntfschar *ins, const int ins_len, int outs_l if ((c >= 0xdc00) && (c < 0xe000)) { surrog = FALSE; count += 4; - } else + } else { +#if ALLOW_BROKEN_SURROGATES + /* The first UTF-16 unit of a surrogate pair has + * a value between 0xd800 and 0xdc00. It can be + * encoded as an individual UTF-8 sequence if we + * cannot combine it with the next UTF-16 unit + * unit as a surrogate pair. */ + surrog = FALSE; + count += 3; + + --i; + continue; +#else goto fail; +#endif /* ALLOW_BROKEN_SURROGATES */ + } } else if (c < 0x80) count++; @@ -473,6 +492,10 @@ static int utf16_to_utf8_size(const ntfschar *ins, const int ins_len, int outs_l count += 3; else if (c < 0xdc00) surrog = TRUE; +#if ALLOW_BROKEN_SURROGATES + else if (c < 0xe000) + count += 3; +#endif /* ALLOW_BROKEN_SURROGATES */ #if NOREVBOM else if ((c >= 0xe000) && (c < 0xfffe)) #else @@ -487,7 +510,11 @@ static int utf16_to_utf8_size(const ntfschar *ins, const int ins_len, int outs_l } } if (surrog) +#if ALLOW_BROKEN_SURROGATES + count += 3; /* ending with a single surrogate */ +#else goto fail; +#endif /* ALLOW_BROKEN_SURROGATES */ ret = count; out: @@ -548,8 +575,24 @@ static int ntfs_utf16_to_utf8(const ntfschar *ins, const int ins_len, *t++ = 0x80 + ((c >> 6) & 15) + ((halfpair & 3) << 4); *t++ = 0x80 + (c & 63); halfpair = 0; - } else + } else { +#if ALLOW_BROKEN_SURROGATES + /* The first UTF-16 unit of a surrogate pair has + * a value between 0xd800 and 0xdc00. It can be + * encoded as an individual UTF-8 sequence if we + * cannot combine it with the next UTF-16 unit + * unit as a surrogate pair. */ + *t++ = 0xe0 | (halfpair >> 12); + *t++ = 0x80 | ((halfpair >> 6) & 0x3f); + *t++ = 0x80 | (halfpair & 0x3f); + halfpair = 0; + + --i; + continue; +#else goto fail; +#endif /* ALLOW_BROKEN_SURROGATES */ + } } else if (c < 0x80) { *t++ = c; } else { @@ -562,6 +605,13 @@ static int ntfs_utf16_to_utf8(const ntfschar *ins, const int ins_len, *t++ = 0x80 | (c & 0x3f); } else if (c < 0xdc00) halfpair = c; +#if ALLOW_BROKEN_SURROGATES + else if (c < 0xe000) { + *t++ = 0xe0 | (c >> 12); + *t++ = 0x80 | ((c >> 6) & 0x3f); + *t++ = 0x80 | (c & 0x3f); + } +#endif /* ALLOW_BROKEN_SURROGATES */ else if (c >= 0xe000) { *t++ = 0xe0 | (c >> 12); *t++ = 0x80 | ((c >> 6) & 0x3f); @@ -570,6 +620,13 @@ static int ntfs_utf16_to_utf8(const ntfschar *ins, const int ins_len, goto fail; } } +#if ALLOW_BROKEN_SURROGATES + if (halfpair) { /* ending with a single surrogate */ + *t++ = 0xe0 | (halfpair >> 12); + *t++ = 0x80 | ((halfpair >> 6) & 0x3f); + *t++ = 0x80 | (halfpair & 0x3f); + } +#endif /* ALLOW_BROKEN_SURROGATES */ *t = '\0'; #if defined(__APPLE__) || defined(__DARWIN__) @@ -693,10 +750,16 @@ static int utf8_to_unicode(u32 *wc, const char *s) /* Check valid ranges */ #if NOREVBOM if (((*wc >= 0x800) && (*wc <= 0xD7FF)) +#if ALLOW_BROKEN_SURROGATES + || ((*wc >= 0xD800) && (*wc <= 0xDFFF)) +#endif /* ALLOW_BROKEN_SURROGATES */ || ((*wc >= 0xe000) && (*wc <= 0xFFFD))) return 3; #else if (((*wc >= 0x800) && (*wc <= 0xD7FF)) +#if ALLOW_BROKEN_SURROGATES + || ((*wc >= 0xD800) && (*wc <= 0xDFFF)) +#endif /* ALLOW_BROKEN_SURROGATES */ || ((*wc >= 0xe000) && (*wc <= 0xFFFF))) return 3; #endif