add built-in UTF8 support, use as default (Bernhard Kaindl, Jean-Pierre Andre)

master
szaka 2008-12-21 23:28:08 +00:00
parent f7a15b7041
commit 0017cc31df
1 changed files with 311 additions and 0 deletions

View File

@ -3,6 +3,8 @@
*
* Copyright (c) 2000-2004 Anton Altaparmakov
* Copyright (c) 2002-2008 Szabolcs Szakacsits
* Copyright (c) 2008 Jean-Pierre Andre
* Copyright (c) 2008 Bernhard Kaindl
*
* This program/include file is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as published
@ -39,6 +41,9 @@
#ifdef HAVE_ERRNO_H
#include <errno.h>
#endif
#ifdef HAVE_LOCALE_H
#include <locale.h>
#endif
#include "attrib.h"
#include "types.h"
@ -47,6 +52,8 @@
#include "logging.h"
#include "misc.h"
#define NOREVBOM 0 /* JPA rejecting U+FFFE and U+FFFF, open to debate */
/*
* IMPORTANT
* =========
@ -55,6 +62,8 @@
* encoding inside the strings!!!
*/
static int use_utf8 = 1; /* use UTF-8 encoding for file names */
/*
* This is used by the name collation functions to quickly determine what
* characters are (in)valid.
@ -373,6 +382,282 @@ int ntfs_file_values_compare(const FILE_NAME_ATTR *file_name_attr1,
err_val, ic, upcase, upcase_len);
}
/*
NTFS uses Unicode (UTF-16LE [NTFS-3G uses UCS-2LE, which is enough
for now]) for path names, but the Unicode code points need to be
converted before a path can be accessed under NTFS. For 7 bit ASCII/ANSI,
glibc does this even without a locale in a hard-coded fashion as that
appears to be is easy because the low 7-bit ASCII range appears to be
available in all charsets but it does not convert anything if
there was some error with the locale setup or none set up like
when mount is called during early boot where he (by policy) do
not use locales (and may be not available if /usr is not yet mounted),
so this patch fixes the resulting issues for systems which use
UTF-8 and for others, specifying the locale in fstab brings them
the encoding which they want.
If no locale is defined or there was a problem with setting one
up and whenever nl_langinfo(CODESET) returns a sting starting with
"ANSI", use an internal UCS-2LE <-> UTF-8 codeset converter to fix
the bug where NTFS-3G does not show any path names which include
international characters!!! (and also fails on creating them) as result.
Author: Bernhard Kaindl <bk@suse.de>
Jean-Pierre Andre made it compliant with RFC3629/RFC2781.
*/
/*
* Return the amount of 8-bit elements in UTF-8 needed (without
* the terminating null) to store a given UTF-16LE string.
*/
static int utf16_to_utf8_size(const ntfschar *ins, const int ins_len, int outs_len)
{
int i;
int count = 0;
BOOL surrog;
surrog = FALSE;
for (i = 0; i < ins_len && ins[i]; i++) {
unsigned short c = le16_to_cpu(ins[i]);
if (surrog) {
if ((c >= 0xdc00) && (c < 0xe000)) {
surrog = FALSE;
count += 4;
} else goto fail;
} else
if (c < 0x80)
count++;
else if (c < 0x800)
count += 2;
else if (c < 0xd800)
count += 3;
else if (c < 0xdc00)
surrog = TRUE;
#if NOREVBOM
else if ((c >= 0xe000) && (c < 0xfffe))
#else
else if (c >= 0xe000)
#endif
count += 3;
else goto fail;
if (count > outs_len)
goto fail;
}
if (surrog) goto fail;
return count;
fail:
return -1;
}
/*
* ntfs_utf16_to_utf8 - convert a little endian UTF16LE string to an UTF-8 string
* @ins: input utf16 string buffer
* @ins_len: length of input string in utf16 characters
* @outs: on return contains the (allocated) output multibyte string
* @outs_len: length of output buffer in bytes
*/
static int ntfs_utf16_to_utf8(const ntfschar *ins, const int ins_len,
char **outs, int outs_len)
{
char *t;
int i, size;
ntfschar halfpair;
halfpair = 0;
if (!*outs)
outs_len = PATH_MAX;
size = utf16_to_utf8_size(ins, ins_len, outs_len);
if (size < 0) {
errno = ENAMETOOLONG;
goto fail;
}
if (!*outs)
*outs = ntfs_malloc((outs_len = size + 1));
t = *outs;
for (i = 0; i < ins_len && ins[i]; i++) {
unsigned short c = le16_to_cpu(ins[i]);
/* size not double-checked */
if (halfpair) {
if ((c >= 0xdc00) && (c < 0xe000)) {
*t++ = 0xf0 + (((halfpair + 64) >> 8) & 7);
*t++ = 0x80 + (((halfpair + 64) >> 2) & 63);
*t++ = 0x80 + ((c >> 6) & 15) + ((halfpair & 3) << 4);
*t++ = 0x80 + (c & 63);
halfpair = 0;
} else goto fail;
} else if (c < 0x80) {
*t++ = c;
} else {
if (c < 0x800) {
*t++ = (0xc0 | ((c >> 6) & 0x3f));
*t++ = 0x80 | (c & 0x3f);
} else if (c < 0xd800) {
*t++ = 0xe0 | (c >> 12);
*t++ = 0x80 | ((c >> 6) & 0x3f);
*t++ = 0x80 | (c & 0x3f);
} else if (c < 0xdc00)
halfpair = c;
else if (c >= 0xe000) {
*t++ = 0xe0 | (c >> 12);
*t++ = 0x80 | ((c >> 6) & 0x3f);
*t++ = 0x80 | (c & 0x3f);
} else goto fail;
}
}
*t = '\0';
return t - *outs;
fail:
return -1;
}
/*
* Return the amount of 16-bit elements in UTF-16LE needed
* (without the terminating null) to store given UTF-8 string.
*
* Return -1 if it does not fit into PATH_MAX.
*
* Note: This does not check whether the input sequence is a valid utf8 string,
* and should be used only in context where such check is made!
*/
static int utf8_to_utf16_size(const char *s)
{
unsigned int byte;
size_t count = 0;
while ((byte = *((const unsigned char *)s++))) {
if (++count >= PATH_MAX || byte >= 0xF5)
goto fail;
if (!*s) break;
if (byte >= 0xC0) s++;
if (!*s) break;
if (byte >= 0xE0) s++;
if (!*s) break;
if (byte >= 0xF0) {
s++;
if (++count >= PATH_MAX)
goto fail;
}
}
return count;
fail:
return -1;
}
/*
* This converts one UTF-8 sequence to cpu-endian Unicode value
* within range U+0 .. U+10ffff and excluding U+D800 .. U+DFFF
* Returns the number of used utf8 bytes or -1 if sequence is invalid.
*/
static int utf8_to_unicode(u32 *wc, const char *s)
{
unsigned int byte = *((const unsigned char *)s);
/* single byte */
if (byte == 0) {
*wc = (u32) 0;
return 0;
} else if (byte < 0x80) {
*wc = (u32) byte;
return 1;
/* double byte */
} else if (byte < 0xc2) {
goto fail;
} else if (byte < 0xE0) {
if (strlen(s) < 2)
goto fail;
if ((s[1] & 0xC0) == 0x80) {
*wc = ((u32)(byte & 0x1F) << 6)
| ((u32)(s[1] & 0x3F));
return 2;
} else
goto fail;
/* three-byte */
} else if (byte < 0xF0) {
if (strlen(s) < 3)
goto fail;
if (((s[1] & 0xC0) == 0x80) && ((s[2] & 0xC0) == 0x80)) {
*wc = ((u32)(byte & 0x0F) << 12)
| ((u32)(s[1] & 0x3F) << 6)
| ((u32)(s[2] & 0x3F));
/* Check valid ranges */
#if NOREVBOM
if (((*wc >= 0x800) && (*wc <= 0xD7FF))
|| ((*wc >= 0xe000) && (*wc <= 0xFFFD)))
return 3;
#else
if (((*wc >= 0x800) && (*wc <= 0xD7FF))
|| ((*wc >= 0xe000) && (*wc <= 0xFFFF)))
return 3;
#endif
}
goto fail;
/* four-byte */
} else if (byte < 0xF5) {
if (strlen(s) < 4)
goto fail;
if (((s[1] & 0xC0) == 0x80) && ((s[2] & 0xC0) == 0x80)
&& ((s[3] & 0xC0) == 0x80)) {
*wc = ((u32)(byte & 0x07) << 18)
| ((u32)(s[1] & 0x3F) << 12)
| ((u32)(s[2] & 0x3F) << 6)
| ((u32)(s[3] & 0x3F));
/* Check valid ranges */
if ((*wc <= 0x10ffff) && (*wc >= 0x10000))
return 4;
}
goto fail;
}
fail:
return -1;
}
/**
* ntfs_utf8_to_utf16 - convert a UTF-8 string to a UTF-16LE string
* @ins: input multibyte string buffer
* @outs: on return contains the (allocated) output utf16 string
* @outs_len: length of output buffer in utf16 characters
*/
static int ntfs_utf8_to_utf16(const char *ins, ntfschar **outs)
{
const char *t = ins;
u32 wc;
ntfschar *outpos;
int shorts = utf8_to_utf16_size(ins);
if (shorts < 0) {
errno = EILSEQ;
goto fail;
}
if (!*outs)
*outs = ntfs_malloc((shorts+1) * sizeof(ntfschar));
outpos = *outs;
while(1) {
int m = utf8_to_unicode(&wc, t);
if (m < 0) {
errno = EILSEQ;
goto fail;
}
if (wc < 0x10000)
*outpos++ = cpu_to_le16(wc);
else {
wc -= 0x10000;
*outpos++ = cpu_to_le16((wc >> 10) + 0xd800);
*outpos++ = cpu_to_le16((wc & 0x3ff) + 0xdc00);
}
if (m == 0)
break;
t += m;
}
return --outpos - *outs;
fail:
return -1;
}
/**
* ntfs_ucstombs - convert a little endian Unicode string to a multibyte string
* @ins: input Unicode string buffer
@ -419,6 +704,8 @@ int ntfs_ucstombs(const ntfschar *ins, const int ins_len, char **outs,
errno = ENAMETOOLONG;
return -1;
}
if (use_utf8)
return ntfs_utf16_to_utf8(ins, ins_len, outs, outs_len);
if (!mbs) {
mbs_len = (ins_len + 1) * MB_CUR_MAX;
mbs = ntfs_malloc(mbs_len);
@ -525,6 +812,9 @@ int ntfs_mbstoucs(const char *ins, ntfschar **outs)
return -1;
}
if (use_utf8)
return ntfs_utf8_to_utf16(ins, outs);
/* Determine the size of the multi-byte string in bytes. */
ins_size = strlen(ins);
/* Determine the length of the multi-byte string. */
@ -734,3 +1024,24 @@ void ntfs_ucsfree(ntfschar *ucs)
free(ucs);
}
/*
* Define the character encoding to be used.
* Use UTF-8 unless specified otherwise.
*/
int ntfs_set_char_encoding(const char *locale)
{
use_utf8 = 0;
if (!locale || strstr(locale,"utf8") || strstr(locale,"UTF8")
|| strstr(locale,"utf-8") || strstr(locale,"UTF-8"))
use_utf8 = 1;
else
if (setlocale(LC_ALL, locale))
use_utf8 = 0;
else {
ntfs_log_error("Invalid locale, encoding to UTF-8\n");
use_utf8 = 1;
}
return 0; /* always successful */
}