mirror of https://github.com/ipxe/ipxe.git
[utf8] Add ability to accumulate Unicode characters from UTF-8 bytes
Signed-off-by: Michael Brown <mcb30@ipxe.org>
pull/631/head
parent
2acdc92994
commit
3cd3a73261
|
@ -0,0 +1,137 @@
|
||||||
|
/*
|
||||||
|
* Copyright (C) 2022 Michael Brown <mbrown@fensystems.co.uk>.
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or
|
||||||
|
* modify it under the terms of the GNU General Public License as
|
||||||
|
* published by the Free Software Foundation; either version 2 of the
|
||||||
|
* License, or any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful, but
|
||||||
|
* WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
* General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
|
||||||
|
* 02110-1301, USA.
|
||||||
|
*
|
||||||
|
* You can also choose to distribute this program under the terms of
|
||||||
|
* the Unmodified Binary Distribution Licence (as given in the file
|
||||||
|
* COPYING.UBDL), provided that you have satisfied its requirements.
|
||||||
|
*/
|
||||||
|
|
||||||
|
FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL );
|
||||||
|
|
||||||
|
#include <stdint.h>
|
||||||
|
#include <assert.h>
|
||||||
|
#include <ipxe/utf8.h>
|
||||||
|
|
||||||
|
/** @file
|
||||||
|
*
|
||||||
|
* UTF-8 Unicode encoding
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Accumulate Unicode character from UTF-8 byte sequence
|
||||||
|
*
|
||||||
|
* @v utf8 UTF-8 accumulator
|
||||||
|
* @v byte UTF-8 byte
|
||||||
|
* @ret character Unicode character, or 0 if incomplete
|
||||||
|
*/
|
||||||
|
unsigned int utf8_accumulate ( struct utf8_accumulator *utf8, uint8_t byte ) {
|
||||||
|
static unsigned int min[] = {
|
||||||
|
UTF8_MIN_TWO,
|
||||||
|
UTF8_MIN_THREE,
|
||||||
|
UTF8_MIN_FOUR,
|
||||||
|
};
|
||||||
|
unsigned int shift;
|
||||||
|
unsigned int len;
|
||||||
|
uint8_t tmp;
|
||||||
|
|
||||||
|
/* Handle continuation bytes */
|
||||||
|
if ( UTF8_IS_CONTINUATION ( byte ) ) {
|
||||||
|
|
||||||
|
/* Fail if this is an unexpected continuation byte */
|
||||||
|
if ( utf8->remaining == 0 ) {
|
||||||
|
DBGC ( utf8, "UTF8 %p unexpected %02x\n", utf8, byte );
|
||||||
|
return UTF8_INVALID;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Apply continuation byte */
|
||||||
|
utf8->character <<= UTF8_CONTINUATION_BITS;
|
||||||
|
utf8->character |= ( byte & UTF8_CONTINUATION_MASK );
|
||||||
|
|
||||||
|
/* Return 0 if more continuation bytes are expected */
|
||||||
|
if ( --utf8->remaining != 0 )
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
/* Fail if sequence is illegal */
|
||||||
|
if ( utf8->character < utf8->min ) {
|
||||||
|
DBGC ( utf8, "UTF8 %p illegal %02x\n", utf8,
|
||||||
|
utf8->character );
|
||||||
|
return UTF8_INVALID;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Sanity check */
|
||||||
|
assert ( utf8->character != 0 );
|
||||||
|
|
||||||
|
/* Return completed character */
|
||||||
|
DBGC2 ( utf8, "UTF8 %p accumulated %02x\n",
|
||||||
|
utf8, utf8->character );
|
||||||
|
return utf8->character;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Reset state and report failure if this is an unexpected
|
||||||
|
* non-continuation byte. Do not return UTF8_INVALID since
|
||||||
|
* doing so could cause us to drop a valid ASCII character.
|
||||||
|
*/
|
||||||
|
if ( utf8->remaining != 0 ) {
|
||||||
|
shift = ( utf8->remaining * UTF8_CONTINUATION_BITS );
|
||||||
|
DBGC ( utf8, "UTF8 %p unexpected %02x (partial %02x/%02x)\n",
|
||||||
|
utf8, byte, ( utf8->character << shift ),
|
||||||
|
( ( 1 << shift ) - 1 ) );
|
||||||
|
utf8->remaining = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Handle initial bytes */
|
||||||
|
if ( ! UTF8_IS_ASCII ( byte ) ) {
|
||||||
|
|
||||||
|
/* Sanity check */
|
||||||
|
assert ( utf8->remaining == 0 );
|
||||||
|
|
||||||
|
/* Count total number of bytes in sequence */
|
||||||
|
tmp = byte;
|
||||||
|
len = 0;
|
||||||
|
while ( tmp & UTF8_HIGH_BIT ) {
|
||||||
|
tmp <<= 1;
|
||||||
|
len++;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Check for illegal length */
|
||||||
|
if ( len > UTF8_MAX_LEN ) {
|
||||||
|
DBGC ( utf8, "UTF8 %p illegal %02x length %d\n",
|
||||||
|
utf8, byte, len );
|
||||||
|
return UTF8_INVALID;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Store initial bits of character */
|
||||||
|
utf8->character = ( tmp >> len );
|
||||||
|
|
||||||
|
/* Store number of bytes remaining */
|
||||||
|
len--;
|
||||||
|
utf8->remaining = len;
|
||||||
|
assert ( utf8->remaining > 0 );
|
||||||
|
|
||||||
|
/* Store minimum legal value */
|
||||||
|
utf8->min = min[ len - 1 ];
|
||||||
|
assert ( utf8->min > 0 );
|
||||||
|
|
||||||
|
/* Await continuation bytes */
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Handle ASCII bytes */
|
||||||
|
return byte;
|
||||||
|
}
|
|
@ -0,0 +1,69 @@
|
||||||
|
#ifndef _IPXE_UTF8_H
|
||||||
|
#define _IPXE_UTF8_H
|
||||||
|
|
||||||
|
/** @file
|
||||||
|
*
|
||||||
|
* UTF-8 Unicode encoding
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL );
|
||||||
|
|
||||||
|
#include <stdint.h>
|
||||||
|
|
||||||
|
/** Maximum length of UTF-8 sequence */
#define UTF8_MAX_LEN 4

/** Minimum legal value for two-byte UTF-8 sequence */
#define UTF8_MIN_TWO 0x80

/** Minimum legal value for three-byte UTF-8 sequence */
#define UTF8_MIN_THREE 0x800

/** Minimum legal value for four-byte UTF-8 sequence */
#define UTF8_MIN_FOUR 0x10000

/** High bit of UTF-8 bytes */
#define UTF8_HIGH_BIT 0x80

/** Number of data bits in each continuation byte */
#define UTF8_CONTINUATION_BITS 6

/** Bit mask for data bits in a continuation byte */
#define UTF8_CONTINUATION_MASK ( ( 1 << UTF8_CONTINUATION_BITS ) - 1 )

/** Non-data bits in a continuation byte */
#define UTF8_CONTINUATION 0x80

/** Bit mask for non-data bits in a continuation byte
 *
 * Confined to the low eight bits so that a sign-extended (plain or
 * signed) char argument is classified by its byte value alone.
 */
#define UTF8_CONTINUATION_TAG_MASK ( 0xff & ~UTF8_CONTINUATION_MASK )

/** Check for a continuation byte
 *
 * @v byte		UTF-8 byte
 * @ret is_continuation	Byte is a continuation byte
 *
 * Only the low eight bits of the argument are examined, so the
 * result is the same whether the byte is held in a signed or an
 * unsigned type.
 */
#define UTF8_IS_CONTINUATION( byte ) \
	( ( (byte) & UTF8_CONTINUATION_TAG_MASK ) == UTF8_CONTINUATION )

/** Check for an ASCII byte
 *
 * @v byte		UTF-8 byte
 * @ret is_ascii	Byte is an ASCII byte
 */
#define UTF8_IS_ASCII( byte ) ( ! ( (byte) & UTF8_HIGH_BIT ) )

/** Invalid character returned when decoding fails */
#define UTF8_INVALID 0xfffd
|
||||||
|
|
||||||
|
/** State carried between calls while a UTF-8 sequence is decoded */
struct utf8_accumulator {
	/** Bits of the character gathered so far */
	unsigned int character;
	/** Continuation bytes still awaited (zero when idle) */
	unsigned int remaining;
	/** Smallest value the completed character may legally have */
	unsigned int min;
};
|
||||||
|
|
||||||
|
/* Feed one byte into a UTF-8 accumulator */
extern unsigned int
utf8_accumulate ( struct utf8_accumulator *utf8, uint8_t byte );
|
||||||
|
|
||||||
|
#endif /* _IPXE_UTF8_H */
|
Loading…
Reference in New Issue