/*
 * src/polypcore/utf8.c -- UTF-8 string validation.
 *
 * Provenance: commit e91740f68ce6334935d9440c12f08e8c136d0b45
 *   Author:  Pierre Ossman
 *   Date:    Mon, 15 May 2006 12:44:44 +0000
 *   Subject: Clean up the UTF-8 validation code.
 *   git-svn-id: file:///home/lennart/svn/public/pulseaudio/trunk@870
 *               fefdeb5f-60dc-0310-8127-8f9354f1896f
 *
 * This is the post-patch state of the hunk starting at line 29 of the file
 * (the macro-based UNICODE_VALID / CONTINUATION_CHAR implementation that the
 * commit removed is not reproduced).
 *
 * NOTE(review): the hunk's context also contained the closing "#endif" of a
 * conditional opened earlier in the file, one #include whose header name was
 * lost when the patch was extracted, and an #include of "utf8.h". Restore
 * those when merging this back into the tree -- TODO confirm against the
 * repository.
 */

#include <stddef.h>
#include <inttypes.h>

/*
 * Return 1 if ch is a code point this validator accepts, 0 otherwise.
 *
 * Rejected: anything at or above 0x110000, the UTF-16 surrogate range
 * U+D800..U+DFFF, and the noncharacters U+FDD0..U+FDEF and U+xxFFFE/U+xxFFFF.
 * The byte order mark U+FEFF is NOT rejected: (0xFEFF & 0xFFFE) is 0xFEFE.
 */
static inline int is_unicode_valid(uint32_t ch) {
    if (ch >= 0x110000)                 /* past the end of the Unicode code space */
        return 0;
    if ((ch & 0xFFFFF800) == 0xD800)    /* U+D800..U+DFFF, reserved for UTF-16 surrogates */
        return 0;
    if (ch >= 0xFDD0 && ch <= 0xFDEF)   /* reserved noncharacter block */
        return 0;
    if ((ch & 0xFFFE) == 0xFFFE)        /* U+xxFFFE and U+xxFFFF noncharacters, in every plane */
        return 0;
    return 1;
}

/* Return 1 if ch has the continuation-byte form 10xxxxxx, 0 otherwise. */
static inline int is_continuation_char(uint8_t ch) {
    return (ch & 0xc0) == 0x80;
}

/* Shift the six payload bits of continuation byte ch into the low end of *u_ch. */
static inline void merge_continuation_char(uint32_t *u_ch, uint8_t ch) {
    *u_ch <<= 6;
    *u_ch |= ch & 0x3f;
}

/*
 * Check that str is a well-formed UTF-8 string.
 *
 * str: NUL-terminated byte string, or NULL.
 *
 * Returns str itself when every sequence up to the terminating NUL is a
 * two-, three- or four-byte sequence (or a plain 7-bit byte) that is not an
 * overlong encoding and decodes to a code point accepted by
 * is_unicode_valid(). Returns NULL otherwise, including when str is NULL.
 */
const char* pa_utf8_valid (const char *str) {
    const uint8_t *p;

    if (!str)
        return NULL;

    for (p = (const uint8_t*) str; *p; p++) {
        uint32_t val;       /* code point being assembled */
        uint32_t min;       /* smallest code point that needs this sequence length */
        int remaining;      /* continuation bytes still expected */

        if (*p < 128)
            continue;       /* plain 7-bit character */

        if ((*p & 0xe0) == 0xc0) {          /* 110xxxxx: two-byte sequence */
            remaining = 1;
            min = 0x80;
            val = *p & 0x1f;                /* all five payload bits */
        } else if ((*p & 0xf0) == 0xe0) {   /* 1110xxxx: three-byte sequence */
            remaining = 2;
            min = 0x800;
            val = *p & 0x0f;
        } else if ((*p & 0xf8) == 0xf0) {   /* 11110xxx: four-byte sequence */
            remaining = 3;
            min = 0x10000;
            val = *p & 0x07;
        } else
            return NULL;                    /* stray continuation byte or unsupported lead byte */

        while (remaining-- > 0) {
            p++;
            /* A terminating NUL is not a continuation byte, so a truncated
             * sequence fails here and p never moves past the terminator. */
            if (!is_continuation_char(*p))
                return NULL;
            merge_continuation_char(&val, *p);
        }

        if (val < min)                      /* overlong encoding */
            return NULL;

        if (!is_unicode_valid(val))
            return NULL;
    }

    return str;
}