diff options
author | darylm503 <darylm503@6f19259b-4bc3-4df7-8a09-765794883524> | 2012-10-05 22:00:43 +0000 |
---|---|---|
committer | darylm503 <darylm503@6f19259b-4bc3-4df7-8a09-765794883524> | 2012-10-05 22:00:43 +0000 |
commit | c42c9cac8c289acc7af256e4578b116cbb576054 (patch) | |
tree | 506cd5ccae6ccda1ec2574003acb54e969ec63b1 /StdLib/LibC/Locale | |
parent | e2a013fa404b095dbf4240fd122d6420021f848e (diff) | |
download | edk2-c42c9cac8c289acc7af256e4578b116cbb576054.tar.gz edk2-c42c9cac8c289acc7af256e4578b116cbb576054.tar.bz2 edk2-c42c9cac8c289acc7af256e4578b116cbb576054.zip |
StdLib: Fix several problems where characters were not being correctly converted between wide and MBCS.
Add utility functions for determining character length of strings.
Contributed-under: TianoCore Contribution Agreement 1.0
Signed-off-by: daryl.mcdaniel@intel.com
Reviewed-by: erik.c.bjorge@intel.com
Reviewed-by: lee.g.rosenbaum@intel.com
StdLib/LibC/
Locale/multibyte_Utf8.c
Improve comments.
Define implementation-specific MBCS utility functions, as declared in <stdlib.h>.
Enhance functionality of EncodeUtf8() and improve error handling.
Set correct conversion state in wcrtomb().
Bug fixes in wcsrtombs().
Make wctob() properly MBCS compliant.
Main/Main.c
Remove code obsoleted by new wcsrtombs() implementation.
git-svn-id: https://edk2.svn.sourceforge.net/svnroot/edk2/trunk/edk2@13785 6f19259b-4bc3-4df7-8a09-765794883524
Diffstat (limited to 'StdLib/LibC/Locale')
-rw-r--r-- | StdLib/LibC/Locale/multibyte_Utf8.c | 443 |
1 files changed, 310 insertions, 133 deletions
diff --git a/StdLib/LibC/Locale/multibyte_Utf8.c b/StdLib/LibC/Locale/multibyte_Utf8.c index 3f29f2942a..36e2cb379e 100644 --- a/StdLib/LibC/Locale/multibyte_Utf8.c +++ b/StdLib/LibC/Locale/multibyte_Utf8.c @@ -15,9 +15,9 @@ #include <wchar.h>
#include <sys/types.h>
-typedef int ch_UCS4;
+typedef int ch_UCS4;
-static mbstate_t LocalConvState = {0};
+static mbstate_t LocalConvState = {0};
/** Map a UTF-8 encoded prefix byte to a sequence length.
Zero means illegal prefix, but valid surrogate if < 0xC0.
@@ -59,12 +59,12 @@ UINT8 utf8_code_length[256] = { /** Process one byte of a multibyte character.
- @param ch
- @param ps
+ @param[in] ch One byte of a multibyte character.
+ @param[in,out] ps Pointer to a conversion state object.
- @retval -2
- @retval -1
- @retval 1:4
+ @retval -2 ch is an incomplete but potentially valid character.
+ @retval -1 ch is not valid in this context.
+ @retval 1:4 The length, in bytes, of the character ch just completed.
**/
static
int
@@ -174,10 +174,10 @@ ProcessOneByte(unsigned char ch, mbstate_t *ps) /** Convert one Multibyte sequence.
- @param Dest
- @param Src
- @param Len
- @param pS
+ @param[out] Dest Pointer to output location, or NULL
+ @param[in] Src Multibyte Source (UTF8)
+ @param[in] Len Max Number of bytes to convert
+ @param[in] pS Pointer to State struct., or NULL
@retval -2 Bytes processed comprise an incomplete, but potentially valid, character.
@retval -1 An encoding error was encountered. ps->E indicates the number of bytes consumed.
@@ -219,87 +219,212 @@ DecodeOneStateful( return NumConv;
}
-/** Convert wide characters (UTF16) into multibyte characters (UTF8)
+/** Report how many bytes a single wide character occupies once it is
+    converted to a MBCS (UTF-8) character.
+
+    The length is selected purely from the numeric value of the character:
+
+      InCh == 0x0000               Zero (NUL is reported as length 0)
+      InCh <  0x0080               One Byte
+      InCh <  0x0800               Two Bytes
+      0xD800 <= InCh < 0xE000      Error, the value is a surrogate
+      Anything else                Three Bytes
+
+    Since UEFI only supports the Unicode Base Multilingual Plane (BMP),
+    Four-byte characters are not supported.
+
+    @param[in]    InCh    Wide character to test.
+
+    @retval   -1    Improperly formed character (a surrogate).
+    @retval    0    InCh is 0x0000
+    @retval   >0    Number of bytes needed for the MBCS character
+**/
+int
+EFIAPI
+OneWcToMcLen(const wchar_t InCh)
+{
+  if(InCh == 0) {
+    return 0;                 // NUL, 0x0000, needs no payload bytes.
+  }
+  if(InCh < 0x0080) {
+    return 1;                 // Plain ASCII.
+  }
+  if(InCh < 0x0800) {
+    return 2;                 // Two-byte sequence.
+  }
+  if((InCh < 0xD800) || (InCh >= 0xE000)) {
+    return 3;                 // Remainder of the BMP, excluding surrogates.
+  }
+  return -1;                  // Surrogates cannot be encoded on their own.
+}
+
+/** Estimate the number of bytes a wide character string occupies when
+    converted to a MBCS (UTF-8) string that must fit within a given limit.
+    Optionally reports how many wide characters that estimate covers.
+
+    Each wide character is sized by OneWcToMcLen().  Scanning stops at the
+    first character for which OneWcToMcLen() returns zero or a negative value
+    (the terminating NUL or an improperly formed character), or as soon as
+    adding the next character would bring the running total up to Limit, so
+    the returned value is always strictly less than Limit when Limit is
+    non-zero.
+
+    Since UEFI only supports the Unicode Base Multilingual Plane (BMP),
+    Four-byte characters should not be encountered.
+
+    @param[in]    Src       Pointer to a wide character string.
+    @param[in]    Limit     Maximum number of bytes the converted string may occupy.
+    @param[out]   NumChar   Pointer to where to store the number of wide characters, or NULL.
+
+    @return   The number of bytes required to convert Src to MBCS,
+              not including the terminating NUL.  If NumChar is not NULL, the number
+              of characters represented by the return value will be written to
+              where it points.
+**/
+size_t
+EFIAPI
+EstimateWtoM(const wchar_t * Src, size_t Limit, size_t *NumChar)
+{
+  ssize_t   ByteTotal;    // Bytes accounted for so far
+  size_t    WideCount;    // Wide characters accounted for so far
+  int       CharLen;      // Encoded length of the character under test
+
+  ByteTotal = 0;
+  WideCount = 0;
+  for( ; ; ++Src) {
+    CharLen = OneWcToMcLen(*Src);
+    if(CharLen <= 0) {
+      break;              // NUL or an improperly formed character ends the scan.
+    }
+    if((size_t)(ByteTotal + CharLen) >= Limit) {
+      break;              // This character would not fit below Limit.
+    }
+    ByteTotal += CharLen;
+    ++WideCount;
+  }
+  if(NumChar != NULL) {
+    *NumChar = WideCount;
+  }
+  return (size_t)ByteTotal;   // Return estimate of required bytes.
+}
+
+/** Determine the number of characters in a MBCS (UTF-8) string.
+
+    MBCS characters are one to four bytes long.  By examining the first byte
+    of a MBCS character, one can determine the number of bytes comprising the
+    character.
+
+      0x00 - 0x7F   One
+      0xC0 - 0xDF   Two
+      0xE0 - 0xEF   Three
+      0xF0 - 0xF7   Four
+
+    Since UEFI only supports the Unicode Base Multilingual Plane (BMP),
+    Four-byte characters should not be encountered; a four-byte lead byte is
+    treated as ill-formed.
+
+    Counting stops at the terminating NUL or at the first ill-formed sequence:
+    a lead byte in the range 0x80 - 0xBF or 0xF0 - 0xFF, or a multi-byte
+    character whose trailing bytes are not all of the form 10xxxxxx.  Because
+    every trailing byte is checked before it is skipped, the scan never steps
+    past the terminating NUL of a truncated string.
+
+    @param[in]    Src     The string to examine, or NULL.
+
+    @return   The number of complete, well-formed characters that precede the
+              terminating NUL or the first ill-formed sequence.
+              Zero is returned if Src is NULL.
+**/
+size_t
+EFIAPI
+CountMbcsChars(const char *Src)
+{
+  const unsigned char  *Cursor;   // Bytes are examined unsigned so range tests work
+  size_t                Count;    // Characters counted so far
+  size_t                SeqLen;   // Length implied by the current lead byte
+  size_t                Index;    // Position within the current sequence
+  unsigned char         Lead;     // Current lead byte
+
+  Count = 0;
+  if(Src == NULL) {
+    return Count;
+  }
+  Cursor = (const unsigned char *)Src;
+  while((Lead = *Cursor) != 0) {
+    if(Lead < 0x80) {
+      SeqLen = 1;
+    }
+    else if((Lead >= 0xC0) && (Lead < 0xE0)) {
+      SeqLen = 2;
+    }
+    else if((Lead >= 0xE0) && (Lead < 0xF0)) {
+      SeqLen = 3;
+    }
+    else {
+      // Ill-formed character: stray continuation byte or four-byte lead.
+      break;
+    }
+    // Every trailing byte must look like 10xxxxxx.  A NUL fails this test,
+    // which keeps the scan inside a truncated string.
+    for(Index = 1; Index < SeqLen; ++Index) {
+      if((Cursor[Index] & 0xC0) != 0x80) {
+        break;
+      }
+    }
+    if(Index < SeqLen) {
+      // Ill-formed or truncated character
+      break;
+    }
+    Cursor += SeqLen;   // Step to the lead byte of the next character.
+    ++Count;            // One character, regardless of its byte length.
+  }
+  return Count;
+}
+
+/** Convert a wide character (UTF16) into a multibyte character (UTF8)
+
+ Converts a wide character into a corresponding multibyte character that
+ begins in the conversion state described by the object pointed to by ps.
+ If dst is not a null pointer, the converted character is then stored into
+ the array pointed to by dst.
+
+ It is the caller's responsibility to ensure that Dest is large enough to
+ hold the resulting MBCS sequence.
@param s Pointer to the wide-character string to convert
- @param size Number of wide characters in s. size <= wcslen(s);
+ @param Dest Pointer to the buffer in which to place the converted sequence, or NULL.
- @return A newly allocated buffer containing the converted string is returned,
- or NULL if an error occurred. Global variable errno contains more
- information if NULL is returned.
+ @retval -1 An error occurred. The error reason is in errno.
+ @retval >=0 The number of bytes stored into Dest.
**/
ssize_t
-EncodeUtf8(char *Dest, wchar_t *s, ssize_t size)
+EncodeUtf8(char *Dest, wchar_t ch)
{
char *p; /* next free byte in build buffer */
- char *v; /* next free byte in destination */
- ssize_t nneeded; /* number of result bytes needed */
- int i; /* index into s of next input byte */
int NumInBuff; // number of bytes in Buff
char Buff[4]; // Buffer into which each character is built
- assert(s != NULL);
- assert(size >= 0);
-
- v = Dest;
- nneeded = 0;
- if((size * MB_LEN_MAX) / MB_LEN_MAX != size) {
- // size is too large and resulted in overflow when multiplied by MB_LEN_MAX
- errno = EINVAL;
- return (ssize_t)-1;
- }
-
- for (i = 0; i < size;) {
- ch_UCS4 ch = s[i++];
p = Buff;
- if (ch < 0x80) {
- /* Encode ASCII -- One Byte */
- *p++ = (char) ch;
- }
- else if (ch < 0x0800) {
- /* Encode Latin-1 -- Two Byte */
- *p++ = (char)(0xc0 | (ch >> 6));
- *p++ = (char)(0x80 | (ch & 0x3f));
- }
- else {
+ NumInBuff = 0;
+ if (ch < 0x80) {
+ /* Encode ASCII -- One Byte */
+ *p++ = (char) ch;
+ NumInBuff = 1;
+ }
+ else if (ch < 0x0800) {
+ /* Encode Latin-1 -- Two Byte */
+ *p++ = (char)(0xc0 | (ch >> 6));
+ *p++ = (char)(0x80 | (ch & 0x3f));
+ NumInBuff = 2;
+ }
+ else {
/* Encode UCS2 Unicode ordinals -- Three Byte */
- /* Special case: check for high surrogate -- Shouldn't happen in UEFI */
- if (0xD800 <= ch && ch <= 0xDBFF && i < size) {
- ch_UCS4 ch2 = s[i];
- /* Check for low surrogate and combine the two to
- form a UCS4 value */
- if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
- ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
- i++;
- /* Encode UCS4 Unicode ordinals -- Four Byte */
- *p++ = (char)(0xf0 | (ch >> 18));
- *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
- *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
- *p++ = (char)(0x80 | (ch & 0x3f));
- continue;
- }
- /* Fall through: handles isolated high surrogates */
+ /* Special case: check for surrogate -- Shouldn't happen in UEFI */
+ if (0xD800 <= ch && ch < 0xE000) {
+ errno = EILSEQ;
+ return -1;
}
+ else {
*p++ = (char)(0xe0 | (ch >> 12));
*p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
*p++ = (char)(0x80 | (ch & 0x3f));
+ NumInBuff = 3;
}
- /* At this point, Buff holds the converted character which is NumInBuff bytes long.
- NumInBuff is the value 1, 2, 3, or 4
- */
- NumInBuff = (int)(p - Buff); // Number of bytes in Buff
- if(Dest != NULL) { // Save character if Dest is not NULL
- memcpy(v, Buff, NumInBuff);
- v += NumInBuff;
- }
- nneeded += NumInBuff; // Keep track of the number of bytes put into Dest
}
- if(Dest != NULL) {
- // Terminate the destination string.
- *v = '\0';
+ /* At this point, Buff holds the converted character which is NumInBuff bytes long.
+ NumInBuff is the value 1, 2, 3, or 4
+ */
+ if(Dest != NULL) { // Save character if Dest is not NULL
+ memcpy(Dest, Buff, NumInBuff);
+
+ if(ch != 0) {
+ // Terminate the destination string.
+ Dest[NumInBuff] = '\0';
+ }
+ else {
+ NumInBuff = 0;
+ }
}
- return nneeded; // Tell the caller
+ return NumInBuff; // Tell the caller
}
// ######################## Narrow to Wide Conversions #######################
@@ -307,6 +432,8 @@ EncodeUtf8(char *Dest, wchar_t *s, ssize_t size) /** If ps is not a null pointer, the mbsinit function determines whether the
pointed-to mbstate_t object describes an initial conversion state.
+ @param[in] ps Pointer to the conversion state object to test.
+
@return The mbsinit function returns nonzero if ps is a null pointer
or if the pointed-to object describes an initial conversion
state; otherwise, it returns zero.
@@ -329,8 +456,14 @@ mbsinit(const mbstate_t *ps) where internal is the mbstate_t object for the mbrlen function, except that
the expression designated by ps is evaluated only once.
- @return The mbrlen function returns a value between zero and n,
- inclusive, (size_t)(-2), or (size_t)(-1).
+ @param[in] s Pointer to a multibyte character sequence.
+ @param[in] n Maximum number of bytes to examine.
+ @param[in] pS Pointer to the conversion state object.
+
+ @retval 0 The next n or fewer bytes complete the multibyte character that corresponds to the null wide character.
+ @retval 1..n The number of bytes that complete the multibyte character.
+ @retval -2 The next n bytes contribute to an incomplete (but potentially valid) multibyte character.
+ @retval -1 An encoding error occurred.
Declared in: wchar.h
**/
@@ -338,10 +471,10 @@ size_t mbrlen(
const char *s,
size_t n,
- mbstate_t *ps
+ mbstate_t *pS
)
{
- return mbrtowc(NULL, s, n, ps);
+ return mbrtowc(NULL, s, n, pS);
}
/** Determine the number of bytes comprising a multibyte character.
@@ -392,6 +525,11 @@ corresponding wide character and then, if pwc is not a null pointer, stores that the object pointed to by pwc. If the corresponding wide character is the null wide
character, the resulting state described is the initial conversion state.
+ @param[out] pwc Pointer to where the resulting wide character is to be stored.
+ @param[in] s Pointer to a multibyte character "string".
+ @param[in] n The maximum number of bytes to inspect.
+ @param[in] ps Pointer to a conversion state object.
+
@retval 0 if the next n or fewer bytes complete the multibyte
character that corresponds to the null wide
character (which is the value stored).
@@ -480,6 +618,11 @@ just past the last multibyte character converted (if any). If conversion stopped reaching a terminating null character and if dst is not a null pointer, the resulting state
described is the initial conversion state.
+ @param[out] dst Pointer to where the resulting wide character sequence is stored.
+ @param[in] src Pointer to a pointer to the multibyte character sequence to convert.
+ @param[in] len Maximum number of wide characters to be stored into dst.
+ @param[in] ps Pointer to a conversion state object.
+
@return If the input conversion encounters a sequence of bytes that do
not form a valid multibyte character, an encoding error occurs:
the mbsrtowcs function stores the value of the macro EILSEQ in
@@ -564,21 +707,23 @@ mbsrtowcs( **/
size_t
mbstowcs(
- wchar_t *pwcs,
- const char *s,
- size_t n
+ wchar_t *Dest,
+ const char *Src,
+ size_t Limit
)
{
- /* pwcs may be NULL */
- /* s may be NULL */
+ /* Dest may be NULL */
+ /* Src may be NULL */
- return mbsrtowcs(pwcs, &s, n, NULL);
+ return mbsrtowcs(Dest, &Src, Limit, NULL);
}
/** The btowc function determines whether C constitutes a valid single-byte
character in the initial shift state.
+ @param[in] C A narrow character to test or convert to wide.
+
@return The btowc function returns WEOF if c has the value EOF or if
(unsigned char)C does not constitute a valid single-byte
character in the initial shift state. Otherwise, it returns the
@@ -621,6 +766,12 @@ array whose first element is pointed to by S. At most MB_CUR_MAX bytes are store wc is a null wide character, a null byte is stored, preceded by any shift sequence needed
to restore the initial shift state; the resulting state described is the initial conversion state.
+ @param[out] Dest Pointer to the location in which to store the resulting
+ multibyte character. Otherwise, NULL to reset the
+ conversion state.
+ @param[in] wchar The wide character to convert.
+ @param[in,out] pS Pointer to a conversion state object, or NULL.
+
@return The wcrtomb function returns the number of bytes stored in the
array object (including any shift sequences). When wc is not a
valid wide character, an encoding error occurs: the function
@@ -631,26 +782,31 @@ to restore the initial shift state; the resulting state described is the initial **/
size_t
wcrtomb(
- char *s,
+ char *Dest,
wchar_t wchar,
- mbstate_t *ps
+ mbstate_t *pS
)
{
size_t RetVal;
- /* s may be NULL */
- if (s == NULL) {
+ /* Dest may be NULL */
+ if (Dest == NULL) {
RetVal = 1;
}
else {
if (wchar == L'\0') {
- *s = '\0';
+ *Dest = '\0';
RetVal = 1;
}
else {
- RetVal = EncodeUtf8(s, &wchar, 1);
+ RetVal = EncodeUtf8(Dest, wchar);
}
}
+ if(pS == NULL) {
+ pS = &LocalConvState;
+ }
+ pS->A = 0; // Set ps to the initial conversion state
+
return RetVal;
}
@@ -698,27 +854,31 @@ wctomb( }
/** The wcsrtombs function converts a sequence of wide characters from the array
- indirectly pointed to by S into a sequence of corresponding multibyte
+ indirectly pointed to by Src into a sequence of corresponding multibyte
characters that begins in the conversion state described by the object
pointed to by ps.
- If S is not a null pointer, the converted characters
- are then stored into the array pointed to by S. Conversion continues
- up to and including a terminating null wide character, which is also
- stored. Conversion stops earlier in two cases: when a wide character is
- reached that does not correspond to a valid multibyte character, or
- (if S is not a null pointer) when the next multibyte character would
- exceed the limit of N total bytes to be stored into the array pointed
- to by S. Each conversion takes place as if by a call to the wcrtomb
- function.)
-
- If S is not a null pointer, the pointer object pointed to by pwcs is
+ If Dest is not a null pointer, the converted characters are stored into the
+ array pointed to by Dest. Conversion continues up to and including a
+ terminating null wide character, which is also stored. Conversion stops
+ earlier in two cases: when a wide character is reached that does not
+ correspond to a valid multibyte character, or (if Dest is not a null
+ pointer) when the next multibyte character would exceed the limit of Limit
+ total bytes to be stored into the array pointed to by Dest. Each conversion
+ takes place as if by a call to the wcrtomb function.)
+
+ If Dest is not a null pointer, the pointer object pointed to by Src is
assigned either a null pointer (if conversion stopped due to reaching
a terminating null wide character) or the address just past the last wide
character converted (if any). If conversion stopped due to reaching a
terminating null wide character, the resulting state described is the
initial conversion state.
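+ @param[out] Dest Pointer to the array in which to store the converted multibyte characters, or NULL.
+ @param[in,out] Src Pointer to a pointer to the wide-character sequence to convert.
+ @param[in] Limit Max number of bytes to store in Dest.
+ @param[in,out] ps Pointer to a conversion state object.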
+
@return If conversion stops because a wide character is reached that
does not correspond to a valid multibyte character, an
encoding error occurs: the wcsrtombs function stores the
@@ -731,38 +891,50 @@ wctomb( **/
size_t
wcsrtombs(
- char *s,
- const wchar_t **pwcs,
- size_t n,
- mbstate_t *ps
+ char *Dest,
+ const wchar_t **Src,
+ size_t Limit,
+ mbstate_t *ps
)
{
- int count = 0;
+ size_t NumStored;
+ ssize_t MaxBytes;
+ int count;
+ wchar_t InCh;
- /* s may be NULL */
- /* pwcs may be NULL */
+ NumStored = 0;
+ MaxBytes = (ssize_t)Limit;
+
+ /* Dest may be NULL */
+ /* Src may be NULL */
/* ps appears to be unused */
- if (pwcs == NULL || *pwcs == NULL)
+ if (Src == NULL || *Src == NULL)
return (0);
- if (s == NULL) {
- while (*(*pwcs)++ != 0)
- count++;
- return(count);
+ if (Dest == NULL) {
+ NumStored = EstimateWtoM(*Src, MaxBytes, NULL);
}
-
- if (n != 0) {
- do {
- if ((*s++ = (char) *(*pwcs)++) == 0) {
- *pwcs = NULL;
+ else {
+ while (OneWcToMcLen(InCh = *(*Src)++) <= MaxBytes) {
+ if(InCh == 0) {
+ *Src = NULL;
break;
}
- count++;
- } while (--n != 0);
+ count = (int)wcrtomb(Dest, InCh, NULL);
+ if(count >= 0) {
+ Dest += count;
+ MaxBytes -= count;
+ NumStored += count;
+ }
+ else {
+ NumStored = (size_t)(-1);
+ }
+ }
}
- return count;
+
+ return NumStored;
}
/** Convert a wide-character string into a multibyte character string.
@@ -794,19 +966,23 @@ wcsrtombs( **/
size_t
wcstombs(
- char *s,
- const wchar_t *pwcs,
- size_t n
+ char *Dest,
+ const wchar_t *Src,
+ size_t Limit
)
{
- /* s may be NULL */
- return wcsrtombs(s, &pwcs, n, NULL);
+ /* Dest may be NULL */
+ return wcsrtombs(Dest, &Src, Limit, NULL);
}
/** The wctob function determines whether C corresponds to a member of the extended
character set whose multibyte character representation is a single byte when in the initial
shift state.
+ wctob needs to be consistent with wcrtomb.
+ If wcrtomb says that a character is representable in 1 byte,
+ then wctob needs to also represent the character as 1 byte.
+
@return The wctob function returns EOF if C does not correspond to a multibyte
character with length one in the initial shift state. Otherwise, it
returns the single-byte representation of that character as an
@@ -817,13 +993,14 @@ wcstombs( int
wctob(wint_t c)
{
- /* wctob needs to be consistent with wcrtomb.
- if wcrtomb says that a character is representable in 1 byte,
- which this implementation always says, then wctob needs to
- also represent the character as 1 byte.
- */
- if (c == WEOF) {
- return EOF;
+ int RetVal;
+
+ RetVal = EOF;
+ if(c == 0) {
+ RetVal = 0;
+ }
+ else if (OneWcToMcLen((const wchar_t)c) == 1) {
+ RetVal = (int)(c & 0xFF);
}
- return (int)(c & 0xFF);
+ return RetVal;
}
|