/** @file UCS2 to UTF8 manipulation library. Copyright (c) 2018 - 2019, Intel Corporation. All rights reserved.
(C) Copyright 2020 Hewlett Packard Enterprise Development LP
SPDX-License-Identifier: BSD-2-Clause-Patent **/ #include #include #include #include #include #include /** Since each UCS2 character can be represented by 1-3 UTF8 encoded characters, this function is used to retrieve the UTF8 encoding size for a UCS2 character. @param[in] Utf8Buffer The buffer for UTF8 encoded data. @retval Return the size of UTF8 encoding string or 0 if it is not for UCS2 format. **/ UINT8 GetUTF8SizeForUCS2 ( IN CHAR8 *Utf8Buffer ) { CHAR8 TempChar; UINT8 Utf8Size; ASSERT (Utf8Buffer != NULL); TempChar = *Utf8Buffer; if ((TempChar & 0xF0) == 0xF0) { // // This format is not for UCS2. // return 0; } Utf8Size = 1; if ((TempChar & 0x80) == 0x80) { if ((TempChar & 0xC0) == 0xC0) { Utf8Size++; if ((TempChar & 0xE0) == 0xE0) { Utf8Size++; } } } return Utf8Size; } /** Since each UCS2 character can be represented by the format: \uXXXX, this function is used to retrieve the UCS2 character from a Unicode format. Call MUST make sure there are at least 6 Bytes in the input UTF8 buffer. @param[in] Utf8Buffer The buffer for UTF8 encoded data. @param[out] Ucs2Char The converted UCS2 character. @retval EFI_INVALID_PARAMETER Non-Ascii characters found in the hexadecimal digits string, and can't be converted to a UCS2 character. @retval EFI_SUCCESS The UCS2 character has been retrieved. **/ EFI_STATUS GetUCS2CharByFormat ( IN CHAR8 *Utf8Buffer, OUT CHAR16 *Ucs2Char ) { UINT8 Num1; UINT8 Num2; UINT8 Index; CHAR8 Ucs2CharFormat[UNICODE_FORMAT_CHAR_SIZE]; /// two Hexadecimal digits Ascii string, like "3F" for (Index = 0; Index < 4; Index++) { if ((*(Utf8Buffer + 2 + Index) & 0x80) != 0x00) { return EFI_INVALID_PARAMETER; } } ZeroMem (Ucs2CharFormat, UNICODE_FORMAT_CHAR_SIZE); // // Get the First Number, Offset is 2 // CopyMem (Ucs2CharFormat, Utf8Buffer + 2, UNICODE_FORMAT_CHAR_LEN); Num1 = (UINT8)AsciiStrHexToUintn (Ucs2CharFormat); // // Get the Second Number, Offset is 4 // CopyMem (Ucs2CharFormat, Utf8Buffer + 4, UNICODE_FORMAT_CHAR_LEN); Num2 = (UINT8)AsciiStrHexToUintn (Ucs2CharFormat); // // Ucs2Char is Little-Endian // *((CHAR8 *)Ucs2Char) = Num2; *(((CHAR8 *)Ucs2Char) + 1) = Num1; return EFI_SUCCESS; } /** Convert a UCS2 character to UTF8 encoding string. @param[in] Ucs2Char The provided UCS2 character. @param[out] Utf8Buffer The converted UTF8 encoded data. @retval Return the size of UTF8 encoding data for this UCS2 character. **/ UINT8 UCS2CharToUTF8 ( IN CHAR16 Ucs2Char, OUT CHAR8 *Utf8Buffer ) { UINT16 Ucs2Number; ASSERT (Utf8Buffer != NULL); Ucs2Number = (UINT16)Ucs2Char; if (Ucs2Number <= 0x007F) { // // UTF8 format: 0xxxxxxx // *Utf8Buffer = Ucs2Char & 0x7F; return 1; } else if ((Ucs2Number >= 0x0080) && (Ucs2Number <= 0x07FF)) { // // UTF8 format: 110xxxxx 10xxxxxx // *(Utf8Buffer + 1) = (Ucs2Char & 0x3F) | 0x80; *Utf8Buffer = ((Ucs2Char >> 6) & 0x1F) | 0xC0; return 2; } else { /// Ucs2Number >= 0x0800 && Ucs2Number <= 0xFFFF // // UTF8 format: 1110xxxx 10xxxxxx 10xxxxxx // *(Utf8Buffer + 2) = (Ucs2Char & 0x3F) | 0x80; *(Utf8Buffer + 1) = ((Ucs2Char >> 6) & 0x3F) | 0x80; *Utf8Buffer = ((Ucs2Char >> 12) & 0x0F) | 0xE0; return 3; } } /** Convert a UTF8 encoded data to a UCS2 character. @param[in] Utf8Buffer The provided UTF8 encoded data. @param[out] Ucs2Char The converted UCS2 character. @retval EFI_INVALID_PARAMETER The UTF8 encoded string is not valid or not for UCS2 character. @retval EFI_SUCCESS The converted UCS2 character. **/ EFI_STATUS UTF8ToUCS2Char ( IN CHAR8 *Utf8Buffer, OUT CHAR16 *Ucs2Char ) { UINT8 Utf8Size; CHAR8 *Ucs2Buffer; CHAR8 TempChar1; CHAR8 TempChar2; CHAR8 TempChar3; ASSERT (Utf8Buffer != NULL && Ucs2Char != NULL); ZeroMem (Ucs2Char, sizeof (CHAR16)); Ucs2Buffer = (CHAR8 *)Ucs2Char; Utf8Size = GetUTF8SizeForUCS2 (Utf8Buffer); switch (Utf8Size) { case 1: // // UTF8 format: 0xxxxxxx // TempChar1 = *Utf8Buffer; if ((TempChar1 & 0x80) != 0x00) { return EFI_INVALID_PARAMETER; } *Ucs2Buffer = TempChar1; *(Ucs2Buffer + 1) = 0; break; case 2: // // UTF8 format: 110xxxxx 10xxxxxx // TempChar1 = *Utf8Buffer; if ((TempChar1 & 0xE0) != 0xC0) { return EFI_INVALID_PARAMETER; } TempChar2 = *(Utf8Buffer + 1); if ((TempChar2 & 0xC0) != 0x80) { return EFI_INVALID_PARAMETER; } *Ucs2Buffer = (TempChar1 << 6) + (TempChar2 & 0x3F); *(Ucs2Buffer + 1) = (TempChar1 >> 2) & 0x07; break; case 3: // // UTF8 format: 1110xxxx 10xxxxxx 10xxxxxx // TempChar1 = *Utf8Buffer; if ((TempChar1 & 0xF0) != 0xE0) { return EFI_INVALID_PARAMETER; } TempChar2 = *(Utf8Buffer + 1); if ((TempChar2 & 0xC0) != 0x80) { return EFI_INVALID_PARAMETER; } TempChar3 = *(Utf8Buffer + 2); if ((TempChar3 & 0xC0) != 0x80) { return EFI_INVALID_PARAMETER; } *Ucs2Buffer = (TempChar2 << 6) + (TempChar3 & 0x3F); *(Ucs2Buffer + 1) = (TempChar1 << 4) + ((TempChar2 >> 2) & 0x0F); break; default: return EFI_INVALID_PARAMETER; } return EFI_SUCCESS; } /** Convert a UCS2 string to a UTF8 encoded string. @param[in] Ucs2Str The provided UCS2 string. @param[out] Utf8StrAddr The converted UTF8 string address. Caller is responsible for Free this string. @retval EFI_INVALID_PARAMETER One or more parameters are invalid. @retval EFI_OUT_OF_RESOURCES System runs out of resources. @retval EFI_SUCCESS The UTF8 encoded string has been converted. **/ EFI_STATUS UCS2StrToUTF8 ( IN CHAR16 *Ucs2Str, OUT CHAR8 **Utf8StrAddr ) { UINTN Ucs2StrIndex; UINTN Ucs2StrLength; CHAR8 *Utf8Str; UINTN Utf8StrLength; UINTN Utf8StrIndex; CHAR8 Utf8Buffer[UTF8_BUFFER_FOR_UCS2_MAX_SIZE]; UINT8 Utf8BufferSize; if ((Ucs2Str == NULL) || (Utf8StrAddr == NULL)) { return EFI_INVALID_PARAMETER; } Ucs2StrLength = StrLen (Ucs2Str); Utf8StrLength = 0; for (Ucs2StrIndex = 0; Ucs2StrIndex < Ucs2StrLength; Ucs2StrIndex++) { ZeroMem (Utf8Buffer, sizeof (Utf8Buffer)); Utf8BufferSize = UCS2CharToUTF8 (Ucs2Str[Ucs2StrIndex], Utf8Buffer); Utf8StrLength += Utf8BufferSize; } Utf8Str = AllocateZeroPool (Utf8StrLength + 1); if (Utf8Str == NULL) { return EFI_OUT_OF_RESOURCES; } Utf8StrIndex = 0; for (Ucs2StrIndex = 0; Ucs2StrIndex < Ucs2StrLength; Ucs2StrIndex++) { ZeroMem (Utf8Buffer, sizeof (Utf8Buffer)); Utf8BufferSize = UCS2CharToUTF8 (Ucs2Str[Ucs2StrIndex], Utf8Buffer); CopyMem (Utf8Str + Utf8StrIndex, Utf8Buffer, Utf8BufferSize); Utf8StrIndex += Utf8BufferSize; } Utf8Str[Utf8StrIndex] = '\0'; *Utf8StrAddr = Utf8Str; return EFI_SUCCESS; } /** Convert a UTF8 encoded string to a UCS2 string. @param[in] Utf8Str The provided UTF8 encoded string. @param[out] Ucs2StrAddr The converted UCS2 string address. Caller is responsible for Free this string. @retval EFI_INVALID_PARAMETER The UTF8 encoded string is not valid to convert to UCS2 string. One or more parameters are invalid. @retval EFI_OUT_OF_RESOURCES System runs out of resources. @retval EFI_SUCCESS The UCS2 string has been converted. **/ EFI_STATUS UTF8StrToUCS2 ( IN CHAR8 *Utf8Str, OUT CHAR16 **Ucs2StrAddr ) { EFI_STATUS Status; UINTN Utf8StrIndex; UINTN Utf8StrLength; UINTN Ucs2StrIndex; UINT8 Utf8BufferSize; CHAR16 *Ucs2StrTemp; if ((Utf8Str == NULL) || (Ucs2StrAddr == NULL)) { return EFI_INVALID_PARAMETER; } // // It is not an Ascii string, calculate string length. // Utf8StrLength = 0; while (*(Utf8Str + Utf8StrLength) != '\0') { Utf8StrLength++; } // // UCS2 string shall not be longer than the UTF8 string. // Ucs2StrTemp = AllocateZeroPool ((Utf8StrLength + 1) * sizeof (CHAR16)); if (Ucs2StrTemp == NULL) { return EFI_OUT_OF_RESOURCES; } Utf8StrIndex = 0; Ucs2StrIndex = 0; while (Utf8Str[Utf8StrIndex] != '\0') { if ((CompareMem (Utf8Str + Utf8StrIndex, "\\u", 2) == 0) && (Utf8StrLength - Utf8StrIndex >= UNICODE_FORMAT_LEN)) { Status = GetUCS2CharByFormat (Utf8Str + Utf8StrIndex, Ucs2StrTemp + Ucs2StrIndex); if (!EFI_ERROR (Status)) { Utf8StrIndex += UNICODE_FORMAT_LEN; Ucs2StrIndex++; } else { StrCpyS (Ucs2StrTemp + Ucs2StrIndex, 3, L"\\u"); Ucs2StrIndex += 2; Utf8StrIndex += 2; } } else { Utf8BufferSize = GetUTF8SizeForUCS2 (Utf8Str + Utf8StrIndex); if ((Utf8BufferSize == 0) || (Utf8StrLength - Utf8StrIndex < Utf8BufferSize)) { FreePool (Ucs2StrTemp); return EFI_INVALID_PARAMETER; } Status = UTF8ToUCS2Char (Utf8Str + Utf8StrIndex, Ucs2StrTemp + Ucs2StrIndex); if (EFI_ERROR (Status)) { FreePool (Ucs2StrTemp); return EFI_INVALID_PARAMETER; } Ucs2StrIndex++; Utf8StrIndex += Utf8BufferSize; } } *Ucs2StrAddr = AllocateZeroPool ((Ucs2StrIndex + 1) * sizeof (CHAR16)); if (*Ucs2StrAddr == NULL) { FreePool (Ucs2StrTemp); return EFI_OUT_OF_RESOURCES; } StrCpyS (*Ucs2StrAddr, Ucs2StrIndex + 1, Ucs2StrTemp); *(*Ucs2StrAddr + Ucs2StrIndex) = L'\0'; FreePool (Ucs2StrTemp); return EFI_SUCCESS; }