LCOV - code coverage report
Current view: top level - source/lib - utf8.cpp (source / functions) Hit Total Coverage
Test: 0 A.D. test coverage report Lines: 89 97 91.8 %
Date: 2023-01-19 00:18:29 Functions: 11 11 100.0 %

          Line data    Source code
       1             : /* Copyright (C) 2017 Wildfire Games.
       2             :  *
       3             :  * Permission is hereby granted, free of charge, to any person obtaining
       4             :  * a copy of this software and associated documentation files (the
       5             :  * "Software"), to deal in the Software without restriction, including
       6             :  * without limitation the rights to use, copy, modify, merge, publish,
       7             :  * distribute, sublicense, and/or sell copies of the Software, and to
       8             :  * permit persons to whom the Software is furnished to do so, subject to
       9             :  * the following conditions:
      10             :  *
      11             :  * The above copyright notice and this permission notice shall be included
      12             :  * in all copies or substantial portions of the Software.
      13             :  *
      14             :  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
      15             :  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
      16             :  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
      17             :  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
      18             :  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
      19             :  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
      20             :  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
      21             :  */
      22             : 
      23             : #include "precompiled.h"
      24             : 
      25             : #include "lib/utf8.h"
      26             : 
      27             : static const StatusDefinition utf8StatusDefinitions[] = {
      28             :     { ERR::UTF8_SURROGATE, L"UTF-16 surrogate pairs aren't supported" },
      29             :     { ERR::UTF8_OUTSIDE_BMP, L"Code point outside BMP (> 0x10000)" },
      30             :     { ERR::UTF8_NONCHARACTER, L"Noncharacter (e.g. WEOF)" },
      31             :     { ERR::UTF8_INVALID_UTF8, L"Invalid UTF-8 sequence" }
      32             : };
      33           1 : STATUS_ADD_DEFINITIONS(utf8StatusDefinitions);
      34             : 
      35             : 
      36             : // adapted from http://unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c
      37             : // which bears the following notice:
      38             : /*
      39             : * Copyright 2001-2004 Unicode, Inc.
      40             : *
      41             : * Disclaimer
      42             : *
      43             : * This source code is provided as is by Unicode, Inc. No claims are
      44             : * made as to fitness for any particular purpose. No warranties of any
      45             : * kind are expressed or implied. The recipient agrees to determine
      46             : * applicability of information provided. If this file has been
      47             : * purchased on magnetic or optical media from Unicode, Inc., the
      48             : * sole remedy for any claim will be exchange of defective media
      49             : * within 90 days of receipt.
      50             : *
      51             : * Limitations on Rights to Redistribute This Code
      52             : *
      53             : * Unicode, Inc. hereby grants the right to freely use the information
      54             : * supplied in this file in the creation of products supporting the
      55             : * Unicode Standard, and to make copies of this file in any form
      56             : * for internal or external distribution as long as this notice
      57             : * remains attached.
      58             : */
      59             : 
      60             : // design rationale:
      61             : // - to cope with wchar_t differences between VC (UTF-16) and
      62             : //   GCC (UCS-4), we only allow codepoints in the BMP.
      63             : //   encoded UTF-8 sequences are therefore no longer than 3 bytes.
      64             : // - surrogates are disabled because variable-length strings
      65             : //   violate the purpose of using wchar_t instead of UTF-8.
      66             : // - replacing disallowed characters instead of aborting outright
      67             : //   avoids overly inconveniencing users and eases debugging.
      68             : 
      69             : // this implementation survives http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
      70             : 
      71             : // (must be unsigned to avoid sign extension)
      72             : typedef u8 UTF8;
      73             : typedef u32 UTF32;
      74             : 
      75             : 
      76             : // called from ReplaceIfInvalid and UTF8Codec::Decode
      77          16 : static UTF32 RaiseError(Status err, Status* perr)
      78             : {
      79          16 :     if(perr)    // caller wants return code, not warning dialog
      80             :     {
      81          16 :         if(*perr == INFO::OK)   // only return the first error (see header)
      82           8 :             *perr = err;
      83             :     }
      84             :     else
      85             :     {
      86             :         wchar_t error[200];
      87           0 :         debug_printf("UTF8 error: %s\n", utf8_from_wstring(StatusDescription(err, error, ARRAY_SIZE(error))).c_str());
      88             :     }
      89             : 
      90          16 :     return 0xFFFDul;    // replacement character
      91             : }
      92             : 
      93             : 
      94      266981 : static UTF32 ReplaceIfInvalid(UTF32 u, Status* err)
      95             : {
      96             :     // disallow surrogates
      97      266981 :     if(0xD800ul <= u && u <= 0xDFFFul)
      98           0 :         return RaiseError(ERR::UTF8_SURROGATE, err);
      99             :     // outside BMP (UTF-16 representation would require surrogates)
     100      266981 :     if(u > 0xFFFFul)
     101           0 :         return RaiseError(ERR::UTF8_OUTSIDE_BMP, err);
     102             :     // noncharacter (note: WEOF (0xFFFF) causes VC's swprintf to fail)
     103      266981 :     if(u == 0xFFFEul || u == 0xFFFFul || (0xFDD0ul <= u && u <= 0xFDEFul))
     104           0 :         return RaiseError(ERR::UTF8_NONCHARACTER, err);
     105      266981 :     return u;
     106             : }
     107             : 
     108             : 
     109             : class UTF8Codec
     110             : {
     111             : public:
     112       22757 :     static void Encode(UTF32 u, UTF8*& dstPos)
     113             :     {
     114       22757 :         switch (Size(u))
     115             :         {
     116       22708 :         case 1:
     117       22708 :             *dstPos++ = UTF8(u);
     118       22708 :             break;
     119          34 :         case 2:
     120          34 :             *dstPos++ = UTF8((u >> 6) | 0xC0);
     121          34 :             *dstPos++ = UTF8((u | 0x80u) & 0xBFu);
     122          34 :             break;
     123          15 :         case 3:
     124          15 :             *dstPos++ = UTF8((u >> 12) | 0xE0);
     125          15 :             *dstPos++ = UTF8(((u >> 6) | 0x80u) & 0xBFu);
     126          15 :             *dstPos++ = UTF8((u | 0x80u) & 0xBFu);
     127          15 :             break;
     128             :         }
     129       22757 :     }
     130             : 
     131             :     // @return decoded scalar, or replacementCharacter on error
     132      244224 :     static UTF32 Decode(const UTF8*& srcPos, const UTF8* const srcEnd, Status* err)
     133             :     {
     134      244224 :         const size_t size = SizeFromFirstByte(*srcPos);
     135      244224 :         if(!IsValid(srcPos, size, srcEnd))
     136             :         {
     137          16 :             srcPos += 1;    // only skip the offending byte (increases chances of resynchronization)
     138          16 :             return RaiseError(ERR::UTF8_INVALID_UTF8, err);
     139             :         }
     140             : 
     141      244208 :         UTF32 u = 0;
     142      244334 :         for(size_t i = 0; i < size-1; i++)
     143             :         {
     144         126 :             u += UTF32(*srcPos++);
     145         126 :             u <<= 6;
     146             :         }
     147      244208 :         u += UTF32(*srcPos++);
     148             : 
     149             :         static const UTF32 offsets[1+4] = { 0, 0x00000000ul, 0x00003080ul, 0x000E2080ul, 0x03C82080UL };
     150      244208 :         u -= offsets[size];
     151      244208 :         return u;
     152             :     }
     153             : 
     154             : private:
     155       22757 :     static inline size_t Size(UTF32 u)
     156             :     {
     157       22757 :         if(u < 0x80)
     158       22708 :             return 1;
     159          49 :         if(u < 0x800)
     160          34 :             return 2;
     161             :         // ReplaceIfInvalid ensures > 3 byte encodings are never used.
     162          15 :         return 3;
     163             :     }
     164             : 
     165      244224 :     static inline size_t SizeFromFirstByte(UTF8 firstByte)
     166             :     {
     167      244224 :         if(firstByte < 0xC0)
     168      244135 :             return 1;
     169          89 :         if(firstByte < 0xE0)
     170          36 :             return 2;
     171          53 :         if(firstByte < 0xF0)
     172          51 :             return 3;
     173             :         // IsValid rejects firstByte values that would cause > 4 byte encodings.
     174           2 :         return 4;
     175             :     }
     176             : 
     177             :     // c.f. Unicode 3.1 Table 3-7
     178             :     // @param size obtained via SizeFromFirstByte (our caller also uses it)
     179      244224 :     static bool IsValid(const UTF8* const src, size_t size, const UTF8* const srcEnd)
     180             :     {
     181      244224 :         if(src+size > srcEnd)    // not enough data
     182           4 :             return false;
     183             : 
     184      244220 :         if(src[0] < 0x80)
     185      244127 :             return true;
     186          93 :         if(!(0xC2 <= src[0] && src[0] <= 0xF4))
     187          10 :             return false;
     188             : 
     189             :         // special cases (stricter than the loop)
     190          83 :         if(src[0] == 0xE0 && src[1] < 0xA0)
     191           0 :             return false;
     192          83 :         if(src[0] == 0xED && src[1] > 0x9F)
     193           0 :             return false;
     194          83 :         if(src[0] == 0xF0 && src[1] < 0x90)
     195           0 :             return false;
     196          83 :         if(src[0] == 0xF4 && src[1] > 0x8F)
     197           0 :             return false;
     198             : 
     199         211 :         for(size_t i = 1; i < size; i++)
     200             :         {
     201         130 :             if(!(0x80 <= src[i] && src[i] <= 0xBF))
     202           2 :                 return false;
     203             :         }
     204             : 
     205          81 :         return true;
     206             :     }
     207             : };
     208             : 
     209             : 
     210             : //-----------------------------------------------------------------------------
     211             : 
     212        1827 : std::string utf8_from_wstring(const std::wstring& src, Status* err)
     213             : {
     214        1827 :     if(err)
     215        1780 :         *err = INFO::OK;
     216             : 
     217        1827 :     std::string dst(src.size()*3+1, ' ');   // see UTF8Codec::Size; +1 ensures &dst[0] is valid
     218        1827 :     UTF8* dstPos = (UTF8*)&dst[0];
     219       24584 :     for(size_t i = 0; i < src.size(); i++)
     220             :     {
     221       22757 :         const UTF32 u = ReplaceIfInvalid(UTF32(src[i]), err);
     222       22757 :         UTF8Codec::Encode(u, dstPos);
     223             :     }
     224        1827 :     dst.resize(dstPos - (UTF8*)&dst[0]);
     225        1827 :     return dst;
     226             : }
     227             : 
     228             : 
     229        3848 : std::wstring wstring_from_utf8(const std::string& src, Status* err)
     230             : {
     231        3848 :     if(err)
     232        2881 :         *err = INFO::OK;
     233             : 
     234        3848 :     std::wstring dst;
     235        3848 :     dst.reserve(src.size());
     236        3848 :     const UTF8* srcPos = (const UTF8*)src.data();
     237        3848 :     const UTF8* const srcEnd = srcPos + src.size();
     238      492296 :     while(srcPos < srcEnd)
     239             :     {
     240      244224 :         const UTF32 u = UTF8Codec::Decode(srcPos, srcEnd, err);
     241      244224 :         dst.push_back((wchar_t)ReplaceIfInvalid(u, err));
     242             :     }
     243        3848 :     return dst;
     244           3 : }

Generated by: LCOV version 1.13