LCOV - code coverage report
Current view: top level - source/lib - utf8.cpp (source / functions) Hit Total Coverage
Test: 0 A.D. test coverage report Lines: 74 79 93.7 %
Date: 2022-03-08 13:03:03 Functions: 7 7 100.0 %

          Line data    Source code
       1             : /* Copyright (C) 2017 Wildfire Games.
       2             :  *
       3             :  * Permission is hereby granted, free of charge, to any person obtaining
       4             :  * a copy of this software and associated documentation files (the
       5             :  * "Software"), to deal in the Software without restriction, including
       6             :  * without limitation the rights to use, copy, modify, merge, publish,
       7             :  * distribute, sublicense, and/or sell copies of the Software, and to
       8             :  * permit persons to whom the Software is furnished to do so, subject to
       9             :  * the following conditions:
      10             :  *
      11             :  * The above copyright notice and this permission notice shall be included
      12             :  * in all copies or substantial portions of the Software.
      13             :  *
      14             :  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
      15             :  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
      16             :  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
      17             :  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
      18             :  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
      19             :  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
      20             :  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
      21             :  */
      22             : 
      23             : #include "precompiled.h"
      24             : 
      25             : #include "lib/utf8.h"
      26             : 
      27             : static const StatusDefinition utf8StatusDefinitions[] = {
      28             :     { ERR::UTF8_SURROGATE, L"UTF-16 surrogate pairs aren't supported" },
      29             :     { ERR::UTF8_OUTSIDE_BMP, L"Code point outside BMP (> 0x10000)" },
      30             :     { ERR::UTF8_NONCHARACTER, L"Noncharacter (e.g. WEOF)" },
      31             :     { ERR::UTF8_INVALID_UTF8, L"Invalid UTF-8 sequence" }
      32             : };
      33             : STATUS_ADD_DEFINITIONS(utf8StatusDefinitions);
      34             : 
      35             : 
      36             : // adapted from http://unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c
      37             : // which bears the following notice:
      38             : /*
      39             : * Copyright 2001-2004 Unicode, Inc.
      40             : *
      41             : * Disclaimer
      42             : *
      43             : * This source code is provided as is by Unicode, Inc. No claims are
      44             : * made as to fitness for any particular purpose. No warranties of any
      45             : * kind are expressed or implied. The recipient agrees to determine
      46             : * applicability of information provided. If this file has been
      47             : * purchased on magnetic or optical media from Unicode, Inc., the
      48             : * sole remedy for any claim will be exchange of defective media
      49             : * within 90 days of receipt.
      50             : *
      51             : * Limitations on Rights to Redistribute This Code
      52             : *
      53             : * Unicode, Inc. hereby grants the right to freely use the information
      54             : * supplied in this file in the creation of products supporting the
      55             : * Unicode Standard, and to make copies of this file in any form
      56             : * for internal or external distribution as long as this notice
      57             : * remains attached.
      58             : */
      59             : 
      60             : // design rationale:
      61             : // - to cope with wchar_t differences between VC (UTF-16) and
      62             : //   GCC (UCS-4), we only allow codepoints in the BMP.
      63             : //   encoded UTF-8 sequences are therefore no longer than 3 bytes.
      64             : // - surrogates are disabled because variable-length strings
      65             : //   violate the purpose of using wchar_t instead of UTF-8.
      66             : // - replacing disallowed characters instead of aborting outright
      67             : //   avoids overly inconveniencing users and eases debugging.
      68             : 
      69             : // this implementation survives http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
      70             : 
      71             : // (must be unsigned to avoid sign extension)
      72             : typedef u8 UTF8;
      73             : typedef u32 UTF32;
      74             : 
      75             : 
      76             : // called from ReplaceIfInvalid and UTF8Codec::Decode
      77          16 : static UTF32 RaiseError(Status err, Status* perr)
      78             : {
      79          16 :     if(perr)    // caller wants return code, not warning dialog
      80             :     {
      81          16 :         if(*perr == INFO::OK)   // only return the first error (see header)
      82           8 :             *perr = err;
      83             :     }
      84             :     else
      85             :     {
      86           0 :         wchar_t error[200];
      87           0 :         debug_printf("UTF8 error: %s\n", utf8_from_wstring(StatusDescription(err, error, ARRAY_SIZE(error))).c_str());
      88             :     }
      89             : 
      90          16 :     return 0xFFFDul;    // replacement character
      91             : }
      92             : 
      93             : 
      94      257758 : static UTF32 ReplaceIfInvalid(UTF32 u, Status* err)
      95             : {
      96             :     // disallow surrogates
      97      257758 :     if(0xD800ul <= u && u <= 0xDFFFul)
      98           0 :         return RaiseError(ERR::UTF8_SURROGATE, err);
      99             :     // outside BMP (UTF-16 representation would require surrogates)
     100      257758 :     if(u > 0xFFFFul)
     101           0 :         return RaiseError(ERR::UTF8_OUTSIDE_BMP, err);
     102             :     // noncharacter (note: WEOF (0xFFFF) causes VC's swprintf to fail)
     103      257758 :     if(u == 0xFFFEul || u == 0xFFFFul || (0xFDD0ul <= u && u <= 0xFDEFul))
     104           0 :         return RaiseError(ERR::UTF8_NONCHARACTER, err);
     105             :     return u;
     106             : }
     107             : 
     108             : 
     109             : class UTF8Codec
     110             : {
     111             : public:
     112       21483 :     static void Encode(UTF32 u, UTF8*& dstPos)
     113             :     {
     114       21483 :         switch (Size(u))
     115             :         {
     116       21434 :         case 1:
     117       21434 :             *dstPos++ = UTF8(u);
     118       21434 :             break;
     119          34 :         case 2:
     120          34 :             *dstPos++ = UTF8((u >> 6) | 0xC0);
     121          34 :             *dstPos++ = UTF8((u | 0x80u) & 0xBFu);
     122          34 :             break;
     123          15 :         case 3:
     124          15 :             *dstPos++ = UTF8((u >> 12) | 0xE0);
     125          15 :             *dstPos++ = UTF8(((u >> 6) | 0x80u) & 0xBFu);
     126          15 :             *dstPos++ = UTF8((u | 0x80u) & 0xBFu);
     127          15 :             break;
     128             :         }
     129       21483 :     }
     130             : 
     131             :     // @return decoded scalar, or replacementCharacter on error
     132      236275 :     static UTF32 Decode(const UTF8*& srcPos, const UTF8* const srcEnd, Status* err)
     133             :     {
     134      472550 :         const size_t size = SizeFromFirstByte(*srcPos);
     135      236275 :         if(!IsValid(srcPos, size, srcEnd))
     136             :         {
     137          16 :             srcPos += 1;    // only skip the offending byte (increases chances of resynchronization)
     138          16 :             return RaiseError(ERR::UTF8_INVALID_UTF8, err);
     139             :         }
     140             : 
     141             :         UTF32 u = 0;
     142      236373 :         for(size_t i = 0; i < size-1; i++)
     143             :         {
     144         114 :             u += UTF32(*srcPos++);
     145         114 :             u <<= 6;
     146             :         }
     147      236259 :         u += UTF32(*srcPos++);
     148             : 
     149      236259 :         static const UTF32 offsets[1+4] = { 0, 0x00000000ul, 0x00003080ul, 0x000E2080ul, 0x03C82080UL };
     150      236259 :         u -= offsets[size];
     151      236259 :         return u;
     152             :     }
     153             : 
     154             : private:
     155             :     static inline size_t Size(UTF32 u)
     156             :     {
     157       21483 :         if(u < 0x80)
     158             :             return 1;
     159          49 :         if(u < 0x800)
     160          34 :             return 2;
     161             :         // ReplaceIfInvalid ensures > 3 byte encodings are never used.
     162             :         return 3;
     163             :     }
     164             : 
     165             :     static inline size_t SizeFromFirstByte(UTF8 firstByte)
     166             :     {
     167      236275 :         if(firstByte < 0xC0)
     168             :             return 1;
     169          83 :         if(firstByte < 0xE0)
     170             :             return 2;
     171          47 :         if(firstByte < 0xF0)
     172          45 :             return 3;
     173             :         // IsValid rejects firstByte values that would cause > 4 byte encodings.
     174             :         return 4;
     175             :     }
     176             : 
     177             :     // c.f. Unicode 3.1 Table 3-7
     178             :     // @param size obtained via SizeFromFirstByte (our caller also uses it)
     179      236275 :     static bool IsValid(const UTF8* const src, size_t size, const UTF8* const srcEnd)
     180             :     {
     181      236275 :         if(src+size > srcEnd)    // not enough data
     182             :             return false;
     183             : 
     184      236271 :         if(src[0] < 0x80)
     185             :             return true;
     186          87 :         if(!(0xC2 <= src[0] && src[0] <= 0xF4))
     187             :             return false;
     188             : 
     189             :         // special cases (stricter than the loop)
     190          77 :         if(src[0] == 0xE0 && src[1] < 0xA0)
     191             :             return false;
     192          77 :         if(src[0] == 0xED && src[1] > 0x9F)
     193             :             return false;
     194          77 :         if(src[0] == 0xF0 && src[1] < 0x90)
     195             :             return false;
     196          77 :         if(src[0] == 0xF4 && src[1] > 0x8F)
     197             :             return false;
     198             : 
     199         193 :         for(size_t i = 1; i < size; i++)
     200             :         {
     201         118 :             if(!(0x80 <= src[i] && src[i] <= 0xBF))
     202             :                 return false;
     203             :         }
     204             : 
     205             :         return true;
     206             :     }
     207             : };
     208             : 
     209             : 
     210             : //-----------------------------------------------------------------------------
     211             : 
     212        1650 : std::string utf8_from_wstring(const std::wstring& src, Status* err)
     213             : {
     214        1650 :     if(err)
     215        1609 :         *err = INFO::OK;
     216             : 
     217        3300 :     std::string dst(src.size()*3+1, ' ');   // see UTF8Codec::Size; +1 ensures &dst[0] is valid
     218        1650 :     UTF8* dstPos = (UTF8*)&dst[0];
     219       23133 :     for(size_t i = 0; i < src.size(); i++)
     220             :     {
     221       42966 :         const UTF32 u = ReplaceIfInvalid(UTF32(src[i]), err);
     222       21483 :         UTF8Codec::Encode(u, dstPos);
     223             :     }
     224        3300 :     dst.resize(dstPos - (UTF8*)&dst[0]);
     225        1650 :     return dst;
     226             : }
     227             : 
     228             : 
     229        3594 : std::wstring wstring_from_utf8(const std::string& src, Status* err)
     230             : {
     231        3594 :     if(err)
     232        2687 :         *err = INFO::OK;
     233             : 
     234        3594 :     std::wstring dst;
     235        3594 :     dst.reserve(src.size());
     236        3594 :     const UTF8* srcPos = (const UTF8*)src.data();
     237        3594 :     const UTF8* const srcEnd = srcPos + src.size();
     238      239869 :     while(srcPos < srcEnd)
     239             :     {
     240      236275 :         const UTF32 u = UTF8Codec::Decode(srcPos, srcEnd, err);
     241      236275 :         dst.push_back((wchar_t)ReplaceIfInvalid(u, err));
     242             :     }
     243        3594 :     return dst;
     244             : }

Generated by: LCOV version 1.13