Line data Source code
1 : /* Copyright (C) 2017 Wildfire Games.
2 : *
3 : * Permission is hereby granted, free of charge, to any person obtaining
4 : * a copy of this software and associated documentation files (the
5 : * "Software"), to deal in the Software without restriction, including
6 : * without limitation the rights to use, copy, modify, merge, publish,
7 : * distribute, sublicense, and/or sell copies of the Software, and to
8 : * permit persons to whom the Software is furnished to do so, subject to
9 : * the following conditions:
10 : *
11 : * The above copyright notice and this permission notice shall be included
12 : * in all copies or substantial portions of the Software.
13 : *
14 : * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15 : * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16 : * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
17 : * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
18 : * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
19 : * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
20 : * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21 : */
22 :
23 : #include "precompiled.h"
24 :
25 : #include "lib/utf8.h"
26 :
27 : static const StatusDefinition utf8StatusDefinitions[] = {
28 : { ERR::UTF8_SURROGATE, L"UTF-16 surrogate pairs aren't supported" },
29 : { ERR::UTF8_OUTSIDE_BMP, L"Code point outside BMP (> 0x10000)" },
30 : { ERR::UTF8_NONCHARACTER, L"Noncharacter (e.g. WEOF)" },
31 : { ERR::UTF8_INVALID_UTF8, L"Invalid UTF-8 sequence" }
32 : };
33 1 : STATUS_ADD_DEFINITIONS(utf8StatusDefinitions);
34 :
35 :
36 : // adapted from http://unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c
37 : // which bears the following notice:
38 : /*
39 : * Copyright 2001-2004 Unicode, Inc.
40 : *
41 : * Disclaimer
42 : *
43 : * This source code is provided as is by Unicode, Inc. No claims are
44 : * made as to fitness for any particular purpose. No warranties of any
45 : * kind are expressed or implied. The recipient agrees to determine
46 : * applicability of information provided. If this file has been
47 : * purchased on magnetic or optical media from Unicode, Inc., the
48 : * sole remedy for any claim will be exchange of defective media
49 : * within 90 days of receipt.
50 : *
51 : * Limitations on Rights to Redistribute This Code
52 : *
53 : * Unicode, Inc. hereby grants the right to freely use the information
54 : * supplied in this file in the creation of products supporting the
55 : * Unicode Standard, and to make copies of this file in any form
56 : * for internal or external distribution as long as this notice
57 : * remains attached.
58 : */
59 :
60 : // design rationale:
61 : // - to cope with wchar_t differences between VC (UTF-16) and
62 : // GCC (UCS-4), we only allow codepoints in the BMP.
63 : // encoded UTF-8 sequences are therefore no longer than 3 bytes.
64 : // - surrogates are disabled because variable-length strings
65 : // violate the purpose of using wchar_t instead of UTF-8.
66 : // - replacing disallowed characters instead of aborting outright
67 : // avoids overly inconveniencing users and eases debugging.
68 :
69 : // this implementation survives http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
70 :
71 : // (must be unsigned to avoid sign extension)
72 : typedef u8 UTF8;
73 : typedef u32 UTF32;
74 :
75 :
76 : // called from ReplaceIfInvalid and UTF8Codec::Decode
77 16 : static UTF32 RaiseError(Status err, Status* perr)
78 : {
79 16 : if(perr) // caller wants return code, not warning dialog
80 : {
81 16 : if(*perr == INFO::OK) // only return the first error (see header)
82 8 : *perr = err;
83 : }
84 : else
85 : {
86 : wchar_t error[200];
87 0 : debug_printf("UTF8 error: %s\n", utf8_from_wstring(StatusDescription(err, error, ARRAY_SIZE(error))).c_str());
88 : }
89 :
90 16 : return 0xFFFDul; // replacement character
91 : }
92 :
93 :
94 266981 : static UTF32 ReplaceIfInvalid(UTF32 u, Status* err)
95 : {
96 : // disallow surrogates
97 266981 : if(0xD800ul <= u && u <= 0xDFFFul)
98 0 : return RaiseError(ERR::UTF8_SURROGATE, err);
99 : // outside BMP (UTF-16 representation would require surrogates)
100 266981 : if(u > 0xFFFFul)
101 0 : return RaiseError(ERR::UTF8_OUTSIDE_BMP, err);
102 : // noncharacter (note: WEOF (0xFFFF) causes VC's swprintf to fail)
103 266981 : if(u == 0xFFFEul || u == 0xFFFFul || (0xFDD0ul <= u && u <= 0xFDEFul))
104 0 : return RaiseError(ERR::UTF8_NONCHARACTER, err);
105 266981 : return u;
106 : }
107 :
108 :
109 : class UTF8Codec
110 : {
111 : public:
112 22757 : static void Encode(UTF32 u, UTF8*& dstPos)
113 : {
114 22757 : switch (Size(u))
115 : {
116 22708 : case 1:
117 22708 : *dstPos++ = UTF8(u);
118 22708 : break;
119 34 : case 2:
120 34 : *dstPos++ = UTF8((u >> 6) | 0xC0);
121 34 : *dstPos++ = UTF8((u | 0x80u) & 0xBFu);
122 34 : break;
123 15 : case 3:
124 15 : *dstPos++ = UTF8((u >> 12) | 0xE0);
125 15 : *dstPos++ = UTF8(((u >> 6) | 0x80u) & 0xBFu);
126 15 : *dstPos++ = UTF8((u | 0x80u) & 0xBFu);
127 15 : break;
128 : }
129 22757 : }
130 :
131 : // @return decoded scalar, or replacementCharacter on error
132 244224 : static UTF32 Decode(const UTF8*& srcPos, const UTF8* const srcEnd, Status* err)
133 : {
134 244224 : const size_t size = SizeFromFirstByte(*srcPos);
135 244224 : if(!IsValid(srcPos, size, srcEnd))
136 : {
137 16 : srcPos += 1; // only skip the offending byte (increases chances of resynchronization)
138 16 : return RaiseError(ERR::UTF8_INVALID_UTF8, err);
139 : }
140 :
141 244208 : UTF32 u = 0;
142 244334 : for(size_t i = 0; i < size-1; i++)
143 : {
144 126 : u += UTF32(*srcPos++);
145 126 : u <<= 6;
146 : }
147 244208 : u += UTF32(*srcPos++);
148 :
149 : static const UTF32 offsets[1+4] = { 0, 0x00000000ul, 0x00003080ul, 0x000E2080ul, 0x03C82080UL };
150 244208 : u -= offsets[size];
151 244208 : return u;
152 : }
153 :
154 : private:
155 22757 : static inline size_t Size(UTF32 u)
156 : {
157 22757 : if(u < 0x80)
158 22708 : return 1;
159 49 : if(u < 0x800)
160 34 : return 2;
161 : // ReplaceIfInvalid ensures > 3 byte encodings are never used.
162 15 : return 3;
163 : }
164 :
165 244224 : static inline size_t SizeFromFirstByte(UTF8 firstByte)
166 : {
167 244224 : if(firstByte < 0xC0)
168 244135 : return 1;
169 89 : if(firstByte < 0xE0)
170 36 : return 2;
171 53 : if(firstByte < 0xF0)
172 51 : return 3;
173 : // IsValid rejects firstByte values that would cause > 4 byte encodings.
174 2 : return 4;
175 : }
176 :
177 : // c.f. Unicode 3.1 Table 3-7
178 : // @param size obtained via SizeFromFirstByte (our caller also uses it)
179 244224 : static bool IsValid(const UTF8* const src, size_t size, const UTF8* const srcEnd)
180 : {
181 244224 : if(src+size > srcEnd) // not enough data
182 4 : return false;
183 :
184 244220 : if(src[0] < 0x80)
185 244127 : return true;
186 93 : if(!(0xC2 <= src[0] && src[0] <= 0xF4))
187 10 : return false;
188 :
189 : // special cases (stricter than the loop)
190 83 : if(src[0] == 0xE0 && src[1] < 0xA0)
191 0 : return false;
192 83 : if(src[0] == 0xED && src[1] > 0x9F)
193 0 : return false;
194 83 : if(src[0] == 0xF0 && src[1] < 0x90)
195 0 : return false;
196 83 : if(src[0] == 0xF4 && src[1] > 0x8F)
197 0 : return false;
198 :
199 211 : for(size_t i = 1; i < size; i++)
200 : {
201 130 : if(!(0x80 <= src[i] && src[i] <= 0xBF))
202 2 : return false;
203 : }
204 :
205 81 : return true;
206 : }
207 : };
208 :
209 :
210 : //-----------------------------------------------------------------------------
211 :
212 1827 : std::string utf8_from_wstring(const std::wstring& src, Status* err)
213 : {
214 1827 : if(err)
215 1780 : *err = INFO::OK;
216 :
217 1827 : std::string dst(src.size()*3+1, ' '); // see UTF8Codec::Size; +1 ensures &dst[0] is valid
218 1827 : UTF8* dstPos = (UTF8*)&dst[0];
219 24584 : for(size_t i = 0; i < src.size(); i++)
220 : {
221 22757 : const UTF32 u = ReplaceIfInvalid(UTF32(src[i]), err);
222 22757 : UTF8Codec::Encode(u, dstPos);
223 : }
224 1827 : dst.resize(dstPos - (UTF8*)&dst[0]);
225 1827 : return dst;
226 : }
227 :
228 :
229 3848 : std::wstring wstring_from_utf8(const std::string& src, Status* err)
230 : {
231 3848 : if(err)
232 2881 : *err = INFO::OK;
233 :
234 3848 : std::wstring dst;
235 3848 : dst.reserve(src.size());
236 3848 : const UTF8* srcPos = (const UTF8*)src.data();
237 3848 : const UTF8* const srcEnd = srcPos + src.size();
238 492296 : while(srcPos < srcEnd)
239 : {
240 244224 : const UTF32 u = UTF8Codec::Decode(srcPos, srcEnd, err);
241 244224 : dst.push_back((wchar_t)ReplaceIfInvalid(u, err));
242 : }
243 3848 : return dst;
244 3 : }
|