Line data Source code
1 : // tinygettext - A gettext replacement that works directly on .po files
2 : // Copyright (c) 2009 Ingo Ruhnke <grumbel@gmail.com>
3 : //
4 : // This software is provided 'as-is', without any express or implied
5 : // warranty. In no event will the authors be held liable for any damages
6 : // arising from the use of this software.
7 : //
8 : // Permission is granted to anyone to use this software for any purpose,
9 : // including commercial applications, and to alter it and redistribute it
10 : // freely, subject to the following restrictions:
11 : //
12 : // 1. The origin of this software must not be misrepresented; you must not
13 : // claim that you wrote the original software. If you use this software
14 : // in a product, an acknowledgement in the product documentation would be
15 : // appreciated but is not required.
16 : // 2. Altered source versions must be plainly marked as such, and must not be
17 : // misrepresented as being the original software.
18 : // 3. This notice may not be removed or altered from any source distribution.
19 :
20 : #include "precompiled.h"
21 :
22 : #include "tinygettext/po_parser.hpp"
23 :
24 : #include <iostream>
25 : #include <ctype.h>
26 : #include <string>
27 : #include <istream>
28 : #include <string.h>
29 : #include <unordered_map>
30 : #include <stdlib.h>
31 :
32 : #include "tinygettext/language.hpp"
33 : #include "tinygettext/log_stream.hpp"
34 : #include "tinygettext/iconv.hpp"
35 : #include "tinygettext/dictionary.hpp"
36 : #include "tinygettext/plural_forms.hpp"
37 :
38 : namespace tinygettext {
39 :
40 : bool POParser::pedantic = true; // when true, get_string() also emits its formatting warnings (keyword/string spacing, leading whitespace)
41 :
42 : void
43 0 : POParser::parse(const std::string& filename, std::istream& in, Dictionary& dict)
44 : {
45 0 : POParser parser(filename, in, dict);
46 0 : parser.parse();
47 0 : }
48 :
/** Empty tag type thrown by POParser::error() and caught in
    POParser::parse() so that parsing can resume with the next entry. */
class POParserError
{
};
50 :
51 0 : POParser::POParser(const std::string& filename_, std::istream& in_, Dictionary& dict_, bool use_fuzzy_) :
52 : filename(filename_),
53 : in(in_),
54 : dict(dict_),
55 : use_fuzzy(use_fuzzy_),
56 : running(false),
57 : eof(false),
58 : big5(false),
59 : line_number(0),
60 : current_line(),
61 0 : conv()
62 : {
63 0 : }
64 :
65 0 : POParser::~POParser()
66 : {
67 0 : }
68 :
69 : void
70 0 : POParser::warning(const std::string& msg)
71 : {
72 0 : log_warning << filename << ":" << line_number << ": warning: " << msg << ": " << current_line << std::endl;
73 : //log_warning << "Line: " << current_line << std::endl;
74 0 : }
75 :
76 : void
77 0 : POParser::error(const std::string& msg)
78 : {
79 0 : log_error << filename << ":" << line_number << ": error: " << msg << ": " << current_line << std::endl;
80 :
81 : // Try to recover from an error by searching for start of another entry
82 0 : do
83 0 : next_line();
84 0 : while(!eof && !is_empty_line());
85 :
86 0 : throw POParserError();
87 : }
88 :
89 : void
90 0 : POParser::next_line()
91 : {
92 0 : line_number += 1;
93 0 : if (!std::getline(in, current_line))
94 0 : eof = true;
95 0 : }
96 :
97 : void
98 0 : POParser::get_string_line(std::ostringstream& out, size_t skip)
99 : {
100 0 : if (skip+1 >= static_cast<unsigned int>(current_line.size()))
101 0 : error("unexpected end of line");
102 :
103 0 : if (current_line[skip] != '"')
104 0 : error("expected start of string '\"'");
105 :
106 : std::string::size_type i;
107 0 : for(i = skip+1; current_line[i] != '\"'; ++i)
108 : {
109 0 : if (big5 && static_cast<unsigned char>(current_line[i]) >= 0x81 && static_cast<unsigned char>(current_line[i]) <= 0xfe)
110 : {
111 0 : out << current_line[i];
112 :
113 0 : i += 1;
114 :
115 0 : if (i >= current_line.size())
116 0 : error("invalid big5 encoding");
117 :
118 0 : out << current_line[i];
119 : }
120 0 : else if (i >= current_line.size())
121 : {
122 0 : error("unexpected end of string");
123 : }
124 0 : else if (current_line[i] == '\\')
125 : {
126 0 : i += 1;
127 :
128 0 : if (i >= current_line.size())
129 0 : error("unexpected end of string in handling '\\'");
130 :
131 0 : switch (current_line[i])
132 : {
133 0 : case 'a': out << '\a'; break;
134 0 : case 'b': out << '\b'; break;
135 0 : case 'v': out << '\v'; break;
136 0 : case 'n': out << '\n'; break;
137 0 : case 't': out << '\t'; break;
138 0 : case 'r': out << '\r'; break;
139 0 : case '"': out << '"'; break;
140 0 : case '\\': out << '\\'; break;
141 0 : default:
142 0 : std::ostringstream err;
143 0 : err << "unhandled escape '\\" << current_line[i] << "'";
144 0 : warning(err.str());
145 :
146 0 : out << current_line[i-1] << current_line[i];
147 0 : break;
148 : }
149 : }
150 : else
151 : {
152 0 : out << current_line[i];
153 : }
154 : }
155 :
156 : // process trailing garbage in line and warn if there is any
157 0 : for(i = i+1; i < current_line.size(); ++i)
158 0 : if (!isspace(current_line[i]))
159 : {
160 0 : warning("unexpected garbage after string ignoren");
161 0 : break;
162 : }
163 0 : }
164 :
165 : std::string
166 0 : POParser::get_string(unsigned int skip)
167 : {
168 0 : std::ostringstream out;
169 :
170 0 : if (skip+1 >= static_cast<unsigned int>(current_line.size()))
171 0 : error("unexpected end of line");
172 :
173 0 : if (current_line[skip] == ' ' && current_line[skip+1] == '"')
174 : {
175 0 : get_string_line(out, skip+1);
176 : }
177 : else
178 : {
179 0 : if (pedantic)
180 0 : warning("keyword and string must be seperated by a single space");
181 :
182 : for(;;)
183 : {
184 0 : if (skip >= static_cast<unsigned int>(current_line.size()))
185 0 : error("unexpected end of line");
186 0 : else if (current_line[skip] == '\"')
187 : {
188 0 : get_string_line(out, skip);
189 0 : break;
190 : }
191 0 : else if (!isspace(current_line[skip]))
192 : {
193 0 : error("string must start with '\"'");
194 : }
195 : else
196 : {
197 : // skip space
198 : }
199 :
200 0 : skip += 1;
201 : }
202 : }
203 :
204 0 : next:
205 0 : next_line();
206 0 : for(std::string::size_type i = 0; i < current_line.size(); ++i)
207 : {
208 0 : if (current_line[i] == '"')
209 : {
210 0 : if (i == 1)
211 0 : if (pedantic)
212 0 : warning("leading whitespace before string");
213 :
214 0 : get_string_line(out, i);
215 0 : goto next;
216 : }
217 0 : else if (isspace(current_line[i]))
218 : {
219 : // skip
220 : }
221 : else
222 : {
223 0 : break;
224 : }
225 : }
226 :
227 0 : return out.str();
228 : }
229 :
/** Returns true when @p lhs begins with @p rhs (an empty @p rhs always
    matches). */
static bool has_prefix(const std::string& lhs, const std::string& rhs)
{
  const std::string::size_type wanted = rhs.length();
  return lhs.length() >= wanted && lhs.compare(0, wanted, rhs) == 0;
}
237 :
238 : void
239 0 : POParser::parse_header(const std::string& header)
240 : {
241 0 : std::string from_charset;
242 0 : std::string::size_type start = 0;
243 0 : for(std::string::size_type i = 0; i < header.length(); ++i)
244 : {
245 0 : if (header[i] == '\n')
246 : {
247 0 : std::string line = header.substr(start, i - start);
248 :
249 0 : if (has_prefix(line, "Content-Type:"))
250 : {
251 : // from_charset = line.substr(len);
252 0 : size_t len = strlen("Content-Type: text/plain; charset=");
253 0 : if (line.compare(0, len, "Content-Type: text/plain; charset=") == 0)
254 : {
255 0 : from_charset = line.substr(len);
256 :
257 0 : for(std::string::iterator ch = from_charset.begin(); ch != from_charset.end(); ++ch)
258 0 : *ch = static_cast<char>(toupper(*ch));
259 : }
260 : else
261 : {
262 0 : warning("malformed Content-Type header");
263 : }
264 : }
265 0 : else if (has_prefix(line, "Plural-Forms:"))
266 : {
267 0 : PluralForms plural_forms = PluralForms::from_string(line);
268 0 : if (!plural_forms)
269 : {
270 0 : warning("unknown Plural-Forms given");
271 : }
272 : else
273 : {
274 0 : if (!dict.get_plural_forms())
275 : {
276 0 : dict.set_plural_forms(plural_forms);
277 : }
278 : else
279 : {
280 0 : if (dict.get_plural_forms() != plural_forms)
281 : {
282 0 : warning("Plural-Forms missmatch between .po file and dictionary");
283 : }
284 : }
285 : }
286 : }
287 0 : start = i+1;
288 : }
289 : }
290 :
291 0 : if (from_charset.empty() || from_charset == "CHARSET")
292 : {
293 0 : warning("charset not specified for .po, fallback to utf-8");
294 0 : from_charset = "UTF-8";
295 : }
296 0 : else if (from_charset == "BIG5")
297 : {
298 0 : big5 = true;
299 : }
300 :
301 0 : conv.set_charsets(from_charset, dict.get_charset());
302 0 : }
303 :
304 : bool
305 0 : POParser::is_empty_line()
306 : {
307 0 : if (current_line.empty())
308 : {
309 0 : return true;
310 : }
311 0 : else if (current_line[0] == '#')
312 : { // handle comments as empty lines
313 0 : return (current_line.size() == 1 || (current_line.size() >= 2 && isspace(current_line[1])));
314 : }
315 : else
316 : {
317 0 : for(std::string::iterator i = current_line.begin(); i != current_line.end(); ++i)
318 : {
319 0 : if (!isspace(*i))
320 0 : return false;
321 : }
322 : }
323 0 : return true;
324 : }
325 :
326 : bool
327 0 : POParser::prefix(const char* prefix_str)
328 : {
329 0 : return current_line.compare(0, strlen(prefix_str), prefix_str) == 0;
330 : }
331 :
332 : void
333 0 : POParser::parse()
334 : {
335 0 : next_line();
336 :
337 : // skip UTF-8 intro that some text editors produce
338 : // see http://en.wikipedia.org/wiki/Byte-order_mark
339 0 : if (current_line.size() >= 3 &&
340 0 : current_line[0] == static_cast<char>(0xef) &&
341 0 : current_line[1] == static_cast<char>(0xbb) &&
342 0 : current_line[2] == static_cast<char>(0xbf))
343 : {
344 0 : current_line = current_line.substr(3);
345 : }
346 :
347 : // Parser structure
348 0 : while(!eof)
349 : {
350 : try
351 : {
352 0 : bool fuzzy = false;
353 0 : bool has_msgctxt = false;
354 0 : std::string msgctxt;
355 0 : std::string msgid;
356 :
357 0 : while(prefix("#"))
358 : {
359 0 : if (current_line.size() >= 2 && current_line[1] == ',')
360 : {
361 : // FIXME: Rather simplistic hunt for fuzzy flag
362 0 : if (current_line.find("fuzzy", 2) != std::string::npos)
363 0 : fuzzy = true;
364 : }
365 :
366 0 : next_line();
367 : }
368 :
369 0 : if (!is_empty_line())
370 : {
371 0 : if (prefix("msgctxt"))
372 : {
373 0 : has_msgctxt = true;
374 0 : msgctxt = get_string(7);
375 : }
376 :
377 0 : if (prefix("msgid"))
378 0 : msgid = get_string(5);
379 : else
380 0 : error("expected 'msgid'");
381 :
382 0 : if (prefix("msgid_plural"))
383 : {
384 0 : std::string msgid_plural = get_string(12);
385 0 : std::vector<std::string> msgstr_num;
386 0 : bool saw_nonempty_msgstr = false;
387 :
388 0 : next:
389 0 : if (is_empty_line())
390 : {
391 0 : if (msgstr_num.empty())
392 0 : error("expected 'msgstr[N] (0 <= N <= 9)'");
393 : }
394 0 : else if (prefix("msgstr[") &&
395 0 : current_line.size() > 8 &&
396 0 : isdigit(current_line[7]) && current_line[8] == ']')
397 : {
398 0 : unsigned int number = static_cast<unsigned int>(current_line[7] - '0');
399 0 : std::string msgstr = get_string(9);
400 :
401 0 : if(!msgstr.empty())
402 0 : saw_nonempty_msgstr = true;
403 :
404 0 : if (number >= msgstr_num.size())
405 0 : msgstr_num.resize(number+1);
406 :
407 0 : msgstr_num[number] = conv.convert(msgstr);
408 0 : goto next;
409 : }
410 : else
411 : {
412 0 : error("expected 'msgstr[N]'");
413 : }
414 :
415 0 : if (!is_empty_line())
416 0 : error("expected 'msgstr[N]' or empty line");
417 :
418 0 : if (saw_nonempty_msgstr)
419 : {
420 0 : if (use_fuzzy || !fuzzy)
421 : {
422 0 : if (!dict.get_plural_forms())
423 : {
424 0 : warning("msgstr[N] seen, but no Plural-Forms given");
425 : }
426 : else
427 : {
428 0 : if (msgstr_num.size() != dict.get_plural_forms().get_nplural())
429 : {
430 0 : warning("msgstr[N] count doesn't match Plural-Forms.nplural");
431 : }
432 : }
433 :
434 0 : if (has_msgctxt)
435 0 : dict.add_translation(msgctxt, msgid, msgid_plural, msgstr_num);
436 : else
437 0 : dict.add_translation(msgid, msgid_plural, msgstr_num);
438 : }
439 :
440 : if ((false))
441 : {
442 : std::cout << (fuzzy?"fuzzy":"not-fuzzy") << std::endl;
443 : std::cout << "msgid \"" << msgid << "\"" << std::endl;
444 : std::cout << "msgid_plural \"" << msgid_plural << "\"" << std::endl;
445 : for(std::vector<std::string>::size_type i = 0; i < msgstr_num.size(); ++i)
446 : std::cout << "msgstr[" << i << "] \"" << conv.convert(msgstr_num[i]) << "\"" << std::endl;
447 : std::cout << std::endl;
448 : }
449 : }
450 : }
451 0 : else if (prefix("msgstr"))
452 : {
453 0 : std::string msgstr = get_string(6);
454 :
455 0 : if (msgid.empty())
456 : {
457 0 : parse_header(msgstr);
458 : }
459 0 : else if(!msgstr.empty())
460 : {
461 0 : if (use_fuzzy || !fuzzy)
462 : {
463 0 : if (has_msgctxt)
464 0 : dict.add_translation(msgctxt, msgid, conv.convert(msgstr));
465 : else
466 0 : dict.add_translation(msgid, conv.convert(msgstr));
467 : }
468 :
469 : if ((false))
470 : {
471 : std::cout << (fuzzy?"fuzzy":"not-fuzzy") << std::endl;
472 : std::cout << "msgid \"" << msgid << "\"" << std::endl;
473 : std::cout << "msgstr \"" << conv.convert(msgstr) << "\"" << std::endl;
474 : std::cout << std::endl;
475 : }
476 : }
477 : }
478 : else
479 : {
480 0 : error("expected 'msgstr' or 'msgid_plural'");
481 : }
482 : }
483 :
484 0 : if (!is_empty_line())
485 0 : error("expected empty line");
486 :
487 0 : next_line();
488 : }
489 0 : catch(POParserError&)
490 : {
491 : }
492 : }
493 0 : }
494 :
495 3 : } // namespace tinygettext
496 :
497 : /* EOF */
|