bux API Reference 1.12.3
Static library of whatever is generally useful but not directly supported by Modern C++, plus whatever reusable code originated from my side projects.
Loading...
Searching...
No Matches
UnicodeCvt.cpp
Go to the documentation of this file.
1#include "UnicodeCvt.h"
2#include <bit> // std::endian::*, std::byteswap()
3#include <charconv> // std::to_chars()
4#include <cstring> // memcmp()
5#include <istream> // std::istream
6#include <memory> // std::make_unique<>()
7#include <stdexcept> // std::runtime_error
8
9#ifdef _WIN32
10#pragma comment(lib, "Advapi32.lib") // IsTextUnicode()
11#include <windows.h> // Win32 API
12#else
13#include <errno.h> // errno
14#endif
15
16namespace {
17
18//
19// In-Module Types
20//
/// Callable byte source over a caller-owned memory range.
///
/// Every call hands out the next byte of the range given at construction and
/// an empty optional once the range is exhausted. Only pointers are stored,
/// so the viewed memory must stay valid for as long as this object is called.
class FC_ReadMem
{
public:

    // Nonvirtuals
    /// Starts reading at the first byte of \p sv.
    /// `explicit` so that a string view never silently turns into a reader.
    explicit FC_ReadMem(std::string_view sv) noexcept:
        m_Src(sv.data()),
        m_End(sv.data() + sv.size())
    {}

    /// \return The next unread byte (advancing past it), or an empty optional
    ///         when every byte has already been handed out.
    std::optional<char> operator()() noexcept
    {
        if (m_Src < m_End)
            return *m_Src++;

        return {};
    }

private:

    // Data
    const char *m_Src;          // next byte to hand out
    const char *const m_End;    // one past the last byte of the range
};
40
41//
42// In-Module Constants
43//
44#ifdef _WIN32
45enum
46{
47 CHSETS_UTF8 = CP_UTF8
48};
49#else
50// shell command `iconv --list` to show available locales "in this host"
51constinit const char *const CHSETS_SJIS[] = {"CP932", "EUC-JP", "SHIFT_JIS", "SHIFT-JIS", "SJIS", 0};
52constinit const char *const CHSETS_GB[] = {"CP936", "EUC-CN", "GB18030", "GBK", 0};
53constinit const char *const CHSETS_KSC[] = {"CP949", "EUC-KR", "JOHAB", 0};
54constinit const char *const CHSETS_BIG5[] = {"CP950", "EUC-TW", "BIG5-HKSCS", "BIG5HKSCS", "BIG-5", "BIG5", 0};
55constinit const char *const CHSETS_UTF8[] = {"UTF-8", "UTF8", 0};
56constinit const char *const CHSETS_UTF7[] = {"UTF-7", "UTF7", 0};
57constinit const char *const CHSETS_UTF16LE[] = {"UTF-16LE", "UTF16LE", "UCS-2LE", "USC2LE", 0};
58constinit const char *const CHSETS_UTF16BE[] = {"UTF-16BE", "UTF16BE", "UCS-2BE", "USC2BE", 0};
59/* Not used due to the introduced algorithm "Identify UTF-32 wt BOM in OS-agnostic way"
60constinit const char *const CHSETS_UTF32LE[] = {"UTF-32LE", "UTF32LE", "UCS-4LE", "USC4LE", 0};
61constinit const char *const CHSETS_UTF32BE[] = {"UTF-32BE", "UTF32BE", "UCS-4BE", "USC4BE", 0};
62*/
63#endif
64
65//
66// In-Module Functions
67//
/// Encodes one Unicode scalar value as UTF-8.
///
/// \param c    Code point to encode.
/// \param dst  Output buffer with room for at least 4 bytes.
/// \return Number of bytes written to \p dst (1 to 4), or -1 when \p c is not
///         a Unicode scalar value, i.e. a surrogate (U+D800..U+DFFF) or a
///         value above U+10FFFF. Nothing is written in that case.
///
/// Only the forms allowed by RFC 3629 are produced. The obsolete 5- and
/// 6-byte forms are rejected on purpose: callers in this file hand in buffers
/// of MAX_UTF8 elements (presumably 4 - confirm against UnicodeCvt.h), which
/// the longer forms would overrun.
int u32toutf8(T_Utf32 c, T_Utf8 *dst) noexcept
{
    if (c < 0x80)
    {
        // 1 byte: plain ASCII
        *dst = T_Utf8(c);
        return 1;
    }
    if (c > 0x10FFFF || (0xD800 <= c && c < 0xE000))
        // Outside the Unicode codespace, or a UTF-16 surrogate
        return -1;

    // 2, 3 or 4 bytes
    const int len = c < 0x800 ? 2 : c < 0x10000 ? 3 : 4;

    // Lead-byte markers for 2-, 3- and 4-byte sequences
    static constexpr T_Utf8 LEAD[] = {0xC0, 0xE0, 0xF0};

    // Fill the continuation bytes back to front, 6 payload bits each.
    for (int i = len; --i > 0;)
    {
        dst[i] = T_Utf8((c & 0x3F) | 0x80);
        c >>= 6;
    }
    // What remains of c fits in the payload bits of the lead byte.
    dst[0] = T_Utf8(c | LEAD[len - 2]);
    return len;
}
120
121} // namespace
122
123namespace bux {
124
125//
126// Constants
127//
// Exported UTF-8 encoding id, initialized from the in-module CHSETS_UTF8
// (an enumerator on _WIN32, a null-terminated charset-name list otherwise).
128const T_Encoding ENCODING_UTF8 = CHSETS_UTF8;
129
130//
131// Function Definitions
132//
133std::string_view to_utf8(T_Utf32 uc)
134{
135 static thread_local T_Utf8 buf[MAX_UTF8];
136 const auto bytes = u32toutf8(uc, buf);
137 if (bytes <= 0)
138 {
139 char uc_hex[10];
140 auto ret = std::to_chars(uc_hex, uc_hex+sizeof uc_hex, uc, 16);
141 throw std::runtime_error{"u32toutf8(u+" + std::string{uc_hex,ret.ptr} + ") returns " + std::to_string(bytes)};
142 }
143 return {reinterpret_cast<char*>(buf), size_t(bytes)};
144}
145
146std::string to_utf8(C_UnicodeIn &&uin)
147{
148 T_Utf8 u8[MAX_UTF8];
149 int n;
150 std::string ret;
151 while ((n = uin.get(u8)) > 0)
152 ret.append(reinterpret_cast<char*>(u8), size_t(n));
153
154 if (n < 0)
155 throw std::runtime_error{"UTF-8 conversion error " + std::to_string(n)};
156
157 return ret;
158}
159
160//
161// Class Implementations
162//
164 m_Src(std::move(readc)),
165 m_CodePage(codepage)
166{
167 init();
168}
169
170C_UnicodeIn::C_UnicodeIn(std::string_view sv, T_Encoding codepage):
171 C_UnicodeIn(FC_ReadMem(sv), codepage)
172{
173}
174
175C_UnicodeIn::C_UnicodeIn(std::istream &in, T_Encoding codepage):
176 m_Src([&]()->std::optional<char> {
177 char ch;
178 if (static_cast<bool>(in.get(ch)))
179 return ch;
180 return {};
181 }),
182 m_CodePage(codepage)
199{
200 init();
201}
202
204{
205#ifndef _WIN32
206 reset_iconv();
207#endif
208}
209
211{
212 if (m_GetQ.empty())
213 {
214 if (m_ErrCode < 0)
215 // Error code persistsiconv --list
216 return m_ErrCode;
217
218 if (m_ReadMethod)
219 (this->*m_ReadMethod)();
220
221 if (lastError() <= 0)
222 // Error happens
223 return m_ErrCode;
224 }
225 c = m_GetQ.front();
226 m_GetQ.pop();
227 return 1;
228}
229
231{
232 T_Utf32 c;
233 int ret = get(c);
234 if (ret > 0)
235 {
236 if (c < 0x10000)
237 // Encode into a single word
238 *dst =T_Utf16(c);
239 else
240 // Encode into two words
241 {
242 ret =2;
243 c -=0x10000;
244 dst[0] =T_Utf16((c >>10) |0xD800);
245 dst[1] =T_Utf16((c &0x3FF) |0xDC00);
246 }
247 }
248 return ret;
249}
250
252{
253 T_Utf32 c;
254 int ret = get(c);
255 if (ret > 0)
256 ret = u32toutf8(c, dst);
257
258 return ret;
259}
260
// Converts the bytes currently buffered in m_Src from the encoding m_CodePage
// into code points pushed onto m_GetQ. Does nothing when the buffer is empty.
// On failure m_ErrCode is set (UIE_NO_UNICODE_TRANSLATION or UIE_INTERNAL) and
// the buffered bytes are left in place.
261void C_UnicodeIn::ingestMBCS()
264{
265 if (auto size = m_Src.size())
266 {
267#ifdef _WIN32
 // Windows: convert to UTF-16 first (at most `size` words are allowed for),
 // then decode that UTF-16 through a temporary C_Source so surrogate pairs
 // end up as single code points in m_GetQ.
 // NOTE(review): the MultiByteToWideChar documentation lists CP_UTF7 among
 // the code pages that require dwFlags == 0, so the CP_UTF7 attempt made by
 // guessCodePage() presumably always fails here - confirm.
268 const auto utf16 = std::make_unique<wchar_t[]>(size);
269 if (int wn = MultiByteToWideChar(m_CodePage, MB_ERR_INVALID_CHARS, m_Src.buffer(), int(size), utf16.get(), int(size)))
270 {
 // wn words were produced, i.e. wn*2 bytes of UTF-16 in native byte order
271 FC_ReadMem read({reinterpret_cast<char*>(utf16.get()), size_t(wn*2)});
272 C_Source src(std::move(read));
273 while (readUTF16(src, false));
274 m_Src.pop(size);
275 }
276 else
277 m_ErrCode = UIE_NO_UNICODE_TRANSLATION;
278#else
 // Elsewhere: convert straight to UCS-4 in native byte order with iconv.
279 static constinit const char *const TO_UCS4 = std::endian::native == std::endian::little? "UCS-4LE": "UCS-4BE";
280 static_assert(std::endian::native == std::endian::little || std::endian::native == std::endian::big);
 // Open the descriptor lazily: try each charset name of the null-terminated
 // list until one opens. An already open descriptor is reused as is.
281 for (T_Encoding i = m_CodePage; *i && m_iconv == (iconv_t)(-1); ++i)
282 m_iconv = iconv_open(TO_UCS4, *i);
283
284 if (m_iconv == (iconv_t)(-1))
285 // Fail to initialize the corresponding iconv_t descriptor
286 {
287 m_ErrCode = UIE_NO_UNICODE_TRANSLATION;
288 return;
289 }
 // Output room for one code point per input byte. iconv() advances src/dst
 // and decreases size/size_ucs4 as it goes, so afterwards `size` is the
 // number of input bytes left unconverted and `dst` marks the output end.
290 const auto ucs4 = std::make_unique<T_Utf32[]>(size);
291 size_t size_ucs4 = size * 4;
292 auto src = const_cast<char*>(m_Src.buffer());
293 auto dst = reinterpret_cast<char*>(ucs4.get());
294 if (size_t(-1) != iconv(m_iconv, &src, &size, &dst, &size_ucs4))
295 // Fully converted
296 {
297 for (const T_Utf32 *i = ucs4.get(); i < reinterpret_cast<T_Utf32*>(dst); m_GetQ.push(*i++));
298 m_Src.pop(m_Src.size());
299 }
300 else switch (errno)
301 {
302 case EILSEQ: // invalid multibyte sequence
303 m_ErrCode = UIE_NO_UNICODE_TRANSLATION;
304 break;
305 case EINVAL: // incomplete multibyte sequence
 // Deliver what was converted and keep the unfinished tail (`size` bytes)
 // buffered in m_Src.
306 for (const T_Utf32 *i = ucs4.get(); i < reinterpret_cast<T_Utf32*>(dst); m_GetQ.push(*i++));
307 m_Src.pop(m_Src.size()-size);
308 break;
309 case E2BIG: // output buffer overflow, which is impossible.
310 default:
311 m_ErrCode = UIE_INTERNAL;
312 }
313#endif
314 }
315}
316
317void C_UnicodeIn::init()
318{
319 // BOM encodings from https://en.wikipedia.org/wiki/Byte_order_mark#Byte-order_marks_by_encoding
320 m_Src.read(4);
321 switch (m_Src.size())
322 {
323 case 4:
324 switch (m_Src.getUtf32(0, false))
325 {
326 case 0xFEFF: // UTF-32 with BOM
327 m_Src.pop(4);
328 m_ReadMethod = &C_UnicodeIn::readUTF32;
329 return;
330 case 0xFFFE0000: // Reverse UTF-32 with BOM
331 m_Src.pop(4);
332 m_ReadMethod = &C_UnicodeIn::readReverseUTF32;
333 return;
334 }
335 [[fallthrough]];
336 case 3:
337 case 2:
338 switch (m_Src.getUtf16(0, false))
339 {
340 case 0xFEFF: // UTF-16 with BOM
341 m_Src.pop(2);
342 m_ReadMethod = &C_UnicodeIn::readUTF16;
343 return;
344 case 0xFFFE: // Reverse UTF-16 with BOM
345 m_Src.pop(2);
346 m_ReadMethod = &C_UnicodeIn::readReverseUTF16;
347 return;
348 default:
349 if (m_Src.size() >= 3 && 0 == memcmp(m_Src.buffer(), u8"\uFEFF", 3))
350 // UTF-8 with BOM
351 {
352 m_Src.pop(3);
353 setCodePage(CHSETS_UTF8);
354 m_ReadMethod = &C_UnicodeIn::readCodePage;
355 return;
356 }
357
358 // Infer the encoding from first-1000-bytes chunk (UTF-8, ACP, or something else ?)
359 if (!m_CodePage)
360 {
361 m_Src.read(1000);
362 const auto size = m_Src.size();
363
364 // Identify UTF-32 wt BOM in OS-agnostic way
365 if (size && size % 4 == 0)
366 {
367 size_t n_u32_chars = 0, n_u32rev_chars = 0;
368 const auto p_dwords = reinterpret_cast<const T_Utf32*>(m_Src.buffer());
369 const size_t n = size / 4;
370 for (size_t i = 0; i < n; ++i)
371 {
372 if (auto u32 = p_dwords[i])
373 {
374 bool matched = false;
375 if (u32 == (u32 & 0xFFFFFF))
376 {
377 matched = true;
378 ++n_u32_chars;
379 }
380 //-----------------------------
381 u32 = std::byteswap(u32);
382 if (u32 == (u32 & 0xFFFFFF))
383 {
384 matched = true;
385 ++n_u32rev_chars;
386 }
387 if (!matched)
388 goto PostCheckUTF32;
389 }
390 }
391
392 const auto n_overflow = n_u32_chars + n_u32rev_chars - n;
393 if (n_u32_chars <= n_overflow)
394 {
395 m_ReadMethod = &C_UnicodeIn::readReverseUTF32;
396 return;
397 }
398 if (n_u32rev_chars <= n_overflow)
399 {
400 m_ReadMethod = &C_UnicodeIn::readUTF32;
401 return;
402 }
403 }
404
405 PostCheckUTF32:
406#ifdef _WIN32
407 int mask = IS_TEXT_UNICODE_UNICODE_MASK;
408 if (IsTextUnicode(m_Src.buffer(), int(size), &mask) || mask)
409 {
410 m_ReadMethod = &C_UnicodeIn::readUTF16;
411 return;
412 }
413 mask = IS_TEXT_UNICODE_REVERSE_MASK;
414 if (IsTextUnicode(m_Src.buffer(), int(size), &mask) || mask)
415 {
416 m_ReadMethod = &C_UnicodeIn::readReverseUTF16;
417 return;
418 }
419#endif
420 // Guess harder
421 if (guessCodePage())
422 return;
423 }
424
425 // Read heading asciis off then we can distinguish CP_UTF8 from CP_ACP.
426 m_ReadMethod = &C_UnicodeIn::readASCII;
427 }
428 break;
429 case 1: // has to be ascii
430 if (*m_Src.buffer() &0x80)
431 m_ErrCode = UIE_INCOMPLETE_UNICODE;
432 else
433 m_GetQ.push(T_Utf8(*m_Src.buffer()));
434 break;
435 case 0: // empty string
436 break;
437 default:
438 m_ErrCode = UIE_INTERNAL;
439 }
440}
441
442void C_UnicodeIn::readASCII()
443{
444 m_Src.read(1);
445 if (m_Src.size())
446 {
447 const auto c = T_Utf8(*m_Src.buffer());
448#ifdef _WIN32
449 if (!(c &0x80))
450#else
451 if (!(c &0x80) && c)
452#endif
453 {
454 // Still an ASCII
455 m_GetQ.push(c);
456 return m_Src.pop(1);
457 }
458 }
459
460 // Is the rest CP_UTF8 or CP_ACP ?
461 m_Src.readTillCtrl();
462 if (m_CodePage)
463 {
464 ingestMBCS();
465 if (m_ErrCode != UIE_NO_UNICODE_TRANSLATION)
466 // Should be m_CodePage
467 {
468 m_ReadMethod = &C_UnicodeIn::readCodePage;
469 return;
470 }
471 }
472 guessCodePage();
473}
474
475bool C_UnicodeIn::guessCodePage()
476{
477 if (m_Src.size() < 2) [[unlikely]]
478 return false;
479
480 static constinit const T_Encoding MBCS_CODEPAGES[] ={
481#ifdef _WIN32
482 CP_UTF8, CP_ACP,
483 932, 936, 949, 950, 951, // from https://en.wikipedia.org/wiki/Windows_code_page#East_Asian_multi-byte_code_pages
484 CP_UTF7
485#else
486 CHSETS_UTF8, CHSETS_SJIS, CHSETS_GB, CHSETS_KSC, CHSETS_BIG5, CHSETS_UTF7, CHSETS_UTF16LE, CHSETS_UTF16BE
487#endif
488 };
489 for (auto i: MBCS_CODEPAGES)
490 {
491 m_ErrCode = UIE_EOF; // reset error code
492 setCodePage(i);
493 ingestMBCS();
494 if (m_ErrCode != UIE_NO_UNICODE_TRANSLATION)
495 {
496 m_ReadMethod = &C_UnicodeIn::readCodePage;
497 return true;
498 }
499 }
500 return false;
501}
502
503bool C_UnicodeIn::readUTF16(C_Source &src, bool reverseWord)
504{
505 bool ret = false;
506 src.read(2);
507 const size_t read = src.size();
508 if (read >= 2)
509 {
510 const auto uc = src.getUtf16(0,reverseWord);
511 if (0xD800 <= uc && uc < 0xDC00)
512 // Hi word of 2-word encoding
513 {
514 src.read(4);
515 if (src.size() >= 4)
516 {
517 const T_Utf16 uc2 = src.getUtf16(1,reverseWord);
518 if (0xDC00 <= uc2 && uc2 < 0xE000)
519 // Low word of 2-word encoding
520 {
521 src.pop(4);
522 m_GetQ.push(T_Utf32((((uc&0x3FF)<<10)|(uc2&0x3FF))+0x10000));
523 ret =true;
524 }
525 else
526 // Anything lese is ill-formed
527 m_ErrCode = UIE_ILLFORMED_UNICODE;
528 }
529 else
530 m_ErrCode = UIE_INCOMPLETE_UNICODE;
531 }
532 else if (0xDC00 <= uc && uc < 0xE000)
533 // Low word of 2-word encoding - ill-fomed
534 m_ErrCode = UIE_ILLFORMED_UNICODE;
535 else
536 {
537 src.pop(2);
538 m_GetQ.push(uc);
539 ret = true;
540 }
541 }
542 else if (read > 0)
543 m_ErrCode = UIE_INCOMPLETE_UNICODE;
544
545 return ret;
546}
547
548void C_UnicodeIn::readUTF16()
549{
550 readUTF16(m_Src, false);
551}
552
553void C_UnicodeIn::readReverseUTF16()
554{
555 readUTF16(m_Src, true);
556}
557
558bool C_UnicodeIn::readUTF32(C_Source &src, bool reverseWord)
559{
560 bool ret = false;
561 src.read(4);
562 const size_t read = src.size();
563 if (read >= 4)
564 {
565 m_GetQ.push(src.getUtf32(0,reverseWord));
566 src.pop(4);
567 ret = true;
568 }
569 else if (read > 0)
570 m_ErrCode = UIE_INCOMPLETE_UNICODE;
571
572 return ret;
573}
574
575void C_UnicodeIn::readUTF32()
576{
577 readUTF32(m_Src, false);
578}
579
580void C_UnicodeIn::readReverseUTF32()
581{
582 readUTF32(m_Src, true);
583}
584
585void C_UnicodeIn::readCodePage()
586{
587 m_Src.readTillCtrl();
588 ingestMBCS();
589}
590
591void C_UnicodeIn::setCodePage(T_Encoding cp)
592{
593 m_CodePage = cp;
594#ifndef _WIN32
595 reset_iconv();
596#endif
597}
598
599#ifndef _WIN32
600void C_UnicodeIn::reset_iconv()
601{
602 if (m_iconv != (iconv_t)(-1))
603 {
604 iconv_close(m_iconv);
605 m_iconv = (iconv_t)(-1);
606 }
607}
608#endif
609
610C_UnicodeIn::C_Source::C_Source(FH_ReadChar &&readc) noexcept:
611 m_ReadCh(std::move(readc)),
612 m_AvailBeg(0)
613{
614}
615
616const char *C_UnicodeIn::C_Source::buffer() const noexcept
617{
618 return m_ReadBuf.data() + m_AvailBeg;
619}
620
621T_Utf16 C_UnicodeIn::C_Source::getUtf16(size_t pos, bool reverseWord) const
622{
623 const size_t off = m_AvailBeg + pos * 2;
624 if (off + 2 > m_ReadBuf.size())
625 throw std::runtime_error{"End of char " + std::to_string(off+2) + " passes end of buffer"};
626
627 auto ret = *reinterpret_cast<const T_Utf16*>(m_ReadBuf.data() + off);
628 return reverseWord? std::byteswap(ret): ret;
629}
630
631T_Utf32 C_UnicodeIn::C_Source::getUtf32(size_t pos, bool reverseWord) const
632{
633 const size_t off = m_AvailBeg + pos * 4;
634 if (off + 4 > m_ReadBuf.size())
635 throw std::runtime_error{"End of char " + std::to_string(off+4) + " passes end of buffer"};
636
637 auto ret = *reinterpret_cast<const T_Utf32*>(m_ReadBuf.data() + off);
638 return reverseWord? std::byteswap(ret): ret;
639}
640
641void C_UnicodeIn::C_Source::pop(size_t bytes)
642{
643 m_AvailBeg += bytes;
644 if (m_AvailBeg > m_ReadBuf.size())
645 {
646 m_AvailBeg -= bytes; // rollback
647 throw std::runtime_error{"m_AvailBeg overflow"};
648 }
649}
650
651void C_UnicodeIn::C_Source::read(size_t bytes)
652{
653 if (m_AvailBeg + bytes > m_ReadBuf.size())
654 {
655 if (m_AvailBeg)
656 {
657 m_ReadBuf.erase(0, m_AvailBeg);
658 m_AvailBeg = 0;
659 }
660 bytes -= m_ReadBuf.size();
661 for (size_t i = 0; i < bytes; ++i)
662 if (auto c = m_ReadCh())
663 m_ReadBuf += *c;
664 else
665 break;
666 }
667}
668
669void C_UnicodeIn::C_Source::readTillCtrl()
670{
671 if (m_AvailBeg == m_ReadBuf.size())
672 {
673 m_ReadBuf.clear();
674 m_AvailBeg = 0;
675 }
676
677 while (auto c = m_ReadCh())
678 {
679 m_ReadBuf += *c;
680 if (0 == (*c &0xE0))
681 // Control char
682 break;
683 }
684}
685
686size_t C_UnicodeIn::C_Source::size() const noexcept
687{
688 return m_ReadBuf.size() - m_AvailBeg;
689}
690
691} // namespace bux
int get(T_Utf32 &c)
~C_UnicodeIn() noexcept
C_UnicodeIn(FH_ReadChar &&readc, T_Encoding codepage=0)
int lastError() const noexcept
Definition UnicodeCvt.h:71
THE common namespace of bux library.
Definition AtomiX.cpp:3
std::string_view to_utf8(T_Utf32 uc)
const char *const * T_Encoding
Definition UnicodeCvt.h:52
std::function< std::optional< char >()> FH_ReadChar
Definition UnicodeCvt.h:47
void read(const std::string &src, size_t &off, T &data) noexcept
Definition Serialize.h:35
const T_Encoding ENCODING_UTF8
std::uint16_t T_Utf16
UTF-16: You need T_Utf16[2] to hold full range of unicode.
Definition UnicodeCvt.h:40
std::uint8_t T_Utf8
UTF-8: You need T_Utf8[4] to hold full range of unicode.
Definition UnicodeCvt.h:41
@ MAX_UTF8
Definition UnicodeCvt.h:24
@ UIE_ILLFORMED_UNICODE
Definition UnicodeCvt.h:30
@ UIE_EOF
Definition UnicodeCvt.h:29
@ UIE_INCOMPLETE_UNICODE
Definition UnicodeCvt.h:31
@ UIE_INTERNAL
Definition UnicodeCvt.h:33
@ UIE_NO_UNICODE_TRANSLATION
Definition UnicodeCvt.h:32
std::uint32_t T_Utf32
UTF-32 to cover the full range of codespace U+0000 ~ U+10FFFF.
Definition UnicodeCvt.h:39