bux API Reference 1.11.0
A static library of general-purpose utilities that are commonly needed but not directly supported by Modern C++, plus reusable code that originated from my side projects.
Loading...
Searching...
No Matches
UnicodeCvt.cpp
Go to the documentation of this file.
1#include "UnicodeCvt.h"
2#include "XException.h" // RUNTIME_ERROR()
3#include <bit> // std::endian::*, std::byteswap()
4#include <cstring> // memcmp()
5#include <format> // std::format()
6#include <istream> // std::istream
7#include <memory> // std::make_unique<>()
8
9#ifdef _WIN32
10#pragma comment(lib, "Advapi32.lib") // IsTextUnicode()
11#include <windows.h> // Win32 API
12#else
13#include <errno.h> // errno
14#endif
15
16namespace {
17
18//
19// In-Module Types
20//
// Functor that hands out the bytes of an in-memory character range one at a time.
// The range is NOT copied: the memory viewed by the string_view passed to the
// constructor must stay valid for as long as the functor is called.
class FC_ReadMem
{
public:

    // Nonvirtuals

    // Remember the begin/end of the range viewed by sv.
    FC_ReadMem(std::string_view sv) noexcept:
        m_Src(sv.data()),
        m_End(sv.data() + sv.size())
    {
    }

    // Return the next unread byte, or an empty optional once the range is exhausted.
    std::optional<char> operator()() noexcept
    {
        if (m_Src >= m_End)
            return std::nullopt;

        const char ch = *m_Src;
        ++m_Src;
        return ch;
    }

private:

    // Data
    const char *m_Src;          // next byte to hand out
    const char *const m_End;    // one past the last byte
};
40
41//
42// In-Module Constants
43//
#ifdef _WIN32
// Under _WIN32 the UTF-8 encoding is identified by the numeric code page CP_UTF8.
enum
{
    CHSETS_UTF8 = CP_UTF8
};
#else
// Otherwise each encoding is a null-terminated list of candidate iconv charset names;
// ingestMBCS() passes them to iconv_open() in order until one is accepted.
// Run the shell command `iconv --list` to show the charset names available "on this host".
constexpr const char *const CHSETS_SJIS[]    = {"CP932", "EUC-JP", "SHIFT_JIS", "SHIFT-JIS", "SJIS", nullptr};
constexpr const char *const CHSETS_GB[]      = {"CP936", "EUC-CN", "GB18030", "GBK", nullptr};
constexpr const char *const CHSETS_KSC[]     = {"CP949", "EUC-KR", "JOHAB", nullptr};
constexpr const char *const CHSETS_BIG5[]    = {"CP950", "EUC-TW", "BIG5-HKSCS", "BIG5HKSCS", "BIG-5", "BIG5", nullptr};
constexpr const char *const CHSETS_UTF8[]    = {"UTF-8", "UTF8", nullptr};
constexpr const char *const CHSETS_UTF7[]    = {"UTF-7", "UTF7", nullptr};
// The last alias of each UTF-16 list used to be spelled "USC2LE"/"USC2BE", a transposition
// of "UCS" that no iconv implementation would recognize.
constexpr const char *const CHSETS_UTF16LE[] = {"UTF-16LE", "UTF16LE", "UCS-2LE", "UCS2LE", nullptr};
constexpr const char *const CHSETS_UTF16BE[] = {"UTF-16BE", "UTF16BE", "UCS-2BE", "UCS2BE", nullptr};
/* Not used due to the introduced algorithm "Identify UTF-32 without BOM in OS-agnostic way"
constexpr const char *const CHSETS_UTF32LE[] = {"UTF-32LE", "UTF32LE", "UCS-4LE", "UCS4LE", nullptr};
constexpr const char *const CHSETS_UTF32BE[] = {"UTF-32BE", "UTF32BE", "UCS-4BE", "UCS4BE", nullptr};
*/
#endif
64
65//
66// In-Module Functions
67//
68int u32toutf8(T_Utf32 c, T_Utf8 *dst) noexcept
69{
70 int ret;
71 if (c < 0x80)
72 // 1 byte
73 {
74 ret =1;
75 *dst = T_Utf8(c);
76 }
77 else if (c < 0x800)
78 // 2 bytes
79 {
80 ret =2;
81 goto Encode;
82 }
83 else if (c < 0x10000)
84 // 3 bytes
85 {
86 ret =3;
87 goto Encode;
88 }
89 else if (c < 0x200000)
90 // 4 bytes
91 {
92 ret =4;
93 goto Encode;
94 }
95 else if (c < 0x4000000)
96 // 5 bytes
97 {
98 ret =5;
99 goto Encode;
100 }
101 else if (c < 0x80000000)
102 // 6 bytesguessCodePage()
103 {
104 ret =6;
105 goto Encode;
106 }
107 else
108 ret =-1;
109
110 return ret;
111Encode:
112 for (int i =ret; --i > 0;)
113 {
114 dst[i] = T_Utf8((c &0x3F) | 0x80);
115 c >>=6;
116 }
117 dst[0] = T_Utf8(c |((const unsigned char*)"\xC0\xE0\xF0\xF8\xFC")[ret-2]);
118 return ret;
119}
120
121} // namespace
122
123namespace bux {
124
125//
126// Constants
127//
// Encoding value denoting UTF-8, initialized from the in-module CHSETS_UTF8: the code
// page CP_UTF8 under _WIN32, otherwise the null-terminated list of UTF-8 charset names.
128const T_Encoding ENCODING_UTF8 = CHSETS_UTF8;
129
130//
131// Function Definitions
132//
133std::string_view to_utf8(T_Utf32 uc)
134{
135 static thread_local T_Utf8 buf[MAX_UTF8];
136 const auto bytes = u32toutf8(uc, buf);
137 if (bytes <= 0)
138 RUNTIME_ERROR("u32toutf8(u+{:x}) returns {}", uc, bytes);
139
140 return {reinterpret_cast<char*>(buf), size_t(bytes)};
141}
142
143std::string to_utf8(C_UnicodeIn &&uin)
144{
145 T_Utf8 u8[MAX_UTF8];
146 int n;
147 std::string ret;
148 while ((n = uin.get(u8)) > 0)
149 ret.append(reinterpret_cast<char*>(u8), size_t(n));
150
151 if (n < 0)
152 RUNTIME_ERROR("UTF-8 conversion error {}", n);
153
154 return ret;
155}
156
157//
158// Class Implementations
159//
// NOTE(review): the signature line of this constructor is absent from this listing.
// The member index at the end of the page shows
// C_UnicodeIn(FH_ReadChar &&readc, T_Encoding codepage=0) - confirm against UnicodeCvt.h.
// Moves the character reader into the source buffer, records the requested code page
// and then lets init() inspect the head of the input.
161 m_Src(std::move(readc)),
162 m_CodePage(codepage)
163{
164 init();
165}
166
167C_UnicodeIn::C_UnicodeIn(std::string_view sv, T_Encoding codepage):
168 C_UnicodeIn(FC_ReadMem(sv), codepage)
169{
170}
171
172C_UnicodeIn::C_UnicodeIn(std::istream &in, T_Encoding codepage):
173 m_Src([&]()->std::optional<char> {
174 char ch;
175 if (static_cast<bool>(in.get(ch)))
176 return ch;
177 return {};
178 }),
179 m_CodePage(codepage)
196{
197 init();
198}
199
// NOTE(review): the signature line of this definition is absent from this listing.
// The member index at the end of the page lists ~C_UnicodeIn() noexcept - confirm
// against UnicodeCvt.h.
201{
202#ifndef _WIN32
// Close the iconv descriptor if one is still open.
203 reset_iconv();
204#endif
205}
206
// NOTE(review): the signature line of this definition is absent from this listing.
// The member index at the end of the page lists int get(T_Utf32 &c) - confirm
// against UnicodeCvt.h.
// Delivers the next queued character in c and returns 1; when nothing can be
// delivered it returns m_ErrCode instead.
208{
209 if (m_GetQ.empty())
210 {
211 if (m_ErrCode < 0)
212 // Error code persists
213 return m_ErrCode;
214
// Ask the currently selected reader to refill the queue.
215 if (m_ReadMethod)
216 (this->*m_ReadMethod)();
217
// NOTE(review): lastError() is declared in UnicodeCvt.h and not shown here. The
// code below assumes the queue is non-empty once this check passes - confirm.
218 if (lastError() <= 0)
219 // Error happens
220 return m_ErrCode;
221 }
222 c = m_GetQ.front();
223 m_GetQ.pop();
224 return 1;
225}
226
// NOTE(review): the signature line of this definition is absent from this listing.
// The body writes T_Utf16 values through a pointer named dst - confirm the exact
// signature against UnicodeCvt.h.
// Fetches the next character with get(T_Utf32&) and stores it as UTF-16.
// Returns the number of words written (1 or 2), or the non-positive result of
// get(T_Utf32&) unchanged.
228{
229 T_Utf32 c;
230 int ret = get(c);
231 if (ret > 0)
232 {
233 if (c < 0x10000)
234 // Encode into a single word
235 *dst =T_Utf16(c);
236 else
237 // Encode into two words
238 {
239 ret =2;
// Surrogate pair: the upper 10 bits of (c - 0x10000) go into the 0xD800 word,
// the lower 10 bits into the 0xDC00 word.
240 c -=0x10000;
241 dst[0] =T_Utf16((c >>10) |0xD800);
242 dst[1] =T_Utf16((c &0x3FF) |0xDC00);
243 }
244 }
245 return ret;
246}
247
// NOTE(review): the signature line of this definition is absent from this listing.
// The body hands a pointer named dst to u32toutf8() - confirm the exact signature
// against UnicodeCvt.h.
// Fetches the next character with get(T_Utf32&) and encodes it as UTF-8 into dst.
// Returns what u32toutf8() returns (bytes written, or -1), or the non-positive
// result of get(T_Utf32&) unchanged.
249{
250 T_Utf32 c;
251 int ret = get(c);
252 if (ret > 0)
253 ret = u32toutf8(c, dst);
254
255 return ret;
256}
257
258void C_UnicodeIn::ingestMBCS()
261{
262 if (auto size = m_Src.size())
263 {
264#ifdef _WIN32
265 const auto utf16 = std::make_unique<wchar_t[]>(size);
266 if (int wn = MultiByteToWideChar(m_CodePage, MB_ERR_INVALID_CHARS, m_Src.buffer(), int(size), utf16.get(), int(size)))
267 {
268 FC_ReadMem read({reinterpret_cast<char*>(utf16.get()), size_t(wn*2)});
269 C_Source src(std::move(read));
270 while (readUTF16(src, false));
271 m_Src.pop(size);
272 }
273 else
274 m_ErrCode = UIE_NO_UNICODE_TRANSLATION;
275#else
276 static constinit const char *const TO_UCS4 = std::endian::native == std::endian::little? "UCS-4LE": "UCS-4BE";
277 static_assert(std::endian::native == std::endian::little || std::endian::native == std::endian::big);
278 for (T_Encoding i = m_CodePage; *i && m_iconv == (iconv_t)(-1); ++i)
279 m_iconv = iconv_open(TO_UCS4, *i);
280
281 if (m_iconv == (iconv_t)(-1))
282 // Fail to initialize the corresponding iconv_t descriptor
283 {
284 m_ErrCode = UIE_NO_UNICODE_TRANSLATION;
285 return;
286 }
287 const auto ucs4 = std::make_unique<T_Utf32[]>(size);
288 size_t size_ucs4 = size * 4;
289 auto src = const_cast<char*>(m_Src.buffer());
290 auto dst = reinterpret_cast<char*>(ucs4.get());
291 if (size_t(-1) != iconv(m_iconv, &src, &size, &dst, &size_ucs4))
292 // Fully converted
293 {
294 for (const T_Utf32 *i = ucs4.get(); i < reinterpret_cast<T_Utf32*>(dst); m_GetQ.push(*i++));
295 m_Src.pop(m_Src.size());
296 }
297 else switch (errno)
298 {
299 case EILSEQ: // invalid multibyte sequence
300 m_ErrCode = UIE_NO_UNICODE_TRANSLATION;
301 break;
302 case EINVAL: // incomplete multibyte sequence
303 for (const T_Utf32 *i = ucs4.get(); i < reinterpret_cast<T_Utf32*>(dst); m_GetQ.push(*i++));
304 m_Src.pop(m_Src.size()-size);
305 break;
306 case E2BIG: // output buffer overflow, which is impossible.
307 default:
308 m_ErrCode = UIE_INTERNAL;
309 }
310#endif
311 }
312}
313
314void C_UnicodeIn::init()
315{
316 // BOM encodings from https://en.wikipedia.org/wiki/Byte_order_mark#Byte-order_marks_by_encoding
317 m_Src.read(4);
318 switch (m_Src.size())
319 {
320 case 4:
321 switch (m_Src.getUtf32(0, false))
322 {
323 case 0xFEFF: // UTF-32 with BOM
324 m_Src.pop(4);
325 m_ReadMethod = &C_UnicodeIn::readUTF32;
326 return;
327 case 0xFFFE0000: // Reverse UTF-32 with BOM
328 m_Src.pop(4);
329 m_ReadMethod = &C_UnicodeIn::readReverseUTF32;
330 return;
331 }
332 [[fallthrough]];
333 case 3:
334 case 2:
335 switch (m_Src.getUtf16(0, false))
336 {
337 case 0xFEFF: // UTF-16 with BOM
338 m_Src.pop(2);
339 m_ReadMethod = &C_UnicodeIn::readUTF16;
340 return;
341 case 0xFFFE: // Reverse UTF-16 with BOM
342 m_Src.pop(2);
343 m_ReadMethod = &C_UnicodeIn::readReverseUTF16;
344 return;
345 default:
346 if (m_Src.size() >= 3 && 0 == memcmp(m_Src.buffer(), u8"\uFEFF", 3))
347 // UTF-8 with BOM
348 {
349 m_Src.pop(3);
350 setCodePage(CHSETS_UTF8);
351 m_ReadMethod = &C_UnicodeIn::readCodePage;
352 return;
353 }
354
355 // Infer the encoding from first-1000-bytes chunk (UTF-8, ACP, or something else ?)
356 if (!m_CodePage)
357 {
358 m_Src.read(1000);
359 const auto size = m_Src.size();
360
361 // Identify UTF-32 wt BOM in OS-agnostic way
362 if (size && size % 4 == 0)
363 {
364 size_t n_u32_chars = 0, n_u32rev_chars = 0;
365 const auto p_dwords = reinterpret_cast<const T_Utf32*>(m_Src.buffer());
366 const size_t n = size / 4;
367 for (size_t i = 0; i < n; ++i)
368 {
369 if (auto u32 = p_dwords[i])
370 {
371 bool matched = false;
372 if (u32 == (u32 & 0xFFFFFF))
373 {
374 matched = true;
375 ++n_u32_chars;
376 }
377 //-----------------------------
378 u32 = std::byteswap(u32);
379 if (u32 == (u32 & 0xFFFFFF))
380 {
381 matched = true;
382 ++n_u32rev_chars;
383 }
384 if (!matched)
385 goto PostCheckUTF32;
386 }
387 }
388
389 const auto n_overflow = n_u32_chars + n_u32rev_chars - n;
390 if (n_u32_chars <= n_overflow)
391 {
392 m_ReadMethod = &C_UnicodeIn::readReverseUTF32;
393 return;
394 }
395 if (n_u32rev_chars <= n_overflow)
396 {
397 m_ReadMethod = &C_UnicodeIn::readUTF32;
398 return;
399 }
400 }
401
402 PostCheckUTF32:
403#ifdef _WIN32
404 int mask = IS_TEXT_UNICODE_UNICODE_MASK;
405 if (IsTextUnicode(m_Src.buffer(), int(size), &mask) || mask)
406 {
407 m_ReadMethod = &C_UnicodeIn::readUTF16;
408 return;
409 }
410 mask = IS_TEXT_UNICODE_REVERSE_MASK;
411 if (IsTextUnicode(m_Src.buffer(), int(size), &mask) || mask)
412 {
413 m_ReadMethod = &C_UnicodeIn::readReverseUTF16;
414 return;
415 }
416#endif
417 // Guess harder
418 if (guessCodePage())
419 return;
420 }
421
422 // Read heading asciis off then we can distinguish CP_UTF8 from CP_ACP.
423 m_ReadMethod = &C_UnicodeIn::readASCII;
424 }
425 break;
426 case 1: // has to be ascii
427 if (*m_Src.buffer() &0x80)
428 m_ErrCode = UIE_INCOMPLETE_UNICODE;
429 else
430 m_GetQ.push(T_Utf8(*m_Src.buffer()));
431 break;
432 case 0: // empty string
433 break;
434 default:
435 m_ErrCode = UIE_INTERNAL;
436 }
437}
438
439void C_UnicodeIn::readASCII()
440{
441 m_Src.read(1);
442 if (m_Src.size())
443 {
444 const auto c = T_Utf8(*m_Src.buffer());
445#ifdef _WIN32
446 if (!(c &0x80))
447#else
448 if (!(c &0x80) && c)
449#endif
450 {
451 // Still an ASCII
452 m_GetQ.push(c);
453 return m_Src.pop(1);
454 }
455 }
456
457 // Is the rest CP_UTF8 or CP_ACP ?
458 m_Src.readTillCtrl();
459 if (m_CodePage)
460 {
461 ingestMBCS();
462 if (m_ErrCode != UIE_NO_UNICODE_TRANSLATION)
463 // Should be m_CodePage
464 {
465 m_ReadMethod = &C_UnicodeIn::readCodePage;
466 return;
467 }
468 }
469 guessCodePage();
470}
471
472bool C_UnicodeIn::guessCodePage()
473{
474 if (m_Src.size() < 2) [[unlikely]]
475 return false;
476
477 static constinit const T_Encoding MBCS_CODEPAGES[] ={
478#ifdef _WIN32
479 CP_UTF8, CP_ACP,
480 932, 936, 949, 950, 951, // from https://en.wikipedia.org/wiki/Windows_code_page#East_Asian_multi-byte_code_pages
481 CP_UTF7
482#else
483 CHSETS_UTF8, CHSETS_SJIS, CHSETS_GB, CHSETS_KSC, CHSETS_BIG5, CHSETS_UTF7, CHSETS_UTF16LE, CHSETS_UTF16BE
484#endif
485 };
486 for (auto i: MBCS_CODEPAGES)
487 {
488 m_ErrCode = UIE_EOF; // reset error code
489 setCodePage(i);
490 ingestMBCS();
491 if (m_ErrCode != UIE_NO_UNICODE_TRANSLATION)
492 {
493 m_ReadMethod = &C_UnicodeIn::readCodePage;
494 return true;
495 }
496 }
497 return false;
498}
499
500bool C_UnicodeIn::readUTF16(C_Source &src, bool reverseWord)
501{
502 bool ret = false;
503 src.read(2);
504 const size_t read = src.size();
505 if (read >= 2)
506 {
507 const auto uc = src.getUtf16(0,reverseWord);
508 if (0xD800 <= uc && uc < 0xDC00)
509 // Hi word of 2-word encoding
510 {
511 src.read(4);
512 if (src.size() >= 4)
513 {
514 const T_Utf16 uc2 = src.getUtf16(1,reverseWord);
515 if (0xDC00 <= uc2 && uc2 < 0xE000)
516 // Low word of 2-word encoding
517 {
518 src.pop(4);
519 m_GetQ.push(T_Utf32((((uc&0x3FF)<<10)|(uc2&0x3FF))+0x10000));
520 ret =true;
521 }
522 else
523 // Anything lese is ill-formed
524 m_ErrCode = UIE_ILLFORMED_UNICODE;
525 }
526 else
527 m_ErrCode = UIE_INCOMPLETE_UNICODE;
528 }
529 else if (0xDC00 <= uc && uc < 0xE000)
530 // Low word of 2-word encoding - ill-fomed
531 m_ErrCode = UIE_ILLFORMED_UNICODE;
532 else
533 {
534 src.pop(2);
535 m_GetQ.push(uc);
536 ret = true;
537 }
538 }
539 else if (read > 0)
540 m_ErrCode = UIE_INCOMPLETE_UNICODE;
541
542 return ret;
543}
544
545void C_UnicodeIn::readUTF16()
546{
547 readUTF16(m_Src, false);
548}
549
550void C_UnicodeIn::readReverseUTF16()
551{
552 readUTF16(m_Src, true);
553}
554
555bool C_UnicodeIn::readUTF32(C_Source &src, bool reverseWord)
556{
557 bool ret = false;
558 src.read(4);
559 const size_t read = src.size();
560 if (read >= 4)
561 {
562 m_GetQ.push(src.getUtf32(0,reverseWord));
563 src.pop(4);
564 ret = true;
565 }
566 else if (read > 0)
567 m_ErrCode = UIE_INCOMPLETE_UNICODE;
568
569 return ret;
570}
571
572void C_UnicodeIn::readUTF32()
573{
574 readUTF32(m_Src, false);
575}
576
577void C_UnicodeIn::readReverseUTF32()
578{
579 readUTF32(m_Src, true);
580}
581
582void C_UnicodeIn::readCodePage()
583{
584 m_Src.readTillCtrl();
585 ingestMBCS();
586}
587
588void C_UnicodeIn::setCodePage(T_Encoding cp)
589{
590 m_CodePage = cp;
591#ifndef _WIN32
592 reset_iconv();
593#endif
594}
595
596#ifndef _WIN32
597void C_UnicodeIn::reset_iconv()
598{
599 if (m_iconv != (iconv_t)(-1))
600 {
601 iconv_close(m_iconv);
602 m_iconv = (iconv_t)(-1);
603 }
604}
605#endif
606
607C_UnicodeIn::C_Source::C_Source(FH_ReadChar &&readc) noexcept:
608 m_ReadCh(std::move(readc)),
609 m_AvailBeg(0)
610{
611}
612
613const char *C_UnicodeIn::C_Source::buffer() const noexcept
614{
615 return m_ReadBuf.data() + m_AvailBeg;
616}
617
618T_Utf16 C_UnicodeIn::C_Source::getUtf16(size_t pos, bool reverseWord) const
619{
620 const size_t off = m_AvailBeg + pos * 2;
621 if (off + 2 > m_ReadBuf.size())
622 RUNTIME_ERROR("End of char {} passes end of buffer", off+2);
623
624 auto ret = *reinterpret_cast<const T_Utf16*>(m_ReadBuf.data() + off);
625 return reverseWord? std::byteswap(ret): ret;
626}
627
628T_Utf32 C_UnicodeIn::C_Source::getUtf32(size_t pos, bool reverseWord) const
629{
630 const size_t off = m_AvailBeg + pos * 4;
631 if (off + 4 > m_ReadBuf.size())
632 RUNTIME_ERROR("End of char {} passes end of buffer", off+4);
633
634 auto ret = *reinterpret_cast<const T_Utf32*>(m_ReadBuf.data() + off);
635 return reverseWord? std::byteswap(ret): ret;
636}
637
638void C_UnicodeIn::C_Source::pop(size_t bytes)
639{
640 m_AvailBeg += bytes;
641 if (m_AvailBeg > m_ReadBuf.size())
642 {
643 m_AvailBeg -= bytes; // rollback
644 RUNTIME_ERROR("m_AvailBeg overflow");
645 }
646}
647
648void C_UnicodeIn::C_Source::read(size_t bytes)
649{
650 if (m_AvailBeg + bytes > m_ReadBuf.size())
651 {
652 if (m_AvailBeg)
653 {
654 m_ReadBuf.erase(0, m_AvailBeg);
655 m_AvailBeg = 0;
656 }
657 bytes -= m_ReadBuf.size();
658 for (size_t i = 0; i < bytes; ++i)
659 if (auto c = m_ReadCh())
660 m_ReadBuf += *c;
661 else
662 break;
663 }
664}
665
666void C_UnicodeIn::C_Source::readTillCtrl()
667{
668 if (m_AvailBeg == m_ReadBuf.size())
669 {
670 m_ReadBuf.clear();
671 m_AvailBeg = 0;
672 }
673
674 while (auto c = m_ReadCh())
675 {
676 m_ReadBuf += *c;
677 if (0 == (*c &0xE0))
678 // Control char
679 break;
680 }
681}
682
683size_t C_UnicodeIn::C_Source::size() const noexcept
684{
685 return m_ReadBuf.size() - m_AvailBeg;
686}
687
688} // namespace bux
#define RUNTIME_ERROR(fmtStr,...)
Wrap FILE(DATE)#__LINE__ FUNCTION: msg into std::runtime_error.
Definition XException.h:32
int get(T_Utf32 &c)
~C_UnicodeIn() noexcept
C_UnicodeIn(FH_ReadChar &&readc, T_Encoding codepage=0)
int lastError() const noexcept
Definition UnicodeCvt.h:67
THE common namespace of bux library.
Definition AtomiX.cpp:3
std::string_view to_utf8(T_Utf32 uc)
const char *const * T_Encoding
Definition UnicodeCvt.h:48
std::function< std::optional< char >()> FH_ReadChar
Definition UnicodeCvt.h:43
void read(const std::string &src, size_t &off, T &data) noexcept
Definition Serialize.h:35
const T_Encoding ENCODING_UTF8
std::uint16_t T_Utf16
UTF-16: You need T_Utf16[2] to hold full range of unicode.
Definition UnicodeCvt.h:40
std::uint8_t T_Utf8
UTF-8: You need T_Utf8[4] to hold full range of unicode.
Definition UnicodeCvt.h:41
@ MAX_UTF8
Definition UnicodeCvt.h:24
@ UIE_ILLFORMED_UNICODE
Definition UnicodeCvt.h:30
@ UIE_EOF
Definition UnicodeCvt.h:29
@ UIE_INCOMPLETE_UNICODE
Definition UnicodeCvt.h:31
@ UIE_INTERNAL
Definition UnicodeCvt.h:33
@ UIE_NO_UNICODE_TRANSLATION
Definition UnicodeCvt.h:32
std::uint32_t T_Utf32
UTF-32 to cover the full range of codespace U+0000 ~ U+10FFFF.
Definition UnicodeCvt.h:39