10#pragma comment(lib, "Advapi32.lib")
26 FC_ReadMem(std::string_view sv)
noexcept: m_Src(sv.data()), m_End(sv.data()+sv.size()) {}
27 std::optional<char> operator()() noexcept
38 const char *m_Src, *
const m_End;
/// Null-terminated list of charset names used for Japanese text
/// (code page 932 and its aliases, plus EUC-JP).
/// `constexpr` implies both `const` and constant initialization.
constexpr const char *const CHSETS_SJIS[] = {
    "CP932", "EUC-JP", "SHIFT_JIS", "SHIFT-JIS", "SJIS",
    nullptr // end-of-list sentinel
};
/// Null-terminated list of charset names used for Simplified Chinese text
/// (code page 936, EUC-CN, GB18030, GBK).
/// `constexpr` implies both `const` and constant initialization.
constexpr const char *const CHSETS_GB[] = {
    "CP936", "EUC-CN", "GB18030", "GBK",
    nullptr // end-of-list sentinel
};
/// Null-terminated list of charset names used for Korean text
/// (code page 949, EUC-KR, JOHAB).
/// `constexpr` implies both `const` and constant initialization.
constexpr const char *const CHSETS_KSC[] = {
    "CP949", "EUC-KR", "JOHAB",
    nullptr // end-of-list sentinel
};
/// Null-terminated list of charset names used for Traditional Chinese text
/// (code page 950, EUC-TW and the Big5 spellings).
/// `constexpr` implies both `const` and constant initialization.
constexpr const char *const CHSETS_BIG5[] = {
    "CP950", "EUC-TW", "BIG5-HKSCS", "BIG5HKSCS", "BIG-5", "BIG5",
    nullptr // end-of-list sentinel
};
/// Null-terminated list of charset names for UTF-8
/// (spelled with and without the dash).
/// `constexpr` implies both `const` and constant initialization.
constexpr const char *const CHSETS_UTF8[] = {
    "UTF-8", "UTF8",
    nullptr // end-of-list sentinel
};
/// Null-terminated list of charset names for UTF-7
/// (spelled with and without the dash).
/// `constexpr` implies both `const` and constant initialization.
constexpr const char *const CHSETS_UTF7[] = {
    "UTF-7", "UTF7",
    nullptr // end-of-list sentinel
};
/// Null-terminated list of charset names for little-endian UTF-16 / UCS-2.
///
/// Each name appears with and without the dash. The last entry used to read
/// "USC2LE", a transposition of "UCS2LE" that cannot match any charset name;
/// it is spelled correctly here so it mirrors "UCS-2LE" the same way
/// "UTF16LE" mirrors "UTF-16LE".
/// `constexpr` implies both `const` and constant initialization.
constexpr const char *const CHSETS_UTF16LE[] = {
    "UTF-16LE", "UTF16LE", "UCS-2LE", "UCS2LE",
    nullptr // end-of-list sentinel
};
/// Null-terminated list of charset names for big-endian UTF-16 / UCS-2.
///
/// Each name appears with and without the dash. The last entry used to read
/// "USC2BE", a transposition of "UCS2BE" that cannot match any charset name;
/// it is spelled correctly here so it mirrors "UCS-2BE" the same way
/// "UTF16BE" mirrors "UTF-16BE".
/// `constexpr` implies both `const` and constant initialization.
constexpr const char *const CHSETS_UTF16BE[] = {
    "UTF-16BE", "UTF16BE", "UCS-2BE", "UCS2BE",
    nullptr // end-of-list sentinel
};
89 else if (c < 0x200000)
95 else if (c < 0x4000000)
101 else if (c < 0x80000000)
112 for (
int i =ret; --i > 0;)
114 dst[i] =
T_Utf8((c &0x3F) | 0x80);
117 dst[0] =
T_Utf8(c |((
const unsigned char*)
"\xC0\xE0\xF0\xF8\xFC")[ret-2]);
136 const auto bytes = u32toutf8(uc, buf);
140 auto ret = std::to_chars(uc_hex, uc_hex+
sizeof uc_hex, uc, 16);
141 throw std::runtime_error{
"u32toutf8(u+" + std::string{uc_hex,ret.ptr} +
") returns " + std::to_string(bytes)};
143 return {
reinterpret_cast<char*
>(buf),
size_t(bytes)};
151 while ((n = uin.get(u8)) > 0)
152 ret.append(
reinterpret_cast<char*
>(u8),
size_t(n));
155 throw std::runtime_error{
"UTF-8 conversion error " + std::to_string(n)};
164 m_Src(std::move(readc)),
176 m_Src([&]()->std::optional<char> {
178 if (
static_cast<bool>(in.get(ch)))
219 (this->*m_ReadMethod)();
244 dst[0] =
T_Utf16((c >>10) |0xD800);
245 dst[1] =
T_Utf16((c &0x3FF) |0xDC00);
256 ret = u32toutf8(c, dst);
261void C_UnicodeIn::ingestMBCS()
265 if (
auto size = m_Src.size())
268 const auto utf16 = std::make_unique<wchar_t[]>(size);
269 if (
int wn = MultiByteToWideChar(m_CodePage, MB_ERR_INVALID_CHARS, m_Src.buffer(),
int(size), utf16.get(),
int(size)))
271 FC_ReadMem
read({
reinterpret_cast<char*
>(utf16.get()), size_t(wn*2)});
272 C_Source src(std::move(
read));
273 while (readUTF16(src,
false));
279 static constinit const char *
const TO_UCS4 = std::endian::native == std::endian::little?
"UCS-4LE":
"UCS-4BE";
280 static_assert(std::endian::native == std::endian::little || std::endian::native == std::endian::big);
281 for (
T_Encoding i = m_CodePage; *i && m_iconv == (iconv_t)(-1); ++i)
282 m_iconv = iconv_open(TO_UCS4, *i);
284 if (m_iconv == (iconv_t)(-1))
290 const auto ucs4 = std::make_unique<T_Utf32[]>(size);
291 size_t size_ucs4 = size * 4;
292 auto src =
const_cast<char*
>(m_Src.buffer());
293 auto dst =
reinterpret_cast<char*
>(ucs4.get());
294 if (
size_t(-1) != iconv(m_iconv, &src, &size, &dst, &size_ucs4))
297 for (
const T_Utf32 *i = ucs4.get(); i <
reinterpret_cast<T_Utf32*
>(dst); m_GetQ.push(*i++));
298 m_Src.pop(m_Src.size());
306 for (
const T_Utf32 *i = ucs4.get(); i <
reinterpret_cast<T_Utf32*
>(dst); m_GetQ.push(*i++));
307 m_Src.pop(m_Src.size()-size);
317void C_UnicodeIn::init()
321 switch (m_Src.size())
324 switch (m_Src.getUtf32(0,
false))
328 m_ReadMethod = &C_UnicodeIn::readUTF32;
332 m_ReadMethod = &C_UnicodeIn::readReverseUTF32;
338 switch (m_Src.getUtf16(0,
false))
342 m_ReadMethod = &C_UnicodeIn::readUTF16;
346 m_ReadMethod = &C_UnicodeIn::readReverseUTF16;
349 if (m_Src.size() >= 3 && 0 == memcmp(m_Src.buffer(), u8
"\uFEFF", 3))
353 setCodePage(CHSETS_UTF8);
354 m_ReadMethod = &C_UnicodeIn::readCodePage;
362 const auto size = m_Src.size();
365 if (size && size % 4 == 0)
367 size_t n_u32_chars = 0, n_u32rev_chars = 0;
368 const auto p_dwords =
reinterpret_cast<const T_Utf32*
>(m_Src.buffer());
369 const size_t n = size / 4;
370 for (
size_t i = 0; i < n; ++i)
372 if (
auto u32 = p_dwords[i])
374 bool matched =
false;
375 if (u32 == (u32 & 0xFFFFFF))
381 u32 = std::byteswap(u32);
382 if (u32 == (u32 & 0xFFFFFF))
392 const auto n_overflow = n_u32_chars + n_u32rev_chars - n;
393 if (n_u32_chars <= n_overflow)
395 m_ReadMethod = &C_UnicodeIn::readReverseUTF32;
398 if (n_u32rev_chars <= n_overflow)
400 m_ReadMethod = &C_UnicodeIn::readUTF32;
407 int mask = IS_TEXT_UNICODE_UNICODE_MASK;
408 if (IsTextUnicode(m_Src.buffer(),
int(size), &mask) || mask)
410 m_ReadMethod = &C_UnicodeIn::readUTF16;
413 mask = IS_TEXT_UNICODE_REVERSE_MASK;
414 if (IsTextUnicode(m_Src.buffer(),
int(size), &mask) || mask)
416 m_ReadMethod = &C_UnicodeIn::readReverseUTF16;
426 m_ReadMethod = &C_UnicodeIn::readASCII;
430 if (*m_Src.buffer() &0x80)
433 m_GetQ.push(
T_Utf8(*m_Src.buffer()));
442void C_UnicodeIn::readASCII()
447 const auto c =
T_Utf8(*m_Src.buffer());
461 m_Src.readTillCtrl();
468 m_ReadMethod = &C_UnicodeIn::readCodePage;
475bool C_UnicodeIn::guessCodePage()
477 if (m_Src.size() < 2) [[unlikely]]
480 static constinit const T_Encoding MBCS_CODEPAGES[] ={
483 932, 936, 949, 950, 951,
486 CHSETS_UTF8, CHSETS_SJIS, CHSETS_GB, CHSETS_KSC, CHSETS_BIG5, CHSETS_UTF7, CHSETS_UTF16LE, CHSETS_UTF16BE
489 for (
auto i: MBCS_CODEPAGES)
496 m_ReadMethod = &C_UnicodeIn::readCodePage;
503bool C_UnicodeIn::readUTF16(C_Source &src,
bool reverseWord)
507 const size_t read = src.size();
510 const auto uc = src.getUtf16(0,reverseWord);
511 if (0xD800 <= uc && uc < 0xDC00)
517 const T_Utf16 uc2 = src.getUtf16(1,reverseWord);
518 if (0xDC00 <= uc2 && uc2 < 0xE000)
522 m_GetQ.push(
T_Utf32((((uc&0x3FF)<<10)|(uc2&0x3FF))+0x10000));
532 else if (0xDC00 <= uc && uc < 0xE000)
548void C_UnicodeIn::readUTF16()
550 readUTF16(m_Src,
false);
553void C_UnicodeIn::readReverseUTF16()
555 readUTF16(m_Src,
true);
558bool C_UnicodeIn::readUTF32(C_Source &src,
bool reverseWord)
562 const size_t read = src.size();
565 m_GetQ.push(src.getUtf32(0,reverseWord));
575void C_UnicodeIn::readUTF32()
577 readUTF32(m_Src,
false);
580void C_UnicodeIn::readReverseUTF32()
582 readUTF32(m_Src,
true);
585void C_UnicodeIn::readCodePage()
587 m_Src.readTillCtrl();
600void C_UnicodeIn::reset_iconv()
602 if (m_iconv != (iconv_t)(-1))
604 iconv_close(m_iconv);
605 m_iconv = (iconv_t)(-1);
610C_UnicodeIn::C_Source::C_Source(
FH_ReadChar &&readc)
noexcept:
611 m_ReadCh(std::move(readc)),
616const char *C_UnicodeIn::C_Source::buffer() const noexcept
618 return m_ReadBuf.data() + m_AvailBeg;
621T_Utf16 C_UnicodeIn::C_Source::getUtf16(
size_t pos,
bool reverseWord)
const
623 const size_t off = m_AvailBeg + pos * 2;
624 if (off + 2 > m_ReadBuf.size())
625 throw std::runtime_error{
"End of char " + std::to_string(off+2) +
" passes end of buffer"};
627 auto ret = *
reinterpret_cast<const T_Utf16*
>(m_ReadBuf.data() + off);
628 return reverseWord? std::byteswap(ret): ret;
631T_Utf32 C_UnicodeIn::C_Source::getUtf32(
size_t pos,
bool reverseWord)
const
633 const size_t off = m_AvailBeg + pos * 4;
634 if (off + 4 > m_ReadBuf.size())
635 throw std::runtime_error{
"End of char " + std::to_string(off+4) +
" passes end of buffer"};
637 auto ret = *
reinterpret_cast<const T_Utf32*
>(m_ReadBuf.data() + off);
638 return reverseWord? std::byteswap(ret): ret;
641void C_UnicodeIn::C_Source::pop(
size_t bytes)
644 if (m_AvailBeg > m_ReadBuf.size())
647 throw std::runtime_error{
"m_AvailBeg overflow"};
651void C_UnicodeIn::C_Source::read(
size_t bytes)
653 if (m_AvailBeg + bytes > m_ReadBuf.size())
657 m_ReadBuf.erase(0, m_AvailBeg);
660 bytes -= m_ReadBuf.size();
661 for (
size_t i = 0; i < bytes; ++i)
662 if (
auto c = m_ReadCh())
669void C_UnicodeIn::C_Source::readTillCtrl()
671 if (m_AvailBeg == m_ReadBuf.size())
677 while (
auto c = m_ReadCh())
686size_t C_UnicodeIn::C_Source::size() const noexcept
688 return m_ReadBuf.size() - m_AvailBeg;
C_UnicodeIn(FH_ReadChar &&readc, T_Encoding codepage=0)
int lastError() const noexcept
The common namespace of the bux library.
std::string_view to_utf8(T_Utf32 uc)
const char *const * T_Encoding
std::function< std::optional< char >()> FH_ReadChar
void read(const std::string &src, size_t &off, T &data) noexcept
const T_Encoding ENCODING_UTF8
std::uint16_t T_Utf16
UTF-16: You need T_Utf16[2] to hold the full range of Unicode.
std::uint8_t T_Utf8
UTF-8: You need T_Utf8[4] to hold the full range of Unicode.
@ UIE_NO_UNICODE_TRANSLATION
std::uint32_t T_Utf32
UTF-32 to cover the full range of codespace U+0000 ~ U+10FFFF.