10#pragma comment(lib, "Advapi32.lib")
26 FC_ReadMem(std::string_view sv)
noexcept: m_Src(sv.data()), m_End(sv.data()+sv.size()) {}
27 std::optional<char> operator()() noexcept
38 const char *m_Src, *
const m_End;
/// Null-terminated list of alternative charset names for Japanese
/// multi-byte text (CP932 / EUC-JP / Shift-JIS spellings).
/// The trailing null pointer marks the end of the list.
constinit const char *const CHSETS_SJIS[] = {
    "CP932",
    "EUC-JP",
    "SHIFT_JIS",
    "SHIFT-JIS",
    "SJIS",
    nullptr  // sentinel
};
/// Null-terminated list of alternative charset names for simplified-Chinese
/// multi-byte text (CP936 / EUC-CN / GB18030 / GBK).
/// The trailing null pointer marks the end of the list.
constinit const char *const CHSETS_GB[] = {
    "CP936",
    "EUC-CN",
    "GB18030",
    "GBK",
    nullptr  // sentinel
};
/// Null-terminated list of alternative charset names for Korean
/// multi-byte text (CP949 / EUC-KR / JOHAB).
/// The trailing null pointer marks the end of the list.
constinit const char *const CHSETS_KSC[] = {
    "CP949",
    "EUC-KR",
    "JOHAB",
    nullptr  // sentinel
};
/// Null-terminated list of alternative charset names for traditional-Chinese
/// multi-byte text (CP950 / EUC-TW / Big5 and Big5-HKSCS spellings).
/// The trailing null pointer marks the end of the list.
constinit const char *const CHSETS_BIG5[] = {
    "CP950",
    "EUC-TW",
    "BIG5-HKSCS",
    "BIG5HKSCS",
    "BIG-5",
    "BIG5",
    nullptr  // sentinel
};
/// Null-terminated list of alternative charset names for UTF-8 text.
/// The trailing null pointer marks the end of the list.
constinit const char *const CHSETS_UTF8[] = {
    "UTF-8",
    "UTF8",
    nullptr  // sentinel
};
/// Null-terminated list of alternative charset names for UTF-7 text.
/// The trailing null pointer marks the end of the list.
constinit const char *const CHSETS_UTF7[] = {
    "UTF-7",
    "UTF7",
    nullptr  // sentinel
};
/// Null-terminated list of alternative charset names for little-endian
/// UTF-16 / UCS-2 text.
/// The trailing null pointer marks the end of the list.
constinit const char *const CHSETS_UTF16LE[] = {
    "UTF-16LE",
    "UTF16LE",
    "UCS-2LE",
    "UCS2LE",  // hyphen-less spelling of UCS-2LE (was misspelled "USC2LE")
    nullptr    // sentinel
};
/// Null-terminated list of alternative charset names for big-endian
/// UTF-16 / UCS-2 text.
/// The trailing null pointer marks the end of the list.
constinit const char *const CHSETS_UTF16BE[] = {
    "UTF-16BE",
    "UTF16BE",
    "UCS-2BE",
    "UCS2BE",  // hyphen-less spelling of UCS-2BE (was misspelled "USC2BE")
    nullptr    // sentinel
};
89 else if (c < 0x200000)
95 else if (c < 0x4000000)
101 else if (c < 0x80000000)
112 for (
int i =ret; --i > 0;)
114 dst[i] =
T_Utf8((c &0x3F) | 0x80);
117 dst[0] =
T_Utf8(c |((
const unsigned char*)
"\xC0\xE0\xF0\xF8\xFC")[ret-2]);
136 const auto bytes = u32toutf8(uc, buf);
140 return {
reinterpret_cast<char*
>(buf),
size_t(bytes)};
148 while ((n = uin.get(u8)) > 0)
149 ret.append(
reinterpret_cast<char*
>(u8),
size_t(n));
161 m_Src(std::move(readc)),
173 m_Src([&]()->std::optional<char> {
175 if (
static_cast<bool>(in.get(ch)))
216 (this->*m_ReadMethod)();
241 dst[0] =
T_Utf16((c >>10) |0xD800);
242 dst[1] =
T_Utf16((c &0x3FF) |0xDC00);
253 ret = u32toutf8(c, dst);
258void C_UnicodeIn::ingestMBCS()
262 if (
auto size = m_Src.size())
265 const auto utf16 = std::make_unique<wchar_t[]>(size);
266 if (
int wn = MultiByteToWideChar(m_CodePage, MB_ERR_INVALID_CHARS, m_Src.buffer(),
int(size), utf16.get(),
int(size)))
268 FC_ReadMem
read({
reinterpret_cast<char*
>(utf16.get()), size_t(wn*2)});
269 C_Source src(std::move(
read));
270 while (readUTF16(src,
false));
276 static constinit const char *
const TO_UCS4 = std::endian::native == std::endian::little?
"UCS-4LE":
"UCS-4BE";
277 static_assert(std::endian::native == std::endian::little || std::endian::native == std::endian::big);
278 for (
T_Encoding i = m_CodePage; *i && m_iconv == (iconv_t)(-1); ++i)
279 m_iconv = iconv_open(TO_UCS4, *i);
281 if (m_iconv == (iconv_t)(-1))
287 const auto ucs4 = std::make_unique<T_Utf32[]>(size);
288 size_t size_ucs4 = size * 4;
289 auto src =
const_cast<char*
>(m_Src.buffer());
290 auto dst =
reinterpret_cast<char*
>(ucs4.get());
291 if (
size_t(-1) != iconv(m_iconv, &src, &size, &dst, &size_ucs4))
294 for (
const T_Utf32 *i = ucs4.get(); i <
reinterpret_cast<T_Utf32*
>(dst); m_GetQ.push(*i++));
295 m_Src.pop(m_Src.size());
303 for (
const T_Utf32 *i = ucs4.get(); i <
reinterpret_cast<T_Utf32*
>(dst); m_GetQ.push(*i++));
304 m_Src.pop(m_Src.size()-size);
314void C_UnicodeIn::init()
318 switch (m_Src.size())
321 switch (m_Src.getUtf32(0,
false))
325 m_ReadMethod = &C_UnicodeIn::readUTF32;
329 m_ReadMethod = &C_UnicodeIn::readReverseUTF32;
335 switch (m_Src.getUtf16(0,
false))
339 m_ReadMethod = &C_UnicodeIn::readUTF16;
343 m_ReadMethod = &C_UnicodeIn::readReverseUTF16;
346 if (m_Src.size() >= 3 && 0 == memcmp(m_Src.buffer(), u8
"\uFEFF", 3))
350 setCodePage(CHSETS_UTF8);
351 m_ReadMethod = &C_UnicodeIn::readCodePage;
359 const auto size = m_Src.size();
362 if (size && size % 4 == 0)
364 size_t n_u32_chars = 0, n_u32rev_chars = 0;
365 const auto p_dwords =
reinterpret_cast<const T_Utf32*
>(m_Src.buffer());
366 const size_t n = size / 4;
367 for (
size_t i = 0; i < n; ++i)
369 if (
auto u32 = p_dwords[i])
371 bool matched =
false;
372 if (u32 == (u32 & 0xFFFFFF))
378 u32 = std::byteswap(u32);
379 if (u32 == (u32 & 0xFFFFFF))
389 const auto n_overflow = n_u32_chars + n_u32rev_chars - n;
390 if (n_u32_chars <= n_overflow)
392 m_ReadMethod = &C_UnicodeIn::readReverseUTF32;
395 if (n_u32rev_chars <= n_overflow)
397 m_ReadMethod = &C_UnicodeIn::readUTF32;
404 int mask = IS_TEXT_UNICODE_UNICODE_MASK;
405 if (IsTextUnicode(m_Src.buffer(),
int(size), &mask) || mask)
407 m_ReadMethod = &C_UnicodeIn::readUTF16;
410 mask = IS_TEXT_UNICODE_REVERSE_MASK;
411 if (IsTextUnicode(m_Src.buffer(),
int(size), &mask) || mask)
413 m_ReadMethod = &C_UnicodeIn::readReverseUTF16;
423 m_ReadMethod = &C_UnicodeIn::readASCII;
427 if (*m_Src.buffer() &0x80)
430 m_GetQ.push(
T_Utf8(*m_Src.buffer()));
439void C_UnicodeIn::readASCII()
444 const auto c =
T_Utf8(*m_Src.buffer());
458 m_Src.readTillCtrl();
465 m_ReadMethod = &C_UnicodeIn::readCodePage;
472bool C_UnicodeIn::guessCodePage()
474 if (m_Src.size() < 2) [[unlikely]]
477 static constinit const T_Encoding MBCS_CODEPAGES[] ={
480 932, 936, 949, 950, 951,
483 CHSETS_UTF8, CHSETS_SJIS, CHSETS_GB, CHSETS_KSC, CHSETS_BIG5, CHSETS_UTF7, CHSETS_UTF16LE, CHSETS_UTF16BE
486 for (
auto i: MBCS_CODEPAGES)
493 m_ReadMethod = &C_UnicodeIn::readCodePage;
500bool C_UnicodeIn::readUTF16(C_Source &src,
bool reverseWord)
504 const size_t read = src.size();
507 const auto uc = src.getUtf16(0,reverseWord);
508 if (0xD800 <= uc && uc < 0xDC00)
514 const T_Utf16 uc2 = src.getUtf16(1,reverseWord);
515 if (0xDC00 <= uc2 && uc2 < 0xE000)
519 m_GetQ.push(
T_Utf32((((uc&0x3FF)<<10)|(uc2&0x3FF))+0x10000));
529 else if (0xDC00 <= uc && uc < 0xE000)
545void C_UnicodeIn::readUTF16()
547 readUTF16(m_Src,
false);
550void C_UnicodeIn::readReverseUTF16()
552 readUTF16(m_Src,
true);
555bool C_UnicodeIn::readUTF32(C_Source &src,
bool reverseWord)
559 const size_t read = src.size();
562 m_GetQ.push(src.getUtf32(0,reverseWord));
572void C_UnicodeIn::readUTF32()
574 readUTF32(m_Src,
false);
577void C_UnicodeIn::readReverseUTF32()
579 readUTF32(m_Src,
true);
582void C_UnicodeIn::readCodePage()
584 m_Src.readTillCtrl();
597void C_UnicodeIn::reset_iconv()
599 if (m_iconv != (iconv_t)(-1))
601 iconv_close(m_iconv);
602 m_iconv = (iconv_t)(-1);
607C_UnicodeIn::C_Source::C_Source(
FH_ReadChar &&readc)
noexcept:
608 m_ReadCh(std::move(readc)),
613const char *C_UnicodeIn::C_Source::buffer() const noexcept
615 return m_ReadBuf.data() + m_AvailBeg;
618T_Utf16 C_UnicodeIn::C_Source::getUtf16(
size_t pos,
bool reverseWord)
const
620 const size_t off = m_AvailBeg + pos * 2;
621 if (off + 2 > m_ReadBuf.size())
624 auto ret = *
reinterpret_cast<const T_Utf16*
>(m_ReadBuf.data() + off);
625 return reverseWord? std::byteswap(ret): ret;
628T_Utf32 C_UnicodeIn::C_Source::getUtf32(
size_t pos,
bool reverseWord)
const
630 const size_t off = m_AvailBeg + pos * 4;
631 if (off + 4 > m_ReadBuf.size())
634 auto ret = *
reinterpret_cast<const T_Utf32*
>(m_ReadBuf.data() + off);
635 return reverseWord? std::byteswap(ret): ret;
638void C_UnicodeIn::C_Source::pop(
size_t bytes)
641 if (m_AvailBeg > m_ReadBuf.size())
648void C_UnicodeIn::C_Source::read(
size_t bytes)
650 if (m_AvailBeg + bytes > m_ReadBuf.size())
654 m_ReadBuf.erase(0, m_AvailBeg);
657 bytes -= m_ReadBuf.size();
658 for (
size_t i = 0; i < bytes; ++i)
659 if (
auto c = m_ReadCh())
666void C_UnicodeIn::C_Source::readTillCtrl()
668 if (m_AvailBeg == m_ReadBuf.size())
674 while (
auto c = m_ReadCh())
683size_t C_UnicodeIn::C_Source::size() const noexcept
685 return m_ReadBuf.size() - m_AvailBeg;
#define RUNTIME_ERROR(fmtStr,...)
Wrap FILE(DATE)#__LINE__ FUNCTION: msg into std::runtime_error.
C_UnicodeIn(FH_ReadChar &&readc, T_Encoding codepage=0)
int lastError() const noexcept
The common namespace of the bux library.
std::string_view to_utf8(T_Utf32 uc)
const char *const * T_Encoding
std::function< std::optional< char >()> FH_ReadChar
void read(const std::string &src, size_t &off, T &data) noexcept
const T_Encoding ENCODING_UTF8
std::uint16_t T_Utf16
UTF-16: You need T_Utf16[2] to hold the full range of Unicode.
std::uint8_t T_Utf8
UTF-8: You need T_Utf8[4] to hold the full range of Unicode.
@ UIE_NO_UNICODE_TRANSLATION
std::uint32_t T_Utf32
UTF-32 to cover the full range of codespace U+0000 ~ U+10FFFF.