9#pragma comment(lib, "Advapi32.lib")
12#elif defined(__unix__)
27 FC_ReadMem(std::string_view sv)
noexcept: m_Src(sv.data()), m_End(sv.data()+sv.size()) {}
28 std::optional<char> operator()() noexcept
39 const char *m_Src, *
const m_End;
50#elif defined(__unix__)
// Null-terminated list of alternative charset names for UTF-8.
const char *const CHSETS_UTF8[] = {
    "UTF-8",
    "UTF8",
    nullptr,
};
58bool testUtf16(
const char *src_,
size_t bytes)
64 size_t wordZeros =0, hiByteZeros =0;
65 for (
size_t i =0; i < bytes; i +=2)
67 const char *t =src_ +i;
68 if (!*
reinterpret_cast<const int16_t*
>(t))
73 if (!hiByteZeros || wordZeros)
// Null-terminated list of charset names for UTF-16 / UCS-2 input. The names
// are handed to iconv_open() one after another until a conversion descriptor
// is obtained.
// "UCS2" replaces the misspelled "USC2", which is not a charset name and
// therefore could never be accepted.
static const char *const CHSETS_UTF16[] = {
    "UCS-2",
    "UTF-16",
    "UCS2",
    "UTF16",
    nullptr,
};
78 iconv_t cd =(iconv_t)(-1);
79 for (
const char *
const *i =CHSETS_UTF16; *i && cd == (iconv_t)(-1); ++i)
80 cd =iconv_open(
"UCS-4", *i);
82 if (cd == (iconv_t)(-1))
86 const auto ucs4 = std::make_unique<T_Utf32[]>(bytes);
87 size_t size_ucs4 =bytes*4;
88 char *src =
const_cast<char*
>(src_);
89 char *dst =
reinterpret_cast<char*
>(ucs4.get());
90 bool ret = size_t(-1) != iconv(cd, &src, &bytes, &dst, &size_ucs4) ||
112 else if (c < 0x10000)
118 else if (c < 0x200000)
124 else if (c < 0x4000000)
130 else if (c < 0x80000000)
141 for (
int i =ret; --i > 0;)
143 dst[i] =
T_Utf8((c &0x3F) | 0x80);
146 dst[0] =
T_Utf8(c |((
const unsigned char*)
"\xC0\xE0\xF0\xF8\xFC")[ret-2]);
160 const auto bytes = u32toutf8(uc, buf);
164 return {
reinterpret_cast<char*
>(buf),
size_t(bytes)};
167std::string
to_utf8(std::string_view s, T_Encoding codepage)
172std::string
to_utf8(std::istream &in, T_Encoding codepage)
193 while ((n = cvt.get(u8)) > 0)
194 ret.append(
reinterpret_cast<char*
>(u8),
size_t(n));
202std::wstring
BOM(
const std::wstring &ws)
211 m_Src(std::move(readc)),
223 m_Src([&]()->std::optional<char> {
225 if (
static_cast<bool>(in.get(ch)))
266 (this->*m_ReadMethod)();
291 dst[0] =
T_Utf16((c >>10) |0xD800);
292 dst[1] =
T_Utf16((c &0x3FF) |0xDC00);
303 ret = u32toutf8(c, dst);
308void C_UnicodeIn::ingestMBCS()
312 if (
auto size = m_Src.size())
315 const auto utf16 = std::make_unique<wchar_t[]>(size);
316 if (
int wn = MultiByteToWideChar(m_CodePage, MB_ERR_INVALID_CHARS, m_Src.buffer(),
int(size), utf16.get(),
int(size)))
318 FC_ReadMem
read({
reinterpret_cast<char*
>(utf16.get()), size_t(wn*2)});
319 C_Source src(std::move(
read));
320 while (readUTF16(src,
false));
325#elif defined(__unix__)
326 for (T_Encoding i =m_CodePage; *i && m_iconv == (iconv_t)(-1); ++i)
327 m_iconv =iconv_open(
"UCS-4LE", *i);
329 if (m_iconv == (iconv_t)(-1))
335 const auto ucs4 = std::make_unique<T_Utf32[]>(size);
336 size_t size_ucs4 =size*4;
337 auto src =
const_cast<char*
>(m_Src.buffer());
338 auto dst =
reinterpret_cast<char*
>(ucs4.get());
339 if (
size_t(-1) != iconv(m_iconv, &src, &size, &dst, &size_ucs4))
342 for (
const T_Utf32 *i = ucs4.get(); i <
reinterpret_cast<T_Utf32*
>(dst); m_GetQ.
push(le32toh(*i++)));
343 m_Src.pop(m_Src.size());
351 for (
const T_Utf32 *i =ucs4.get(); i <
reinterpret_cast<T_Utf32*
>(dst); m_GetQ.
push(le32toh(*i++)));
352 m_Src.pop(m_Src.size()-size);
362void C_UnicodeIn::init()
365 switch (m_Src.size())
368 switch (m_Src.getUtf16(0))
372 m_ReadMethod = &C_UnicodeIn::readUTF16;
376 m_ReadMethod = &C_UnicodeIn::readReverseUTF16;
380 if (m_Src.size() >= 3 && 0 == memcmp(m_Src.buffer(),
"\xef\xbb\xbf", 3))
384 setCodePage(CHSETS_UTF8);
385 m_ReadMethod = &C_UnicodeIn::readCodePage;
393 const auto size = m_Src.size();
395 int mask = IS_TEXT_UNICODE_UNICODE_MASK;
396 if (IsTextUnicode(m_Src.buffer(),
int(size), &mask))
397#elif defined(__unix__)
398 if (testUtf16(m_Src.buffer(), size))
401 m_ReadMethod = &C_UnicodeIn::readUTF16;
404 const auto revBuf = std::make_unique<char[]>(size);
405 swab(
const_cast<char*
>(m_Src.buffer()), revBuf.get(),
int(size));
407 mask = IS_TEXT_UNICODE_UNICODE_MASK;
408 if (IsTextUnicode(revBuf.get(),
int(size), &mask))
409#elif defined(__unix__)
410 if (testUtf16(revBuf.get(), size))
413 m_ReadMethod = &C_UnicodeIn::readReverseUTF16;
419 m_ReadMethod = &C_UnicodeIn::readASCII;
423 if (*m_Src.buffer() &0x80)
435void C_UnicodeIn::readASCII()
440 const auto c =
T_Utf8(*m_Src.buffer());
450 m_Src.readTillCtrl();
457 m_ReadMethod = &C_UnicodeIn::readCodePage;
// Null-terminated list of charset name candidates for Japanese text.
static constexpr const char *CHSETS_SJIS[] = {
    "CP932",
    "EUC-JP",
    "SHIFT_JIS",
    "SHIFT-JIS",
    "SJIS",
    nullptr,
};
// Null-terminated list of charset name candidates for simplified Chinese text.
static constexpr const char *CHSETS_GB[] = {
    "CP936",
    "EUC-CN",
    "GB18030",
    "GBK",
    nullptr,
};
// Null-terminated list of charset name candidates for Korean text.
static constexpr const char *CHSETS_KSC[] = {
    "CP949",
    "EUC-KR",
    "JOHAB",
    nullptr,
};
// Null-terminated list of charset name candidates for traditional Chinese text.
static constexpr const char *CHSETS_BIG5[] = {
    "CP950",
    "EUC-TW",
    "BIG5-HKSCS",
    "BIG5HKSCS",
    "BIG-5",
    "BIG5",
    nullptr,
};
// Null-terminated list of alternative charset names for UTF-7.
static constexpr const char *CHSETS_UTF7[] = {
    "UTF-7",
    "UTF7",
    nullptr,
};
// Null-terminated list of alternative charset names for little-endian
// UTF-16 / UCS-2.
// "UCS2LE" replaces the misspelled "USC2LE", which is not a charset name and
// so was a dead entry in the fallback list.
static const char *const CHSETS_UTF16LE[] = {
    "UCS-2LE",
    "UTF-16LE",
    "UCS2LE",
    "UTF16LE",
    nullptr,
};
// Null-terminated list of alternative charset names for big-endian
// UTF-16 / UCS-2.
// "UCS2BE" replaces the misspelled "USC2BE", which is not a charset name and
// so was a dead entry in the fallback list.
static const char *const CHSETS_UTF16BE[] = {
    "UCS-2BE",
    "UTF-16BE",
    "UCS2BE",
    "UTF16BE",
    nullptr,
};
471 static const T_Encoding MBCS_CODEPAGES[] ={
474 932, 936, 949, 950, 951,
476#elif defined(__unix__)
477 CHSETS_UTF8, CHSETS_SJIS, CHSETS_GB, CHSETS_KSC, CHSETS_BIG5, CHSETS_UTF7, CHSETS_UTF16LE, CHSETS_UTF16BE
480 for (
size_t i = 0; i < std::size(MBCS_CODEPAGES); ++i)
481 if (testCodePage(MBCS_CODEPAGES[i]))
483 m_ReadMethod = &C_UnicodeIn::readCodePage;
489bool C_UnicodeIn::readUTF16(C_Source &src,
bool reverseWord)
493 const size_t read = src.size();
496 const T_Utf16 uc = src.getUtf16(0,reverseWord);
497 if (0xD800 <= uc && uc < 0xDC00)
503 const T_Utf16 uc2 = src.getUtf16(1,reverseWord);
504 if (0xDC00 <= uc2 && uc2 < 0xE000)
508 m_GetQ.
push(
T_Utf32((((uc&0x3FF)<<10)|(uc2&0x3FF))+0x10000));
518 else if (0xDC00 <= uc && uc < 0xE000)
534void C_UnicodeIn::readUTF16()
536 readUTF16(m_Src,
false);
539void C_UnicodeIn::readReverseUTF16()
541 readUTF16(m_Src,
true);
544void C_UnicodeIn::readCodePage()
546 m_Src.readTillCtrl();
550void C_UnicodeIn::setCodePage(T_Encoding cp)
558bool C_UnicodeIn::testCodePage(T_Encoding cp)
567void C_UnicodeIn::reset_iconv()
569 if (m_iconv != (iconv_t)(-1))
571 iconv_close(m_iconv);
572 m_iconv = (iconv_t)(-1);
577C_UnicodeIn::C_Source::C_Source(
FH_ReadChar &&readc)
noexcept:
578 m_ReadCh(std::move(readc)),
583const char *C_UnicodeIn::C_Source::buffer() const noexcept
585 return m_ReadBuf.data() + m_AvailBeg;
588T_Utf16 C_UnicodeIn::C_Source::getUtf16(
size_t pos,
bool reverseWord)
const
590 const size_t off = m_AvailBeg +pos *2;
591 if (off +2 > m_ReadBuf.size())
594 const auto p = m_ReadBuf.data() + off;
609void C_UnicodeIn::C_Source::pop(
size_t bytes)
612 if (m_AvailBeg > m_ReadBuf.size())
619void C_UnicodeIn::C_Source::read(
size_t bytes)
621 if (m_AvailBeg + bytes > m_ReadBuf.size())
625 m_ReadBuf.erase(0, m_AvailBeg);
628 bytes -= m_ReadBuf.size();
629 for (
size_t i = 0; i < bytes; ++i)
630 if (
auto c = m_ReadCh())
637void C_UnicodeIn::C_Source::readTillCtrl()
639 if (m_AvailBeg == m_ReadBuf.size())
645 while (
auto c = m_ReadCh())
654size_t C_UnicodeIn::C_Source::size() const noexcept
656 return m_ReadBuf.size() - m_AvailBeg;
660 m_u32s (std::move(other.m_u32s)),
661 m_str (std::move(other.m_str)),
662 m_pushCh (other.m_pushCh),
663 m_codepage (other.m_codepage)
669 m_u32s = std::move(other.m_u32s);
670 m_str = std::move(other.m_str);
671 m_pushCh = other.m_pushCh;
672 m_codepage = other.m_codepage;
685 appendNonRaw(s.data(), s.size());
691 srcBytes = strlen(src);
695 m_str.assign(src, srcBytes);
699 m_str.append(src, srcBytes);
701 appendNonRaw(src, srcBytes);
704void C_MBCStr::appendNonRaw(
const char *src,
size_t srcBytes)
const
706 C_UnicodeIn uin(std::string_view{src, srcBytes}, m_codepage);
715void C_MBCStr::appendStr(
T_Utf32 u32)
const
721 const auto rc = u32toutf8(u32, buf);
723 RUNTIME_ERROR(
"MBC string conversion error {} after {} bytes converted", rc, m_str.size());
725 m_str.append(
reinterpret_cast<char*
>(buf),
size_t(rc));
728 m_pushCh(m_str,
static_cast<char>(u32));
733 return m_str.empty() && m_u32s.empty();
738 if (!m_pushCh && !m_str.empty())
745 if (m_pushCh != pushCh)
757 return escape([](std::string &dst,
char c) {
767 static constexpr char JSON_ESCS[] =
"\b\f\n\r\t\"\\";
768 if (
auto p = strchr(JSON_ESCS, c))
771 dst +=
"bfnrt\"\\"[p - JSON_ESCS];
// A JSON \u escape is followed by exactly four HEXADECIMAL digits, so the
// byte value must be printed in hex. The previous decimal "{:04}" spec
// emitted e.g. \u0031 (the character '1') for the control byte 0x1F.
dst += std::format("\\u{:04x}", int{static_cast<unsigned char>(c)});
782 return escape([](std::string &dst,
char c) { dst += c; });
#define RUNTIME_ERROR(fmtStr,...)
Wrap FILE(DATE)#__LINE__ FUNCTION: msg into std::runtime_error.
void append(const char *src, size_t srcBytes)
const std::string & escape(F_PushCh pushCh) const
const std::string & escJSON() const
C_MBCStr & operator=(const C_MBCStr &)=delete
void operator+=(std::string_view s)
bool empty() const noexcept
const std::string & strU8() const
C_MBCStr(T_Encoding codepage=0) noexcept
C_UnicodeIn(FH_ReadChar &&readc, T_Encoding codepage=0)
int lastError() const noexcept
The common namespace of the bux library.
std::string_view to_utf8(T_Utf32 uc)
std::wstring BOM(const std::wstring &ws)
std::function< std::optional< char >()> FH_ReadChar
void read(const std::string &src, size_t &off, T &data) noexcept
std::uint16_t T_Utf16
UTF-16: You need T_Utf16[2] to hold the full range of Unicode.
std::uint8_t T_Utf8
UTF-8: You need T_Utf8[4] to hold the full range of Unicode.
@ UIE_NO_UNICODE_TRANSLATION
std::uint32_t T_Utf32
UTF-32 to cover the full range of codespace U+0000 ~ U+10FFFF.