bux API Reference 1.12.5
A static library of general-purpose utilities that Modern C++ does not directly support, plus reusable code that originated in my side projects.
Loading...
Searching...
No Matches
UnicodeCvt.h
Go to the documentation of this file.
1#pragma once
2
3#include "XQue.h" // bux::C_Queue<>
4#include <cstdint> // std::uint8_t, std::uint16_t, std::uint32_t
5#include <functional> // std::function<>
6#include <iosfwd> // Forwarded std::istream
7#include <optional> // std::optional<>
8#include <string> // std::string
9#include <string_view> // std::string_view
10#include <vector> // std::vector<>
11
12#ifndef _WIN32
13#include <iconv.h> // iconv_t
14#endif
15
16namespace bux {
17
18//
19// Constants
20//
21enum
22{
24 MAX_UTF8 = 6, // Longest UTF-8 sequence, in bytes, under the UTF-8 encoding limit (31 bits UCS-4)
25 MAX_UTF8_BMP = 3, // Longest UTF-8 sequence, in bytes, within Unicode Plane 0: Basic Multilingual Plane
26 MAX_UTF8_VALID = 4, // Longest UTF-8 sequence, in bytes, within Unicode Plane 0~16
27
28 // Error codes for C_UnicodeIn::get()
35};
36
37//
38// Types
39//
/// UTF-32 to cover the full range of codespace U+0000 ~ U+10FFFF.
using T_Utf32 = std::uint32_t;
/// UTF-16: You need T_Utf16[2] to hold full range of unicode.
using T_Utf16 = std::uint16_t;
/// UTF-8: You need T_Utf8[4] to hold full range of unicode.
using T_Utf8 = std::uint8_t;

// Compile-time guard: each code-unit alias must have exactly the width its name states.
static_assert(sizeof(T_Utf8) == 1);
static_assert(sizeof(T_Utf16) == 2);
static_assert(sizeof(T_Utf32) == 4);
47
/// Callable taking no arguments and returning std::optional<char>.
/// C_UnicodeIn accepts one of these as its input source.
/// NOTE(review): presumably an empty optional signals end of input -- confirm against the implementation.
using FH_ReadChar = std::function<std::optional<char>()>;
49
// Platform-dependent encoding identifier accepted by C_UnicodeIn and to_utf8().
#ifndef _WIN32
using T_Encoding = const char *const *; // null-terminated const array of const char pointers
#else
using T_Encoding = unsigned; // NOTE(review): presumably a Windows code page number -- confirm
#endif
55
// C_UnicodeIn: input reader constructible from a read-char callback, a string view,
// a C string or a std::istream, together with an encoding; the get() overloads hand
// the input out as T_Utf32, T_Utf16 or T_Utf8 code units.
// NOTE(review): member functions are only declared here; behavior notes below are
// limited to what the declarations and inline bodies show.
57{
58public:
59
60 // Ctor/Dtor
61 C_UnicodeIn(FH_ReadChar &&readc, T_Encoding codepage =0); // input supplied by the readc callback
62 C_UnicodeIn(std::string_view sv, T_Encoding codepage =0); // NOTE(review): sv is non-owning; presumably its data must outlive this object -- confirm
63 C_UnicodeIn(std::string &&s, T_Encoding codepage =0) = delete; // deleted: an rvalue std::string cannot be used to construct the reader
64 C_UnicodeIn(const char *s, T_Encoding codepage =0): C_UnicodeIn(std::string_view(s), codepage) {} // delegates to the string_view ctor; s must be null-terminated
65 C_UnicodeIn(std::istream &in, T_Encoding codepage =0);
66 ~C_UnicodeIn() noexcept;
67
68 // Nonvirtuals
69 int get(T_Utf32 &c); // NOTE(review): presumably the int result carries the UIE_* error codes on failure -- confirm
70 int get(T_Utf16 *dst);
71 int get(T_Utf8 *dst);
72 int lastError() const noexcept { return m_GetQ.empty()? m_ErrCode: 1; } // m_ErrCode when m_GetQ is empty, otherwise 1
73 T_Encoding encoding() const noexcept { return m_CodePage; } // current value of m_CodePage
74 bool withBOM() const { return m_BOMed; } // current value of m_BOMed
75
76private:
77
78 // Types
79 class C_Source // private helper wrapping the read-char callback and a byte buffer
80 {
81 public:
82
83 // Nonvirtuals
84 C_Source(FH_ReadChar &&readc) noexcept;
85 const char *buffer() const noexcept;
86 T_Utf16 getUtf16(size_t pos, bool reverseWord) const; // NOTE(review): reverseWord presumably selects byte-swapped decoding -- confirm
87 T_Utf32 getUtf32(size_t pos, bool reverseWord) const;
88 void pop(size_t bytes);
89 void read(size_t bytes);
90 void readTillCtrl();
91 size_t size() const noexcept;
92
93 private:
94
95 // Data
96 FH_ReadChar m_ReadCh; // the read-char callback
97 std::string m_ReadBuf;
98 size_t m_AvailBeg; // NOTE(review): presumably the offset of the first unconsumed byte in m_ReadBuf -- confirm
99 };
100
101 // Data
102 C_Source m_Src;
103 C_Queue<T_Utf32> m_GetQ; // queue of T_Utf32 values; lastError() reports 1 while it is non-empty
104 void (C_UnicodeIn::*m_ReadMethod)(){}; // pointer to a void() member function, value-initialized (null) here
105 T_Encoding m_CodePage;
106#ifndef _WIN32
107 iconv_t m_iconv{(iconv_t)-1}; // changed according to m_CodePage
108#endif
109 int m_ErrCode{UIE_EOF}; // starts out as UIE_EOF
110 bool m_BOMed = false; // value reported by withBOM()
111
112 // Nonvirtuals
113 bool guessCodePage();
114 void ingestMBCS();
115 void init();
116 void readCodePage();
117 void readASCII();
118 void readReverseUTF16();
119 void readReverseUTF32();
120 void readUTF16();
121 bool readUTF16(C_Source &src, bool reverseWord);
122 void readUTF32();
123 bool readUTF32(C_Source &src, bool reverseWord);
124 void setCodePage(T_Encoding cp);
125#ifndef _WIN32
126 void reset_iconv();
127#endif
128};
129
130//
131// Externs
132//
133extern const T_Encoding ENCODING_UTF8; // defined elsewhere; NOTE(review): presumably the T_Encoding value that selects UTF-8 -- confirm
134
135std::string_view to_utf8(T_Utf32 c); // NOTE(review): the storage behind the returned view is not visible here -- confirm its lifetime before keeping the view
136std::string to_utf8(C_UnicodeIn &&uin); // takes the reader as an rvalue; used by the to_utf8() templates below
137
138template<typename T>
139std::string to_utf8(const T *ps, size_t size = 0, T_Encoding codepage = 0)
140{
141 if (!size)
142 size = std::char_traits<T>::length(ps);
143
144 std::string_view view_as_chars{reinterpret_cast<const char*>(ps), size*sizeof(T)};
145 return to_utf8(C_UnicodeIn(view_as_chars, codepage));
146}
147template<typename T>
148std::string to_utf8(const std::basic_string<T> &s, T_Encoding codepage = 0)
149{
150 std::string_view view_as_chars{reinterpret_cast<const char*>(s.data()), s.size()*sizeof(T)};
151 return to_utf8(C_UnicodeIn(view_as_chars, codepage));
152}
153template<typename T>
154std::string to_utf8(std::basic_string_view<T> s, T_Encoding codepage = 0)
155{
156 std::string_view view_as_chars{reinterpret_cast<const char*>(s.data()), s.size()*sizeof(T)};
157 return to_utf8(C_UnicodeIn(view_as_chars, codepage));
158}
159
/// Returns a copy of `s` prefixed with a byte order mark (U+FEFF).
///
/// For element types wider than one byte the mark is the single code unit 0xFEFF;
/// for one-byte element types it is the UTF-8 encoding of U+FEFF.
///
/// @param s  String to prefix; it is not modified.
/// @return   A new string holding the mark followed by the contents of `s`.
template<typename T>
std::basic_string<T> BOM(const std::basic_string<T> &s)
{
    if constexpr (sizeof(T) > 1)
        return T(0xFEFF) + s;
    else
        // Named cast, matching the to_utf8() templates: the u8 literal is reinterpreted
        // as T code units on purpose.
        return reinterpret_cast<const T*>(u8"\uFEFF") + s;
}
/// Returns the contents of `sv` as a new string prefixed with a byte order mark (U+FEFF).
///
/// For element types wider than one byte the mark is the single code unit 0xFEFF;
/// for one-byte element types it is the UTF-8 encoding of U+FEFF.
///
/// @param sv  Text to prefix; it need not be null-terminated.
/// @return    A new string holding the mark followed by the contents of `sv`.
template<typename T>
std::basic_string<T> BOM(std::basic_string_view<T> sv)
{
    // Build into one named local: the storage is reserved once and the result is
    // returned without copying it out of a temporary through an lvalue reference.
    std::basic_string<T> ret;
    if constexpr (sizeof(T) > 1)
    {
        ret.reserve(sv.size() + 1);
        ret += T(0xFEFF);
    }
    else
    {
        const std::basic_string_view<T> mark{reinterpret_cast<const T*>(u8"\uFEFF")};
        ret.reserve(sv.size() + mark.size());
        ret.append(mark);
    }
    ret.append(sv);
    return ret;
}
/// Returns the null-terminated text at `p` as a new string prefixed with a byte order mark (U+FEFF).
///
/// For element types wider than one byte the mark is the single code unit 0xFEFF;
/// for one-byte element types it is the UTF-8 encoding of U+FEFF.
///
/// @param p  Null-terminated sequence of T; must not be null.
/// @return   A new string holding the mark followed by the text at `p`.
template<typename T>
std::basic_string<T> BOM(const T *p)
{
    const std::basic_string_view<T> text{p};

    // Build into one named local: the storage is reserved once and the result is
    // returned without copying it out of a temporary through an lvalue reference.
    std::basic_string<T> ret;
    if constexpr (sizeof(T) > 1)
    {
        ret.reserve(text.size() + 1);
        ret += T(0xFEFF);
    }
    else
    {
        const std::basic_string_view<T> mark{reinterpret_cast<const T*>(u8"\uFEFF")};
        ret.reserve(text.size() + mark.size());
        ret.append(mark);
    }
    ret.append(text);
    return ret;
}
184
185} // namespace bux
186
187using bux::T_Utf32; // make the code-unit aliases usable without the bux:: prefix
188using bux::T_Utf16;
189using bux::T_Utf8;
190
191static_assert(sizeof(T_Utf32) == 4); // width checks again, this time through the unqualified names
192static_assert(sizeof(T_Utf16) == 2);
193static_assert(sizeof(T_Utf8) == 1);
C_UnicodeIn(std::string &&s, T_Encoding codepage=0)=delete
int get(T_Utf32 &c)
bool withBOM() const
Definition UnicodeCvt.h:74
C_UnicodeIn(const char *s, T_Encoding codepage=0)
Definition UnicodeCvt.h:64
T_Encoding encoding() const noexcept
Definition UnicodeCvt.h:73
~C_UnicodeIn() noexcept
C_UnicodeIn(FH_ReadChar &&readc, T_Encoding codepage=0)
int lastError() const noexcept
Definition UnicodeCvt.h:72
The common namespace of the bux library.
Definition AtomiX.cpp:3
std::string_view to_utf8(T_Utf32 uc)
const char *const * T_Encoding
Definition UnicodeCvt.h:53
std::basic_string< T > BOM(const std::basic_string< T > &s)
Definition UnicodeCvt.h:161
std::function< std::optional< char >()> FH_ReadChar
Definition UnicodeCvt.h:48
void read(const std::string &src, size_t &off, T &data) noexcept
Definition Serialize.h:35
const T_Encoding ENCODING_UTF8
std::uint16_t T_Utf16
UTF-16: You need T_Utf16[2] to hold full range of unicode.
Definition UnicodeCvt.h:41
std::uint8_t T_Utf8
UTF-8: You need T_Utf8[4] to hold full range of unicode.
Definition UnicodeCvt.h:42
@ MAX_UTF8_VALID
Definition UnicodeCvt.h:26
@ MAX_UTF8
Definition UnicodeCvt.h:24
@ UIE_ILLFORMED_UNICODE
Definition UnicodeCvt.h:30
@ UIE_EOF
Definition UnicodeCvt.h:29
@ MAX_UTF16
Definition UnicodeCvt.h:23
@ UIE_INCOMPLETE_UNICODE
Definition UnicodeCvt.h:31
@ UIE_ICONV_FAIL
Definition UnicodeCvt.h:33
@ UIE_INTERNAL
Definition UnicodeCvt.h:34
@ UIE_NO_UNICODE_TRANSLATION
Definition UnicodeCvt.h:32
@ MAX_UTF8_BMP
Definition UnicodeCvt.h:25
std::uint32_t T_Utf32
UTF-32 to cover the full range of codespace U+0000 ~ U+10FFFF.
Definition UnicodeCvt.h:40