bux API Reference 1.9.0
A static library of general-purpose utilities that are commonly needed but not directly supported by modern C++, plus reusable code that originated in my side projects.
Loading...
Searching...
No Matches
UnicodeCvt.h
Go to the documentation of this file.
1#pragma once
2
3#include "XQue.h" // bux::C_Queue<>
4#include <cstdint> // std::uint8_t, std::uint16_t, std::uint32_t
5#include <functional> // std::function<>
6#include <iosfwd> // Forwarded std::istream
7#include <optional> // std::optional<>
8#include <string> // std::string
9#include <string_view> // std::string_view
10#include <vector> // std::vector<>
11
12#ifdef __unix__
13#include <iconv.h> // iconv_t
14#endif
15
16namespace bux {
17
18//
19// Constants
20//
// Unnamed enum that groups the MAX_* encoding limits with the UIE_* error
// codes (the latter are, per the comment below, errors for C_UnicodeIn::get()).
// NOTE(review): this listing jumps from line 22 to 24 and from 28 to 33. The
// symbol index of the same page names MAX_UTF16 (line 23) and UIE_EOF,
// UIE_ILLFORMED_UNICODE, UIE_INCOMPLETE_UNICODE, UIE_NO_UNICODE_TRANSLATION
// (lines 29-32); their values are not visible here, so consult the real header
// before relying on them.
21enum
22{
24 MAX_UTF8 = 6, // UTF-8 encoding limit (31 bits UCS-4)
25 MAX_UTF8_BMP = 3, // Unicode Plane 0: Basic Multilingual Plane
26 MAX_UTF8_VALID = 4, // Unicode Plane 0~16
27

28 // Error for C_UnicodeIn::get()
33 UIE_INTERNAL = -9
34};
35
36//
37// Types
38//
39typedef std::uint32_t T_Utf32;
40typedef std::uint16_t T_Utf16;
41typedef std::uint8_t T_Utf8;
42
43typedef std::function<std::optional<char>()> FH_ReadChar;
44
// Platform-dependent encoding identifier: a plain `unsigned` when _WIN32 is
// defined, otherwise a pointer to const char pointers when __unix__ is defined.
// NOTE(review): there is no #else branch, so T_Encoding stays undeclared when
// neither macro is defined — confirm every target toolchain defines one of them.
45#ifdef _WIN32
46typedef unsigned T_Encoding;
47#elif defined(__unix__)
48typedef const char *const *T_Encoding; // null-terminated const array of const char pointers
49#endif
50
52{
53public:
54
55 // Ctor/Dtor
56 C_UnicodeIn(FH_ReadChar &&readc, T_Encoding codepage =0);
57 C_UnicodeIn(std::string_view sv, T_Encoding codepage =0);
58 C_UnicodeIn(std::string &&s, T_Encoding codepage =0) = delete;
59 C_UnicodeIn(const char *s, T_Encoding codepage =0): C_UnicodeIn(std::string_view(s), codepage) {}
60 C_UnicodeIn(std::istream &in, T_Encoding codepage =0);
61 ~C_UnicodeIn() noexcept;
62
63 // Nonvirtuals
64 int get(T_Utf32 &c);
65 int get(T_Utf16 *dst);
66 int get(T_Utf8 *dst);
67 int lastError() const noexcept { return m_GetQ.empty()? m_ErrCode: 1; }
68 T_Encoding encoding() const noexcept { return m_CodePage; }
69
70private:
71
72 // Types
// Private helper type of the enclosing reader. It bundles a character-reading
// callback (FH_ReadChar, i.e. a callable returning std::optional<char>) with a
// std::string buffer and a size_t offset into it.
// NOTE(review): only declarations are visible here; the per-member notes below
// are inferred from names and signatures and should be confirmed against the
// implementation file.
73 class C_Source
74 {
75 public:
76

77 // Nonvirtuals
// Takes the read callback by rvalue reference.
78 C_Source(FH_ReadChar &&readc) noexcept;
// presumably a pointer to the currently available bytes — confirm
79 const char *buffer() const noexcept;
// presumably decodes one 16-bit unit at byte offset `pos`, byte-swapped when
// `reverseWord` is true — confirm
80 T_Utf16 getUtf16(size_t pos, bool reverseWord =false) const;
// presumably discards `bytes` bytes from the front of the available data — confirm
81 void pop(size_t bytes);
// presumably pulls `bytes` more bytes through the callback — confirm
82 void read(size_t bytes);
83 void readTillCtrl();
// presumably the number of currently available bytes — confirm
84 size_t size() const noexcept;
85

86 private:
87

88 // Data
89 FH_ReadChar m_ReadCh;
90 std::string m_ReadBuf;
// presumably the offset in m_ReadBuf where unconsumed data begins — confirm
91 size_t m_AvailBeg;
92 };
93
94 // Data
95 C_Source m_Src;
96 C_Queue<T_Utf32> m_GetQ;
97 void (C_UnicodeIn::*m_ReadMethod)(){};
98 T_Encoding m_CodePage;
99#ifdef __unix__
100 iconv_t m_iconv{(iconv_t)-1}; // changed according to m_CodePage
101#endif
102 int m_ErrCode{UIE_EOF};
103
104 // Nonvirtuals
105 void ingestMBCS();
106 void init();
107 void readCodePage();
108 void readASCII();
109 void readReverseUTF16();
110 bool readUTF16(C_Source &src, bool reverseWord);
111 void readUTF16();
112 void readUTF8();
113 void setCodePage(T_Encoding cp);
114 bool testCodePage(T_Encoding cp);
115#ifdef __unix__
116 void reset_iconv();
117#endif
118};
119
121{
122public:
123
124 // Types
125 typedef void (*F_PushCh)(std::string &dst, char c);
126
127 // Nonvirtuals
128 C_MBCStr(T_Encoding codepage = 0) noexcept: m_codepage(codepage) {}
129 C_MBCStr(const C_MBCStr&) = delete;
130 C_MBCStr &operator=(const C_MBCStr&) = delete;
131
132 C_MBCStr(C_MBCStr &&other) noexcept;
133 void operator=(C_MBCStr &&other) noexcept;
134
135 C_MBCStr(std::string_view s, T_Encoding codepage = 0) noexcept: m_str(s), m_codepage(codepage) {}
136 void operator +=(std::string_view s);
137
138 template<typename T> C_MBCStr(const T *ps, size_t size = 0, T_Encoding codepage = 0): m_codepage(codepage)
139 { append(ps, size); }
140 template<typename T> void operator +=(const T *ps)
141 { append(ps, 0); }
142
143 template<typename T> C_MBCStr(std::basic_string_view<T> s, T_Encoding codepage = 0): m_codepage(codepage)
144 { append(s.data(), s.size()); }
145 template<typename T> void operator +=(const std::basic_string<T> &s)
146 { append(s.data(), s.size()); }
147
148 void append(const char *src, size_t srcBytes);
// Typed front end of the byte-oriented append(const char*, size_t).
//   ps   - first code unit of type T
//   size - number of code units; 0 means "measure the null-terminated string
//          with std::char_traits<T>::length"
// The resulting count is scaled by sizeof(T) and the data is forwarded as
// raw bytes.
template<typename T>
void append(const T *ps, size_t size)
{
    const size_t units = size ? size : std::char_traits<T>::length(ps);
    append(reinterpret_cast<const char*>(ps), units * sizeof(T));
}
157
158 bool empty() const noexcept;
159 const std::string &escape(F_PushCh pushCh) const;
160 const std::string &escJSON() const;
161 const std::string &strU8() const;
162
163private:
164
165 // Data
166 std::vector<T_Utf32> mutable m_u32s;
167 std::string mutable m_str;
168 F_PushCh mutable m_pushCh{};
169 T_Encoding m_codepage{};
170
171 // Nonvirtuals
172 void appendNonRaw(const char *src, size_t srcBytes) const;
173 void appendStr(T_Utf32 u32) const;
174};
175
176//
177// Function Prototypes
178//
179//int u32toutf8(T_Utf32 c, T_Utf8 *dst) noexcept;
180
181std::string_view to_utf8(T_Utf32 c);
182std::string to_utf8(std::string_view s, T_Encoding codepage = 0);
183std::string to_utf8(std::istream &s, T_Encoding codepage = 0);
184template<typename T>
185auto to_utf8(const T *ps, size_t size = 0, T_Encoding codepage = 0) { return C_MBCStr{ps, size, codepage}.strU8(); }
186template<typename T>
187auto to_utf8(std::basic_string_view<T> s, T_Encoding codepage = 0) { return C_MBCStr{s, codepage}.strU8(); }
188
189std::wstring BOM(const std::wstring_view &ws);
190
191} // namespace bux
192
193using bux::T_Utf32;
194using bux::T_Utf16;
195using bux::T_Utf8;
C_MBCStr(const T *ps, size_t size=0, T_Encoding codepage=0)
Definition UnicodeCvt.h:138
C_MBCStr(std::basic_string_view< T > s, T_Encoding codepage=0)
Definition UnicodeCvt.h:143
void append(const T *ps, size_t size)
Definition UnicodeCvt.h:150
C_MBCStr(const C_MBCStr &)=delete
C_MBCStr & operator=(const C_MBCStr &)=delete
const std::string & strU8() const
C_MBCStr(T_Encoding codepage=0) noexcept
Definition UnicodeCvt.h:128
C_MBCStr(std::string_view s, T_Encoding codepage=0) noexcept
Definition UnicodeCvt.h:135
bool empty() const
Definition XQue.h:28
C_UnicodeIn(std::string &&s, T_Encoding codepage=0)=delete
int get(T_Utf32 &c)
C_UnicodeIn(const char *s, T_Encoding codepage=0)
Definition UnicodeCvt.h:59
T_Encoding encoding() const noexcept
Definition UnicodeCvt.h:68
~C_UnicodeIn() noexcept
C_UnicodeIn(FH_ReadChar &&readc, T_Encoding codepage=0)
int lastError() const noexcept
Definition UnicodeCvt.h:67
The common namespace of the bux library.
Definition AtomiX.cpp:3
std::string_view to_utf8(T_Utf32 uc)
std::wstring BOM(const std::wstring &ws)
std::function< std::optional< char >()> FH_ReadChar
Definition UnicodeCvt.h:43
std::uint16_t T_Utf16
UTF-16: You need T_Utf16[2] to hold the full range of Unicode.
Definition UnicodeCvt.h:40
std::uint8_t T_Utf8
UTF-8: You need T_Utf8[4] to hold the full range of Unicode.
Definition UnicodeCvt.h:41
@ MAX_UTF8_VALID
Definition UnicodeCvt.h:26
@ MAX_UTF8
Definition UnicodeCvt.h:24
@ UIE_ILLFORMED_UNICODE
Definition UnicodeCvt.h:30
@ UIE_EOF
Definition UnicodeCvt.h:29
@ MAX_UTF16
Definition UnicodeCvt.h:23
@ UIE_INCOMPLETE_UNICODE
Definition UnicodeCvt.h:31
@ UIE_INTERNAL
Definition UnicodeCvt.h:33
@ UIE_NO_UNICODE_TRANSLATION
Definition UnicodeCvt.h:32
@ MAX_UTF8_BMP
Definition UnicodeCvt.h:25
void append(const T &src, std::string &dst)
Definition Serialize.h:14
std::uint32_t T_Utf32
UTF-32 to cover the full range of codespace U+0000 ~ U+10FFFF.
Definition UnicodeCvt.h:39