bux API Reference 1.12.3
A static library of general-purpose utilities that are commonly needed but not directly supported by modern C++, plus reusable code that originated in my side projects.
Loading...
Searching...
No Matches
UnicodeCvt.h
Go to the documentation of this file.
1#pragma once
2
3#include "XQue.h" // bux::C_Queue<>
4#include <cstdint> // std::uint8_t, std::uint16_t, std::uint32_t
5#include <functional> // std::function<>
6#include <iosfwd> // Forwarded std::istream
7#include <optional> // std::optional<>
8#include <string> // std::string
9#include <string_view> // std::string_view
10#include <vector> // std::vector<>
11
12#ifndef _WIN32
13#include <iconv.h> // iconv_t
14#endif
15
16namespace bux {
17
18//
19// Constants
20//
21enum
22{
24 MAX_UTF8 = 6, // UTF-8 encoding limit (31 bits UCS-4)
25 MAX_UTF8_BMP = 3, // Unicode Plane 0: Basic Multilingual Plane
26 MAX_UTF8_VALID = 4, // Unicode Plane 0~16
27
28 // Error for C_UnicodeIn::get()
34};
35
36//
37// Types
38//
/// UTF-32 code unit: covers the full range of codespace U+0000 ~ U+10FFFF.
using T_Utf32 = std::uint32_t;
/// UTF-16 code unit: T_Utf16[2] is needed to hold the full range of Unicode.
using T_Utf16 = std::uint16_t;
/// UTF-8 code unit: T_Utf8[4] is needed to hold the full range of Unicode.
using T_Utf8 = std::uint8_t;
42
43static_assert(sizeof(T_Utf32) == 4);
44static_assert(sizeof(T_Utf16) == 2);
45static_assert(sizeof(T_Utf8) == 1);
46
/// Parameterless callback returning one optional char per call.
/// NOTE(review): presumably std::nullopt signals that no more input is available - confirm against the implementation.
using FH_ReadChar = std::function<std::optional<char>()>;
48
// Platform-dependent encoding identifier; a value-initialized T_Encoding (0 / null) is the default argument used across this header.
#ifdef _WIN32
using T_Encoding = unsigned;
#else
using T_Encoding = const char *const *; // null-terminated const array of const char pointers
#endif
54
56{
57public:
58
59 // Ctor/Dtor
60 C_UnicodeIn(FH_ReadChar &&readc, T_Encoding codepage =0);
61 C_UnicodeIn(std::string_view sv, T_Encoding codepage =0);
62 C_UnicodeIn(std::string &&s, T_Encoding codepage =0) = delete;
63 C_UnicodeIn(const char *s, T_Encoding codepage =0): C_UnicodeIn(std::string_view(s), codepage) {}
64 C_UnicodeIn(std::istream &in, T_Encoding codepage =0);
65 ~C_UnicodeIn() noexcept;
66
67 // Nonvirtuals
68 int get(T_Utf32 &c);
69 int get(T_Utf16 *dst);
70 int get(T_Utf8 *dst);
71 int lastError() const noexcept { return m_GetQ.empty()? m_ErrCode: 1; }
72 T_Encoding encoding() const noexcept { return m_CodePage; }
73
74private:
75
76 // Types
77 class C_Source
78 {
79 public:
80
81 // Nonvirtuals
82 C_Source(FH_ReadChar &&readc) noexcept;
83 const char *buffer() const noexcept;
84 T_Utf16 getUtf16(size_t pos, bool reverseWord) const;
85 T_Utf32 getUtf32(size_t pos, bool reverseWord) const;
86 void pop(size_t bytes);
87 void read(size_t bytes);
88 void readTillCtrl();
89 size_t size() const noexcept;
90
91 private:
92
93 // Data
94 FH_ReadChar m_ReadCh;
95 std::string m_ReadBuf;
96 size_t m_AvailBeg;
97 };
98
99 // Data
100 C_Source m_Src;
101 C_Queue<T_Utf32> m_GetQ;
102 void (C_UnicodeIn::*m_ReadMethod)(){};
103 T_Encoding m_CodePage;
104#ifndef _WIN32
105 iconv_t m_iconv{(iconv_t)-1}; // changed according to m_CodePage
106#endif
107 int m_ErrCode{UIE_EOF};
108
109 // Nonvirtuals
110 bool guessCodePage();
111 void ingestMBCS();
112 void init();
113 void readCodePage();
114 void readASCII();
115 void readReverseUTF16();
116 void readReverseUTF32();
117 void readUTF16();
118 bool readUTF16(C_Source &src, bool reverseWord);
119 void readUTF32();
120 bool readUTF32(C_Source &src, bool reverseWord);
121 void setCodePage(T_Encoding cp);
122#ifndef _WIN32
123 void reset_iconv();
124#endif
125};
126
127//
128// Externs
129//
130extern const T_Encoding ENCODING_UTF8;
131
132std::string_view to_utf8(T_Utf32 c);
133std::string to_utf8(C_UnicodeIn &&uin);
134
135template<typename T>
136std::string to_utf8(const T *ps, size_t size = 0, T_Encoding codepage = 0)
137{
138 if (!size)
139 size = std::char_traits<T>::length(ps);
140
141 std::string_view view_as_chars{reinterpret_cast<const char*>(ps), size*sizeof(T)};
142 return to_utf8(C_UnicodeIn(view_as_chars, codepage));
143}
144template<typename T>
145std::string to_utf8(const std::basic_string<T> &s, T_Encoding codepage = 0)
146{
147 std::string_view view_as_chars{reinterpret_cast<const char*>(s.data()), s.size()*sizeof(T)};
148 return to_utf8(C_UnicodeIn(view_as_chars, codepage));
149}
150template<typename T>
151std::string to_utf8(std::basic_string_view<T> s, T_Encoding codepage = 0)
152{
153 std::string_view view_as_chars{reinterpret_cast<const char*>(s.data()), s.size()*sizeof(T)};
154 return to_utf8(C_UnicodeIn(view_as_chars, codepage));
155}
156
/// Return a copy of s prefixed with the byte order mark U+FEFF.
/// For multi-byte code units (sizeof(T) > 1) the mark is the single unit 0xFEFF;
/// for one-byte code units it is the UTF-8 encoding of U+FEFF.
template<typename T>
std::basic_string<T> BOM(const std::basic_string<T> &s)
{
    if constexpr (sizeof(T) > 1)
        return static_cast<T>(0xFEFF) + s;
    else
        // Named cast: the u8 literal's element type need not be T (e.g. unsigned char, or char8_t literals in C++20).
        return reinterpret_cast<const T*>(u8"\uFEFF") + s;
}
165template<typename T>
166std::basic_string<T> BOM(std::basic_string_view<T> sv)
167{
168 if constexpr (sizeof(T) > 1)
169 return T(0xFEFF) + std::basic_string<T>(sv);
170 else
171 return std::basic_string<T>{(const T*)u8"\uFEFF"}.append(sv);
172}
173template<typename T>
174std::basic_string<T> BOM(const T *p)
175{
176 if constexpr (sizeof(T) > 1)
177 return T(0xFEFF) + std::basic_string<T>(p);
178 else
179 return std::basic_string<T>{(const T*)u8"\uFEFF"} += p;
180}
181
182} // namespace bux
183
184using bux::T_Utf32;
185using bux::T_Utf16;
186using bux::T_Utf8;
187
188static_assert(sizeof(T_Utf32) == 4);
189static_assert(sizeof(T_Utf16) == 2);
190static_assert(sizeof(T_Utf8) == 1);
C_UnicodeIn(std::string &&s, T_Encoding codepage=0)=delete
int get(T_Utf32 &c)
C_UnicodeIn(const char *s, T_Encoding codepage=0)
Definition UnicodeCvt.h:63
T_Encoding encoding() const noexcept
Definition UnicodeCvt.h:72
~C_UnicodeIn() noexcept
C_UnicodeIn(FH_ReadChar &&readc, T_Encoding codepage=0)
int lastError() const noexcept
Definition UnicodeCvt.h:71
The common namespace of the bux library.
Definition AtomiX.cpp:3
std::string_view to_utf8(T_Utf32 uc)
const char *const * T_Encoding
Definition UnicodeCvt.h:52
std::basic_string< T > BOM(const std::basic_string< T > &s)
Definition UnicodeCvt.h:158
std::function< std::optional< char >()> FH_ReadChar
Definition UnicodeCvt.h:47
void read(const std::string &src, size_t &off, T &data) noexcept
Definition Serialize.h:35
const T_Encoding ENCODING_UTF8
std::uint16_t T_Utf16
UTF-16: You need T_Utf16[2] to hold the full range of Unicode.
Definition UnicodeCvt.h:40
std::uint8_t T_Utf8
UTF-8: You need T_Utf8[4] to hold the full range of Unicode.
Definition UnicodeCvt.h:41
@ MAX_UTF8_VALID
Definition UnicodeCvt.h:26
@ MAX_UTF8
Definition UnicodeCvt.h:24
@ UIE_ILLFORMED_UNICODE
Definition UnicodeCvt.h:30
@ UIE_EOF
Definition UnicodeCvt.h:29
@ MAX_UTF16
Definition UnicodeCvt.h:23
@ UIE_INCOMPLETE_UNICODE
Definition UnicodeCvt.h:31
@ UIE_INTERNAL
Definition UnicodeCvt.h:33
@ UIE_NO_UNICODE_TRANSLATION
Definition UnicodeCvt.h:32
@ MAX_UTF8_BMP
Definition UnicodeCvt.h:25
std::uint32_t T_Utf32
UTF-32 to cover the full range of codespace U+0000 ~ U+10FFFF.
Definition UnicodeCvt.h:39