bux API Reference 1.11.0
Static library of whatever is generally useful but not directly supported by Modern C++, plus whatever reusable code originated from my side projects.
Loading...
Searching...
No Matches
UnicodeCvt.h
Go to the documentation of this file.
1#pragma once
2
3#include "XQue.h" // bux::C_Queue<>
4#include <cstdint> // std::uint8_t, std::uint16_t, std::uint32_t
5#include <functional> // std::function<>
6#include <iosfwd> // Forwarded std::istream
7#include <optional> // std::optional<>
8#include <string> // std::string
9#include <string_view> // std::string_view
10#include <vector> // std::vector<>
11
12#ifndef _WIN32
13#include <iconv.h> // iconv_t
14#endif
15
16namespace bux {
17
18//
19// Constants
20//
// Unnamed enum collecting the library's Unicode-related constants.
// NOTE(review): this listing omits original lines 23 and 29-33. The page's
// cross-reference places MAX_UTF16 on line 23 and UIE_EOF,
// UIE_ILLFORMED_UNICODE, UIE_INCOMPLETE_UNICODE, UIE_NO_UNICODE_TRANSLATION
// and UIE_INTERNAL on lines 29-33; their values are not visible here.
21enum
22{
// Presumably each MAX_UTF8* value is a count of T_Utf8 units per code point
// -- confirm against the implementation.
24 MAX_UTF8 = 6, // UTF-8 encoding limit (31 bits UCS-4)
25 MAX_UTF8_BMP = 3, // Unicode Plane 0: Basic Multilingual Plane
26 MAX_UTF8_VALID = 4, // Unicode Plane 0~16
27
28 // Error for C_UnicodeIn::get()
// (the UIE_* enumerators that belonged under this heading are elided from this listing)
34};
35
36//
37// Types
38//
/// UTF-32 to cover the full range of codespace U+0000 ~ U+10FFFF.
using T_Utf32 = std::uint32_t;
/// UTF-16: You need T_Utf16[2] to hold full range of unicode.
using T_Utf16 = std::uint16_t;
/// UTF-8: You need T_Utf8[4] to hold full range of unicode.
using T_Utf8 = std::uint8_t;

/// Callback type: takes no arguments and returns std::optional<char>.
/// NOTE(review): presumably an empty optional signals end of input -- confirm
/// against the C_UnicodeIn implementation.
using FH_ReadChar = std::function<std::optional<char>()>;

/// Platform-dependent handle naming a source encoding.
#ifdef _WIN32
using T_Encoding = unsigned;
#else
using T_Encoding = const char *const *; // null-terminated const array of const char pointers
#endif
50
// Body of the class whose constructors name it C_UnicodeIn.
// NOTE(review): the line carrying the class head (original line 51) is not
// present in this listing. Only inline members are defined here; everything
// else is declared and defined elsewhere.
52{
53public:
54
55 // Ctor/Dtor
// Sources accepted: a character-reading callback, a string_view, a C string
// (forwards to the string_view overload) or a std::istream. The std::string&&
// overload is deleted -- presumably so a view is never taken of a temporary
// string; confirm. The meaning of the default codepage value 0 is not visible here.
56 C_UnicodeIn(FH_ReadChar &&readc, T_Encoding codepage =0);
57 C_UnicodeIn(std::string_view sv, T_Encoding codepage =0);
58 C_UnicodeIn(std::string &&s, T_Encoding codepage =0) = delete;
59 C_UnicodeIn(const char *s, T_Encoding codepage =0): C_UnicodeIn(std::string_view(s), codepage) {}
60 C_UnicodeIn(std::istream &in, T_Encoding codepage =0);
61 ~C_UnicodeIn() noexcept;
62
63 // Nonvirtuals
// get(): three overloads writing through a T_Utf32 reference or a
// T_Utf16/T_Utf8 destination pointer.
// NOTE(review): the meaning of the int result and the capacity required of
// dst are not shown in this header -- confirm before relying on them.
64 int get(T_Utf32 &c);
65 int get(T_Utf16 *dst);
66 int get(T_Utf8 *dst);
// Yields 1 while m_GetQ is non-empty, otherwise the stored m_ErrCode.
67 int lastError() const noexcept { return m_GetQ.empty()? m_ErrCode: 1; }
// Returns the current value of m_CodePage.
68 T_Encoding encoding() const noexcept { return m_CodePage; }
69
70private:
71
72 // Types
// Private helper type holding the read callback, a std::string buffer and an
// offset into it; its member functions are defined outside this header.
73 class C_Source
74 {
75 public:
76
77 // Nonvirtuals
78 C_Source(FH_ReadChar &&readc) noexcept;
79 const char *buffer() const noexcept;
80 T_Utf16 getUtf16(size_t pos, bool reverseWord) const;
81 T_Utf32 getUtf32(size_t pos, bool reverseWord) const;
82 void pop(size_t bytes);
83 void read(size_t bytes);
84 void readTillCtrl();
85 size_t size() const noexcept;
86
87 private:
88
89 // Data
90 FH_ReadChar m_ReadCh;
91 std::string m_ReadBuf;
// Presumably the index of the first unconsumed byte in m_ReadBuf -- confirm.
92 size_t m_AvailBeg;
93 };
94
95 // Data
96 C_Source m_Src;
// Queue of T_Utf32 values; lastError() reports 1 while it is non-empty.
97 C_Queue<T_Utf32> m_GetQ;
// Pointer to a void() member function, value-initialized (null) here.
// Presumably set to one of the read*() members declared below -- confirm.
98 void (C_UnicodeIn::*m_ReadMethod)(){};
99 T_Encoding m_CodePage;
100#ifndef _WIN32
// Non-Windows builds only; starts as (iconv_t)-1.
101 iconv_t m_iconv{(iconv_t)-1}; // changed according to m_CodePage
102#endif
// Starts as UIE_EOF; returned by lastError() once m_GetQ is empty.
103 int m_ErrCode{UIE_EOF};
104
105 // Nonvirtuals
106 bool guessCodePage();
107 void ingestMBCS();
108 void init();
109 void readCodePage();
110 void readASCII();
111 void readReverseUTF16();
112 void readReverseUTF32();
113 void readUTF16();
114 bool readUTF16(C_Source &src, bool reverseWord);
115 void readUTF32();
116 bool readUTF32(C_Source &src, bool reverseWord);
117 void setCodePage(T_Encoding cp);
118#ifndef _WIN32
119 void reset_iconv();
120#endif
121};
122
123//
124// Externs
125//
// T_Encoding constant declared here and defined in another translation unit.
126extern const T_Encoding ENCODING_UTF8;
127
// to_utf8(T_Utf32): takes one T_Utf32 value and returns a std::string_view.
// NOTE(review): the storage backing the returned view is not visible in this
// header -- confirm its lifetime before retaining the view.
128std::string_view to_utf8(T_Utf32 c);
// to_utf8(C_UnicodeIn&&): takes a C_UnicodeIn by rvalue reference and returns
// a std::string; the to_utf8 templates in this header forward to it.
129std::string to_utf8(C_UnicodeIn &&uin);
130
131template<typename T>
132std::string to_utf8(const T *ps, size_t size = 0, T_Encoding codepage = 0)
133{
134 if (!size)
135 size = std::char_traits<T>::length(ps);
136
137 std::string_view view_as_chars{reinterpret_cast<const char*>(ps), size*sizeof(T)};
138 return to_utf8(C_UnicodeIn(view_as_chars, codepage));
139}
140template<typename T>
141std::string to_utf8(const std::basic_string<T> &s, T_Encoding codepage = 0)
142{
143 std::string_view view_as_chars{reinterpret_cast<const char*>(s.data()), s.size()*sizeof(T)};
144 return to_utf8(C_UnicodeIn(view_as_chars, codepage));
145}
146template<typename T>
147std::string to_utf8(std::basic_string_view<T> s, T_Encoding codepage = 0)
148{
149 std::string_view view_as_chars{reinterpret_cast<const char*>(s.data()), s.size()*sizeof(T)};
150 return to_utf8(C_UnicodeIn(view_as_chars, codepage));
151}
152
/// Returns a copy of `sv` prefixed with the byte-order mark U+FEFF.
///
/// For code units wider than one byte the mark is the single unit 0xFEFF;
/// for one-byte code units it is the UTF-8 encoding of U+FEFF.
///
/// @param sv  Text to prefix; may be empty.
/// @return    BOM followed by the contents of `sv`.
template<typename T>
std::basic_string<T> BOM(std::basic_string_view<T> sv)
{
    // Build into a named local so the result is moved/elided on return; the
    // old form returned the reference from append() on a temporary, which
    // forced a full copy.
    std::basic_string<T> result;
    if constexpr (sizeof(T) > 1)
    {
        result.reserve(sv.size() + 1);
        result.push_back(T(0xFEFF));
    }
    else
    {
        // The u8 literal has element type char (or char8_t from C++20);
        // reinterpret it as the caller's one-byte unit type.
        result.append(reinterpret_cast<const T*>(u8"\uFEFF"));
    }
    result.append(sv);
    return result;
}
/// Returns a copy of the null-terminated string `p` prefixed with the
/// byte-order mark U+FEFF.
///
/// For code units wider than one byte the mark is the single unit 0xFEFF;
/// for one-byte code units it is the UTF-8 encoding of U+FEFF.
///
/// @param p  Null-terminated text to prefix; must not be a null pointer.
/// @return   BOM followed by the characters of `p`.
template<typename T>
std::basic_string<T> BOM(const T *p)
{
    // Build into a named local so the result is moved/elided on return; the
    // old form returned the reference from operator+= on a temporary, which
    // forced a full copy.
    std::basic_string<T> result;
    if constexpr (sizeof(T) > 1)
    {
        result.push_back(T(0xFEFF));
    }
    else
    {
        // The u8 literal has element type char (or char8_t from C++20);
        // reinterpret it as the caller's one-byte unit type.
        result.append(reinterpret_cast<const T*>(u8"\uFEFF"));
    }
    result.append(p);
    return result;
}
169
170} // namespace bux
171
// Make the three code-unit aliases from namespace bux usable without
// qualification in the scope that follows the namespace.
172using bux::T_Utf32;
173using bux::T_Utf16;
174using bux::T_Utf8;
175
// Compile-time checks that each alias has the byte width its name implies.
176static_assert(sizeof(T_Utf32) == 4);
177static_assert(sizeof(T_Utf16) == 2);
178static_assert(sizeof(T_Utf8) == 1);
C_UnicodeIn(std::string &&s, T_Encoding codepage=0)=delete
int get(T_Utf32 &c)
C_UnicodeIn(const char *s, T_Encoding codepage=0)
Definition UnicodeCvt.h:59
T_Encoding encoding() const noexcept
Definition UnicodeCvt.h:68
~C_UnicodeIn() noexcept
C_UnicodeIn(FH_ReadChar &&readc, T_Encoding codepage=0)
int lastError() const noexcept
Definition UnicodeCvt.h:67
THE common namespace of bux library.
Definition AtomiX.cpp:3
std::string_view to_utf8(T_Utf32 uc)
const char *const * T_Encoding
Definition UnicodeCvt.h:48
std::function< std::optional< char >()> FH_ReadChar
Definition UnicodeCvt.h:43
void read(const std::string &src, size_t &off, T &data) noexcept
Definition Serialize.h:35
const T_Encoding ENCODING_UTF8
std::uint16_t T_Utf16
UTF-16: You need T_Utf16[2] to hold full range of unicode.
Definition UnicodeCvt.h:40
std::basic_string< T > BOM(std::basic_string_view< T > sv)
Definition UnicodeCvt.h:154
std::uint8_t T_Utf8
UTF-8: You need T_Utf8[4] to hold full range of unicode.
Definition UnicodeCvt.h:41
@ MAX_UTF8_VALID
Definition UnicodeCvt.h:26
@ MAX_UTF8
Definition UnicodeCvt.h:24
@ UIE_ILLFORMED_UNICODE
Definition UnicodeCvt.h:30
@ UIE_EOF
Definition UnicodeCvt.h:29
@ MAX_UTF16
Definition UnicodeCvt.h:23
@ UIE_INCOMPLETE_UNICODE
Definition UnicodeCvt.h:31
@ UIE_INTERNAL
Definition UnicodeCvt.h:33
@ UIE_NO_UNICODE_TRANSLATION
Definition UnicodeCvt.h:32
@ MAX_UTF8_BMP
Definition UnicodeCvt.h:25
std::uint32_t T_Utf32
UTF-32 to cover the full range of codespace U+0000 ~ U+10FFFF.
Definition UnicodeCvt.h:39