Open Chinese Convert  1.0.5
A project for conversion between Traditional and Simplified Chinese
UTF8Util.hpp
1 /*
2  * Open Chinese Convert
3  *
4  * Copyright 2013 BYVoid <byvoid@byvoid.com>
5  *
6  * Licensed under the Apache License, Version 2.0 (the "License");
7  * you may not use this file except in compliance with the License.
8  * You may obtain a copy of the License at
9  *
10  * http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing, software
13  * distributed under the License is distributed on an "AS IS" BASIS,
14  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  * See the License for the specific language governing permissions and
16  * limitations under the License.
17  */
18 
19 #pragma once
20 
21 #ifdef _MSC_VER
22 #define NOMINMAX
23 #include <Windows.h>
24 #undef NOMINMAX
25 #endif // _MSC_VER
26 
27 #include "Common.hpp"
28 
29 namespace opencc {
34 class OPENCC_EXPORT UTF8Util {
35 public:
39  static void SkipUtf8Bom(FILE* fp);
40 
45  static size_t NextCharLengthNoException(const char* str) {
46  char ch = *str;
47  if ((ch & 0xF0) == 0xE0) {
48  return 3;
49  } else if ((ch & 0x80) == 0x00) {
50  return 1;
51  } else if ((ch & 0xE0) == 0xC0) {
52  return 2;
53  } else if ((ch & 0xF8) == 0xF0) {
54  return 4;
55  } else if ((ch & 0xFC) == 0xF8) {
56  return 5;
57  } else if ((ch & 0xFE) == 0xFC) {
58  return 6;
59  }
60  return 0;
61  }
62 
66  static size_t NextCharLength(const char* str) {
67  size_t length = NextCharLengthNoException(str);
68  if (length == 0) {
69  throw InvalidUTF8(str);
70  }
71  return length;
72  }
73 
77  static size_t PrevCharLength(const char* str) {
78  {
79  const size_t length = NextCharLengthNoException(str - 3);
80  if (length == 3) {
81  return length;
82  }
83  }
84  {
85  const size_t length = NextCharLengthNoException(str - 1);
86  if (length == 1) {
87  return length;
88  }
89  }
90  {
91  const size_t length = NextCharLengthNoException(str - 2);
92  if (length == 2) {
93  return length;
94  }
95  }
96  for (size_t i = 4; i <= 6; i++) {
97  const size_t length = NextCharLengthNoException(str - i);
98  if (length == i) {
99  return length;
100  }
101  }
102  throw InvalidUTF8(str);
103  }
104 
108  static const char* NextChar(const char* str) {
109  return str + NextCharLength(str);
110  }
111 
115  static const char* PrevChar(const char* str) {
116  return str - PrevCharLength(str);
117  }
118 
122  static size_t Length(const char* str) {
123  size_t length = 0;
124  while (*str != '\0') {
125  str = NextChar(str);
126  length++;
127  }
128  return length;
129  }
130 
137  static const char* FindNextInline(const char* str, const char ch) {
138  while (!IsLineEndingOrFileEnding(*str) && *str != ch) {
139  str = NextChar(str);
140  }
141  return str;
142  }
143 
147  static bool IsLineEndingOrFileEnding(const char ch) {
148  return ch == '\0' || ch == '\n' || ch == '\r';
149  }
150 
154  static string FromSubstr(const char* str, size_t length) {
155  string newStr;
156  newStr.resize(length);
157  strncpy(const_cast<char*>(newStr.c_str()), str, length);
158  return newStr;
159  }
160 
164  static bool NotShorterThan(const char* str, size_t byteLength) {
165  while (byteLength > 0) {
166  if (*str == '\0') {
167  return false;
168  }
169  byteLength--;
170  str++;
171  }
172  return true;
173  }
174 
179  static string TruncateUTF8(const char* str, size_t maxByteLength) {
180  string wordTrunc;
181  if (NotShorterThan(str, maxByteLength)) {
182  size_t len = 0;
183  const char* pStr = str;
184  for (;;) {
185  const size_t charLength = NextCharLength(pStr);
186  if (len + charLength > maxByteLength) {
187  break;
188  }
189  pStr += charLength;
190  len += charLength;
191  }
192  wordTrunc = FromSubstr(str, len);
193  } else {
194  wordTrunc = str;
195  }
196  return wordTrunc;
197  }
198 
202  static void ReplaceAll(string& str, const char* from, const char* to) {
203  string::size_type pos = 0;
204  string::size_type fromLen = strlen(from);
205  string::size_type toLen = strlen(to);
206  while ((pos = str.find(from, pos)) != string::npos) {
207  str.replace(pos, fromLen, to);
208  pos += toLen;
209  }
210  }
211 
215  static string Join(const vector<string>& strings, const string& separator) {
216  std::ostringstream buffer;
217  bool first = true;
218  for (const auto& str : strings) {
219  if (!first) {
220  buffer << separator;
221  }
222  buffer << str;
223  first = false;
224  }
225  return buffer.str();
226  }
227 
231  static string Join(const vector<string>& strings) {
232  std::ostringstream buffer;
233  for (const auto& str : strings) {
234  buffer << str;
235  }
236  return buffer.str();
237  }
238 
239  static void GetByteMap(const char* str, const size_t utf8Length,
240  vector<size_t>* byteMap) {
241  if (byteMap->size() < utf8Length) {
242  byteMap->resize(utf8Length);
243  }
244  const char* pstr = str;
245  for (size_t i = 0; i < utf8Length; i++) {
246  (*byteMap)[i] = pstr - str;
247  pstr = NextChar(pstr);
248  }
249  }
250 
251 #ifdef _MSC_VER
252  static std::wstring GetPlatformString(const std::string& str) {
253  return U8ToU16(str);
254  }
255 #else
256  static std::string GetPlatformString(const std::string& str) {
257  return str;
258  }
259 #endif // _MSC_VER
260 
261 
262 #ifdef _MSC_VER
263  static std::string U16ToU8(const std::wstring& wstr) {
264  std::string ret;
265  int length = static_cast<int>(wstr.length());
266  int convcnt = WideCharToMultiByte(CP_UTF8, 0, wstr.c_str(), length, NULL, 0, NULL, NULL);
267  if (convcnt > 0) {
268  ret.resize(convcnt);
269  WideCharToMultiByte(CP_UTF8, 0, wstr.c_str(), length, &ret[0], convcnt, NULL, NULL);
270  }
271  return ret;
272  }
273 
274  static std::wstring U8ToU16(const std::string& str) {
275  std::wstring ret;
276  int length = static_cast<int>(str.length());
277  int convcnt = MultiByteToWideChar(CP_UTF8, 0, str.c_str(), length, NULL, 0);
278  if (convcnt > 0) {
279  ret.resize(convcnt);
280  MultiByteToWideChar(CP_UTF8, 0, str.c_str(), length, &ret[0], convcnt);
281  }
282  return ret;
283  }
284 #endif // _MSC_VER
285 };
286 }
opencc::UTF8Util::Length
static size_t Length(const char *str)
Returns the UTF8 length of a valid UTF8 string.
Definition: UTF8Util.hpp:122
opencc::UTF8Util::FindNextInline
static const char * FindNextInline(const char *str, const char ch)
Finds a character in the same line.
Definition: UTF8Util.hpp:137
opencc::UTF8Util::NextCharLengthNoException
static size_t NextCharLengthNoException(const char *str)
Returns the length in byte for the next UTF8 character.
Definition: UTF8Util.hpp:45
opencc::UTF8Util::Join
static string Join(const vector< string > &strings)
Joins a string vector into a string.
Definition: UTF8Util.hpp:231
opencc::UTF8Util::NotShorterThan
static bool NotShorterThan(const char *str, size_t byteLength)
Returns true if the given string is longer or as long as the given length.
Definition: UTF8Util.hpp:164
opencc::UTF8Util::Join
static string Join(const vector< string > &strings, const string &separator)
Joins a string vector into a string with a separator.
Definition: UTF8Util.hpp:215
opencc::UTF8Util
UTF8 string utilities.
Definition: UTF8Util.hpp:34
opencc::UTF8Util::NextCharLength
static size_t NextCharLength(const char *str)
Returns the length in byte for the next UTF8 character.
Definition: UTF8Util.hpp:66
opencc::InvalidUTF8
Definition: Exception.hpp:76
opencc::UTF8Util::TruncateUTF8
static string TruncateUTF8(const char *str, size_t maxByteLength)
Truncates a string with a maximal length in byte.
Definition: UTF8Util.hpp:179
opencc::UTF8Util::IsLineEndingOrFileEnding
static bool IsLineEndingOrFileEnding(const char ch)
Returns true if the character is a line ending or end of file.
Definition: UTF8Util.hpp:147
opencc::UTF8Util::FromSubstr
static string FromSubstr(const char *str, size_t length)
Copies a substring with given length to a new std::string.
Definition: UTF8Util.hpp:154
opencc::UTF8Util::ReplaceAll
static void ReplaceAll(string &str, const char *from, const char *to)
Replaces all patterns in a string in place.
Definition: UTF8Util.hpp:202
opencc::UTF8Util::PrevCharLength
static size_t PrevCharLength(const char *str)
Returns the length in byte for the previous UTF8 character.
Definition: UTF8Util.hpp:77
opencc::UTF8Util::PrevChar
static const char * PrevChar(const char *str)
Move the char* pointer before the previous UTF8 character.
Definition: UTF8Util.hpp:115
opencc::UTF8Util::NextChar
static const char * NextChar(const char *str)
Returns the char* pointer over the next UTF8 character.
Definition: UTF8Util.hpp:108