Open Chinese Convert  1.0.5
A project for conversion between Traditional and Simplified Chinese
UTF8StringSlice.hpp
1 /*
2  * Open Chinese Convert
3  *
4  * Copyright 2015 BYVoid <byvoid@byvoid.com>
5  *
6  * Licensed under the Apache License, Version 2.0 (the "License");
7  * you may not use this file except in compliance with the License.
8  * You may obtain a copy of the License at
9  *
10  * http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing, software
13  * distributed under the License is distributed on an "AS IS" BASIS,
14  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  * See the License for the specific language governing permissions and
16  * limitations under the License.
17  */
18 
19 #include "Common.hpp"
20 #include "UTF8Util.hpp"
21 
22 namespace opencc {
23 
24 namespace internal {
25 
/**
 * Hashes a byte range with the FNV-1a scheme (XOR the byte in, then multiply
 * by the prime).
 *
 * Each byte is widened through unsigned char before it is mixed in, so bytes
 * with the high bit set (every non-ASCII UTF-8 byte) contribute only their
 * low 8 bits and the result does not depend on whether plain char is signed.
 *
 * @param text             first byte to hash; only read when byteLength > 0.
 * @param byteLength       number of bytes to hash.
 * @param FNV_prime        multiplier applied after every byte.
 * @param FNV_offset_basis initial hash value; returned unchanged for an empty
 *                         range.
 * @return the accumulated hash, computed in size_t arithmetic.
 */
inline size_t FNVHash(const char* text, const size_t byteLength,
                      const size_t FNV_prime, const size_t FNV_offset_basis) {
  size_t hash = FNV_offset_basis;
  const char* const end = text + byteLength;
  for (const char* pstr = text; pstr < end; pstr++) {
    // Without the unsigned char step a negative char would be sign-extended
    // to size_t and flip every upper bit of the hash.
    hash ^= static_cast<size_t>(static_cast<unsigned char>(*pstr));
    hash *= FNV_prime;
  }
  return hash;
}
35 
/**
 * FNV hash selected by the width of size_t in bytes.
 *
 * Only the declaration lives here; the bodies are provided by the explicit
 * specialisations for the supported widths. Hasher picks one with
 * FNVHash<sizeof(size_t)>.
 */
template <int SizeTBytes>
size_t FNVHash(const char* text, size_t byteLength);
37 
38 template <>
39 inline size_t FNVHash<4>(const char* text, const size_t byteLength) {
40  return FNVHash(text, byteLength, 16777619UL, 2166136261UL);
41 }
42 
43 #if SIZE_MAX == 0xffffffffffffffff
44 template <>
45 inline size_t FNVHash<8>(const char* text, const size_t byteLength) {
46  return FNVHash(text, byteLength, 1099511628211UL, 14695981039346656037UL);
47 }
48 #endif
49 
50 } // namespace internal
51 
52 template <typename LENGTH_TYPE> class UTF8StringSliceBase {
53 public:
54  typedef LENGTH_TYPE LengthType;
55 
56  UTF8StringSliceBase(const char* _str)
57  : str(_str), utf8Length(static_cast<LengthType>(UTF8Util::Length(_str))),
58  byteLength(static_cast<LengthType>(strlen(_str))) {}
59 
60  UTF8StringSliceBase(const char* _str, const LengthType _utf8Length)
61  : str(_str), utf8Length(_utf8Length) {
62  CalculateByteLength();
63  }
64 
65  UTF8StringSliceBase(const char* _str, const LengthType _utf8Length,
66  const LengthType _byteLength)
67  : str(_str), utf8Length(_utf8Length), byteLength(_byteLength) {
68  CalculateByteLength();
69  }
70 
71  LengthType UTF8Length() const { return utf8Length; }
72 
73  LengthType ByteLength() const { return byteLength; }
74 
75  UTF8StringSliceBase Left(const LengthType numberOfCharacters) const {
76  if (numberOfCharacters == UTF8Length()) {
77  return *this;
78  } else {
79  return UTF8StringSliceBase(str, numberOfCharacters);
80  }
81  }
82 
83  UTF8StringSliceBase Right(const LengthType numberOfCharacters) const {
84  if (numberOfCharacters == UTF8Length()) {
85  return *this;
86  } else {
87  const char* pstr = str + byteLength;
88  for (size_t i = 0; i < numberOfCharacters; i++) {
89  pstr = UTF8Util::PrevChar(pstr);
90  }
91  return UTF8StringSliceBase(pstr, numberOfCharacters);
92  }
93  }
94 
95  UTF8StringSliceBase SubString(const LengthType offset,
96  const LengthType numberOfCharacters) const {
97  if (offset == 0) {
98  return Left(numberOfCharacters);
99  } else {
100  const char* pstr = str;
101  for (size_t i = 0; i < offset; i++) {
102  pstr = UTF8Util::NextChar(pstr);
103  }
104  return UTF8StringSliceBase(pstr, numberOfCharacters);
105  }
106  }
107 
108  string ToString() const { return string(str, str + byteLength); }
109 
110  const char* CString() const { return str; }
111 
112  LengthType CommonPrefixLength(const UTF8StringSliceBase& that) const {
113  if (str == that.str) {
114  return std::min(utf8Length, that.utf8Length);
115  } else {
116  const char* pstr1 = str;
117  const char* pstr2 = that.str;
118  for (size_t length = 0; length < utf8Length && length < that.utf8Length;
119  length++) {
120  size_t charLen1 = UTF8Util::NextCharLength(pstr1);
121  size_t charLen2 = UTF8Util::NextCharLength(pstr2);
122  if (charLen1 != charLen2 || strncmp(pstr1, pstr2, charLen1) != 0) {
123  return length;
124  }
125  pstr1 += charLen1;
126  pstr2 += charLen2;
127  }
128  return 0;
129  }
130  }
131 
132  void MoveRight() {
133  if (utf8Length > 0) {
134  const size_t charLen = UTF8Util::NextCharLength(str);
135  str += charLen;
136  utf8Length--;
137  byteLength -= charLen;
138  }
139  }
140 
141  void MoveLeft() {
142  if (utf8Length > 0) {
143  const size_t charLen = UTF8Util::PrevCharLength(str + byteLength);
144  utf8Length--;
145  byteLength -= charLen;
146  }
147  }
148 
149  int ReverseCompare(const UTF8StringSliceBase& that) const {
150  const char* pstr1 = str + byteLength;
151  const char* pstr2 = that.str + that.byteLength;
152  const size_t length = std::min(utf8Length, that.utf8Length);
153  for (size_t i = 0; i < length; i++) {
154  const size_t charLen1 = UTF8Util::PrevCharLength(pstr1);
155  const size_t charLen2 = UTF8Util::PrevCharLength(pstr2);
156  pstr1 -= charLen1;
157  pstr2 -= charLen2;
158  const int cmp = strncmp(pstr1, pstr2, std::min(charLen1, charLen2));
159  if (cmp < 0) {
160  return -1;
161  } else if (cmp > 0) {
162  return 1;
163  } else if (charLen1 < charLen2) {
164  return -1;
165  } else if (charLen1 > charLen2) {
166  return 1;
167  }
168  }
169  if (utf8Length < that.utf8Length) {
170  return -1;
171  } else if (utf8Length > that.utf8Length) {
172  return 1;
173  } else {
174  return 0;
175  }
176  }
177 
178  LengthType FindBytePosition(const UTF8StringSliceBase& pattern) const {
179  return static_cast<LengthType>(
180  ToString().find(pattern.str, 0, pattern.byteLength));
181  }
182 
183  bool operator<(const UTF8StringSliceBase& that) const {
184  return Compare(that) < 0;
185  }
186 
187  bool operator>(const UTF8StringSliceBase& that) const {
188  return Compare(that) > 0;
189  }
190 
191  bool operator==(const UTF8StringSliceBase& that) const {
192  return (str == that.str && utf8Length == that.utf8Length) ||
193  Compare(that) == 0;
194  }
195 
196  bool operator!=(const UTF8StringSliceBase& that) const {
197  return !this->operator==(that);
198  }
199 
200  class Hasher {
201  public:
202  size_t operator()(const UTF8StringSliceBase& text) const {
203  return internal::FNVHash<sizeof(size_t)>(text.CString(),
204  text.ByteLength());
205  }
206  };
207 
208 private:
209  inline int Compare(const UTF8StringSliceBase& that) const {
210  int cmp = strncmp(str, that.str, std::min(byteLength, that.byteLength));
211  if (cmp == 0) {
212  if (utf8Length < that.utf8Length) {
213  cmp = -1;
214  } else if (utf8Length > that.utf8Length) {
215  cmp = 1;
216  } else {
217  cmp = 0;
218  }
219  }
220  return cmp;
221  }
222 
223  void CalculateByteLength() {
224  const char* pstr = str;
225  for (size_t i = 0; i < utf8Length; i++) {
226  pstr = UTF8Util::NextChar(pstr);
227  }
228  byteLength = static_cast<LengthType>(pstr - str);
229  }
230 
231  const char* str;
232  LengthType utf8Length;
233  LengthType byteLength;
234 };
235 
// Default slice type: UTF8StringSliceBase instantiated with size_t as the
// type used for both the character length and the byte length.
236 typedef UTF8StringSliceBase<size_t> UTF8StringSlice;
237 
238 template <typename LENGTH_TYPE>
239 std::ostream& operator<<(::std::ostream& os,
240  const UTF8StringSliceBase<LENGTH_TYPE>& str) {
241  return os << str.ToString();
242 }
243 
244 } // namespace opencc
opencc::UTF8Util::Length
static size_t Length(const char *str)
Returns the UTF8 length of a valid UTF8 string.
Definition: UTF8Util.hpp:122
opencc::UTF8StringSliceBase::Hasher
Definition: UTF8StringSlice.hpp:200
opencc::UTF8StringSliceBase
Definition: UTF8StringSlice.hpp:52
opencc::UTF8Util::NextCharLength
static size_t NextCharLength(const char *str)
Returns the length in byte for the next UTF8 character.
Definition: UTF8Util.hpp:66
opencc::UTF8Util::PrevCharLength
static size_t PrevCharLength(const char *str)
Returns the length in byte for the previous UTF8 character.
Definition: UTF8Util.hpp:77
opencc::UTF8Util::PrevChar
static const char * PrevChar(const char *str)
Move the char* pointer before the previous UTF8 character.
Definition: UTF8Util.hpp:115
opencc::UTF8Util::NextChar
static const char * NextChar(const char *str)
Returns the char* pointer over the next UTF8 character.
Definition: UTF8Util.hpp:108