Electroneum
Loading...
Searching...
No Matches
language_base.h
Go to the documentation of this file.
1// Copyrights(c) 2017-2021, The Electroneum Project
2// Copyrights(c) 2014-2019, The Monero Project
3//
4// All rights reserved.
5//
6// Redistribution and use in source and binary forms, with or without modification, are
7// permitted provided that the following conditions are met:
8//
9// 1. Redistributions of source code must retain the above copyright notice, this list of
10// conditions and the following disclaimer.
11//
12// 2. Redistributions in binary form must reproduce the above copyright notice, this list
13// of conditions and the following disclaimer in the documentation and/or other
14// materials provided with the distribution.
15//
16// 3. Neither the name of the copyright holder nor the names of its contributors may be
17// used to endorse or promote products derived from this software without specific
18// prior written permission.
19//
20// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
21// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
22// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
23// THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
27// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
28// THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
35
36#ifndef LANGUAGE_BASE_H
37#define LANGUAGE_BASE_H
38
39#include <vector>
40#include <unordered_map>
41#include <string>
42#include <boost/algorithm/string.hpp>
43#include "misc_log_ex.h"
44#include "fnv1.h"
45
50namespace Language
51{
/*!
 * \brief Returns a string made of (at most) the first count characters in s.
 *
 * A "character" is one byte plus every UTF-8 continuation byte (bit pattern
 * 10xxxxxx) that immediately follows it. The input is not validated: the
 * first byte of each character is taken as-is, whatever its value.
 *
 * \tparam T      String-like type offering data(), size() and a
 *                (const char *, size_t) constructor.
 * \param  s      Source string, interpreted as UTF-8 bytes.
 * \param  count  Maximum number of characters to keep.
 * \return        The leading bytes of s that cover at most count characters;
 *                all of s when it holds fewer than count characters.
 */
template<typename T>
inline T utf8prefix(const T &s, size_t count)
{
  const char *const bytes = s.data();
  const size_t total = s.size();
  size_t len = 0;
  while (count-- && len < total)
  {
    // Take the leading byte unconditionally, then the continuation bytes that belong to it.
    ++len;
    while (len < total && (bytes[len] & 0xc0) == 0x80)
      ++len;
  }
  // Build the result in one construction instead of appending byte by byte.
  return T(bytes, len);
}
76
/*!
 * \brief Returns a lowercased copy of the UTF-8 string s.
 *
 * Each sequence is decoded to a code point, passed through std::towlower and
 * re-encoded. The encoded length is derived from the resulting code point, so
 * a case mapping that moves a character into a different UTF-8 length class
 * (and any overlong input sequence) still yields the shortest valid encoding.
 *
 * Continuation bytes are not checked for the 10xxxxxx pattern; only the lead
 * byte and the remaining length are validated.
 *
 * \tparam T  String-like type offering data(), size(), operator+= and a
 *            (const char *, size_t) constructor.
 * \param  s  Source string, interpreted as UTF-8 bytes.
 * \return    The canonical (lowercased) form of s.
 * \throws std::runtime_error ("Invalid UTF-8") when a lead byte is not a
 *         valid UTF-8 lead byte or a multi-byte sequence is truncated.
 */
template<typename T>
inline T utf8canonical(const T &s)
{
  T sc = "";
  size_t avail = s.size();
  const char *ptr = s.data();
  while (avail--)
  {
    // Decode into a type that is guaranteed to hold 21 bits; wint_t may be
    // only 16 bits wide on some platforms.
    char32_t cp = 0;
    if ((*ptr & 0x80) == 0)
    {
      cp = *ptr++;
    }
    else if ((*ptr & 0xe0) == 0xc0)
    {
      if (avail < 1)
        throw std::runtime_error("Invalid UTF-8");
      cp = (*ptr++ & 0x1f) << 6;
      cp |= *ptr++ & 0x3f;
      --avail;
    }
    else if ((*ptr & 0xf0) == 0xe0)
    {
      if (avail < 2)
        throw std::runtime_error("Invalid UTF-8");
      cp = (*ptr++ & 0xf) << 12;
      cp |= (*ptr++ & 0x3f) << 6;
      cp |= *ptr++ & 0x3f;
      avail -= 2;
    }
    else if ((*ptr & 0xf8) == 0xf0)
    {
      if (avail < 3)
        throw std::runtime_error("Invalid UTF-8");
      cp = (*ptr++ & 0x7) << 18;
      cp |= (*ptr++ & 0x3f) << 12;
      cp |= (*ptr++ & 0x3f) << 6;
      cp |= *ptr++ & 0x3f;
      avail -= 3;
    }
    else
      throw std::runtime_error("Invalid UTF-8");

    // Lowercase only when the code point survives the trip through wint_t;
    // otherwise it is passed through unchanged rather than being truncated.
    const wint_t narrow = static_cast<wint_t>(cp);
    if (static_cast<char32_t>(narrow) == cp)
      cp = static_cast<char32_t>(std::towlower(narrow));

    // Re-encode using the length that the (possibly changed) code point needs.
    char wbuf[4];
    size_t bytes;
    if (cp <= 0x7f)
    {
      wbuf[0] = static_cast<char>(cp);
      bytes = 1;
    }
    else if (cp <= 0x7ff)
    {
      wbuf[0] = static_cast<char>(0xc0 | (cp >> 6));
      wbuf[1] = static_cast<char>(0x80 | (cp & 0x3f));
      bytes = 2;
    }
    else if (cp <= 0xffff)
    {
      wbuf[0] = static_cast<char>(0xe0 | (cp >> 12));
      wbuf[1] = static_cast<char>(0x80 | ((cp >> 6) & 0x3f));
      wbuf[2] = static_cast<char>(0x80 | (cp & 0x3f));
      bytes = 3;
    }
    else
    {
      wbuf[0] = static_cast<char>(0xf0 | (cp >> 18));
      wbuf[1] = static_cast<char>(0x80 | ((cp >> 12) & 0x3f));
      wbuf[2] = static_cast<char>(0x80 | ((cp >> 6) & 0x3f));
      wbuf[3] = static_cast<char>(0x80 | (cp & 0x3f));
      bytes = 4;
    }
    sc += T(wbuf, bytes);
  }
  return sc;
}
143
144 struct WordHash
145 {
146 std::size_t operator()(const epee::wipeable_string &s) const
147 {
149 return epee::fnv::FNV1a(sc.data(), sc.size());
150 }
151 };
152
154 {
156 {
157 const epee::wipeable_string s0c = utf8canonical(s0);
158 const epee::wipeable_string s1c = utf8canonical(s1);
159 return s0c == s1c;
160 }
161 };
162
168 class Base
169 {
170 protected:
171 enum {
174 };
175 enum {
176 NWORDS = 1626
177 };
178 std::vector<std::string> word_list;
179 std::unordered_map<epee::wipeable_string, uint32_t, WordHash, WordEqual> word_map;
180 std::unordered_map<epee::wipeable_string, uint32_t, WordHash, WordEqual> trimmed_word_map;
181 std::string language_name;
187 void populate_maps(uint32_t flags = 0)
188 {
189 int ii;
190 std::vector<std::string>::const_iterator it;
191 if (word_list.size () != NWORDS)
192 throw std::runtime_error("Wrong word list length for " + language_name);
193 for (it = word_list.begin(), ii = 0; it != word_list.end(); it++, ii++)
194 {
195 word_map[*it] = ii;
196 if ((*it).size() < unique_prefix_length)
197 {
198 if (flags & ALLOW_SHORT_WORDS)
199 MWARNING(language_name << " word '" << *it << "' is shorter than its prefix length, " << unique_prefix_length);
200 else
201 throw std::runtime_error("Too short word in " + language_name + " word list: " + *it);
202 }
203 epee::wipeable_string trimmed;
204 if (it->length() > unique_prefix_length)
205 {
206 trimmed = utf8prefix(*it, unique_prefix_length);
207 }
208 else
209 {
210 trimmed = *it;
211 }
212 if (trimmed_word_map.find(trimmed) != trimmed_word_map.end())
213 {
214 if (flags & ALLOW_DUPLICATE_PREFIXES)
215 MWARNING("Duplicate prefix in " << language_name << " word list: " << std::string(trimmed.data(), trimmed.size()));
216 else
217 throw std::runtime_error("Duplicate prefix in " + language_name + " word list: " + std::string(trimmed.data(), trimmed.size()));
218 }
219 trimmed_word_map[trimmed] = ii;
220 }
221 }
222 public:
223 Base(const char *language_name, const char *english_language_name, const std::vector<std::string> &words, uint32_t prefix_length):
224 word_list(words),
225 unique_prefix_length(prefix_length),
228 {
229 }
230 virtual ~Base()
231 {
232 }
233 void set_words(const char * const words[])
234 {
235 word_list.resize(NWORDS);
236 for (size_t i = 0; i < NWORDS; ++i)
237 word_list[i] = words[i];
238 }
239
243 const std::vector<std::string>& get_word_list() const
244 {
245 return word_list;
246 }
247
251 const std::unordered_map<epee::wipeable_string, uint32_t, WordHash, WordEqual>& get_word_map() const
252 {
253 return word_map;
254 }
255
259 const std::unordered_map<epee::wipeable_string, uint32_t, WordHash, WordEqual>& get_trimmed_word_map() const
260 {
261 return trimmed_word_map;
262 }
263
267 const std::string &get_language_name() const
268 {
269 return language_name;
270 }
271
275 const std::string &get_english_language_name() const
276 {
278 }
279
287 };
288}
289
290#endif
const std::vector< std::string > & get_word_list() const
Returns a const reference to the word list.
std::unordered_map< epee::wipeable_string, uint32_t, WordHash, WordEqual > word_map
const std::string & get_language_name() const
Returns the name of the language.
std::string language_name
const std::unordered_map< epee::wipeable_string, uint32_t, WordHash, WordEqual > & get_trimmed_word_map() const
Returns a const reference to the trimmed word map.
const std::unordered_map< epee::wipeable_string, uint32_t, WordHash, WordEqual > & get_word_map() const
Returns a const reference to the word map.
void set_words(const char *const words[])
std::vector< std::string > word_list
std::string english_language_name
Base(const char *language_name, const char *english_language_name, const std::vector< std::string > &words, uint32_t prefix_length)
const std::string & get_english_language_name() const
Returns the name of the language in English.
void populate_maps(uint32_t flags=0)
Populates the word maps after the list is ready.
std::unordered_map< epee::wipeable_string, uint32_t, WordHash, WordEqual > trimmed_word_map
uint32_t unique_prefix_length
uint32_t get_unique_prefix_length() const
Returns the number of unique starting characters to be used for matching.
const char * data() const noexcept
size_t size() const noexcept
#define MWARNING(x)
Definition misc_log_ex.h:74
Mnemonic language related namespace.
T utf8prefix(const T &s, size_t count)
Returns a string made of (at most) the first count characters in s. Assumes well formedness....
T utf8canonical(const T &s)
uint64_t FNV1a(const char *ptr, size_t sz)
Definition fnv1.h:36
unsigned int uint32_t
Definition stdint.h:126
bool operator()(const epee::wipeable_string &s0, const epee::wipeable_string &s1) const
std::size_t operator()(const epee::wipeable_string &s) const
#define T(x)