Monero
utf8.h
Go to the documentation of this file.
1 // Copyright (c) 2019, The Monero Project
2 //
3 // All rights reserved.
4 //
5 // Redistribution and use in source and binary forms, with or without modification, are
6 // permitted provided that the following conditions are met:
7 //
8 // 1. Redistributions of source code must retain the above copyright notice, this list of
9 // conditions and the following disclaimer.
10 //
11 // 2. Redistributions in binary form must reproduce the above copyright notice, this list
12 // of conditions and the following disclaimer in the documentation and/or other
13 // materials provided with the distribution.
14 //
15 // 3. Neither the name of the copyright holder nor the names of its contributors may be
16 // used to endorse or promote products derived from this software without specific
17 // prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
20 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
21 // MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
22 // THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25 // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
26 // STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
27 // THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 
29 #pragma once
30 
31 #include <cctype>
32 #include <cwchar>
33 #include <stdexcept>
34 
namespace tools
{
  /**
   * \brief Decode a UTF-8 string, map every code point through \p t, and re-encode it.
   *
   * Each code point is written back using the shortest encoding for its value, so
   * input that uses an over-long encoding is shortened on output.
   *
   * \tparam T          String-like type; must provide size(), data(), construction from
   *                    a string literal and append(const char *, size_t).
   * \tparam Transform  Callable taking a wint_t code point and returning a wint_t.
   *                    Defaults to a plain function pointer so that the default
   *                    argument below can actually be used (a template parameter
   *                    cannot be deduced from a default argument).
   *
   * \param s  UTF-8 encoded input.
   * \param t  Per code point transformation; the default is the identity.
   *
   * \return The transformed string, UTF-8 encoded.
   *
   * \throws std::runtime_error if the input has an invalid lead byte, a truncated
   *         sequence or a byte that is not a continuation byte where one is expected,
   *         or if \p t returns a value above 0x10ffff.
   */
  template<typename T, typename Transform = wint_t (*)(wint_t)>
  inline T utf8canonical(const T &s, Transform t = [](wint_t c)->wint_t { return c; })
  {
    T sc = "";
    const char *ptr = s.data();
    const char *const end = ptr + s.size();

    // Consumes one continuation byte (10xxxxxx) and yields its six payload bits.
    // Anything else means the sequence is malformed and must not be decoded.
    const auto continuation = [&ptr]() -> wint_t
    {
      const unsigned char byte = static_cast<unsigned char>(*ptr++);
      if ((byte & 0xc0) != 0x80)
        throw std::runtime_error("Invalid UTF-8");
      return byte & 0x3f;
    };

    while (ptr != end)
    {
      // Work on unsigned values so the bit tests do not depend on the signedness of char.
      const unsigned char lead = static_cast<unsigned char>(*ptr++);
      const size_t avail = static_cast<size_t>(end - ptr);

      // The lead byte tells how many continuation bytes follow and carries the top bits.
      size_t trailing = 0;
      wint_t cp = 0;
      if ((lead & 0x80) == 0)
      {
        cp = lead;
        trailing = 0;
      }
      else if ((lead & 0xe0) == 0xc0)
      {
        cp = lead & 0x1f;
        trailing = 1;
      }
      else if ((lead & 0xf0) == 0xe0)
      {
        cp = lead & 0x0f;
        trailing = 2;
      }
      else if ((lead & 0xf8) == 0xf0)
      {
        cp = lead & 0x07;
        trailing = 3;
      }
      else
        throw std::runtime_error("Invalid UTF-8");

      // Refuse to read past the end of the input on a truncated sequence.
      if (avail < trailing)
        throw std::runtime_error("Invalid UTF-8");
      for (; trailing > 0; --trailing)
        cp = (cp << 6) | continuation();

      cp = t(cp);

      // Re-encode using the shortest form for the (possibly changed) code point.
      char wbuf[4];
      size_t bytes = 0;
      if (cp <= 0x7f)
      {
        wbuf[0] = static_cast<char>(cp);
        bytes = 1;
      }
      else if (cp <= 0x7ff)
      {
        wbuf[0] = static_cast<char>(0xc0 | (cp >> 6));
        wbuf[1] = static_cast<char>(0x80 | (cp & 0x3f));
        bytes = 2;
      }
      else if (cp <= 0xffff)
      {
        wbuf[0] = static_cast<char>(0xe0 | (cp >> 12));
        wbuf[1] = static_cast<char>(0x80 | ((cp >> 6) & 0x3f));
        wbuf[2] = static_cast<char>(0x80 | (cp & 0x3f));
        bytes = 3;
      }
      else if (cp <= 0x10ffff)
      {
        wbuf[0] = static_cast<char>(0xf0 | (cp >> 18));
        wbuf[1] = static_cast<char>(0x80 | ((cp >> 12) & 0x3f));
        wbuf[2] = static_cast<char>(0x80 | ((cp >> 6) & 0x3f));
        wbuf[3] = static_cast<char>(0x80 | (cp & 0x3f));
        bytes = 4;
      }
      else
        throw std::runtime_error("Invalid code point UTF-8 transformation");

      sc.append(wbuf, bytes);
    }
    return sc;
  }
}
#define s(x, c)
Definition: aesb.c:47
static void Transform(hashState *ctx, const uint8_t *input, int msglen)
Definition: groestl.c:171
const uint32_t T[512]
Definition: groestl_tables.h:36
Various Tools.
Definition: apply_permutation.h:40
T utf8canonical(const T &s, Transform t=[](wint_t c) ->wint_t { return c;})
Definition: utf8.h:38