Monero
utf8.h
Go to the documentation of this file.
1 // Copyright (c) 2019-2022, The Monero Project
2 
3 //
4 // All rights reserved.
5 //
6 // Redistribution and use in source and binary forms, with or without modification, are
7 // permitted provided that the following conditions are met:
8 //
9 // 1. Redistributions of source code must retain the above copyright notice, this list of
10 // conditions and the following disclaimer.
11 //
12 // 2. Redistributions in binary form must reproduce the above copyright notice, this list
13 // of conditions and the following disclaimer in the documentation and/or other
14 // materials provided with the distribution.
15 //
16 // 3. Neither the name of the copyright holder nor the names of its contributors may be
17 // used to endorse or promote products derived from this software without specific
18 // prior written permission.
19 //
20 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
21 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
22 // MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
23 // THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
27 // STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
28 // THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 
30 #pragma once
31 
32 #include <cctype>
33 #include <cwchar>
34 #include <stdexcept>
35 
namespace tools
{
  /**
   * @brief Decode a UTF-8 string, map every code point through @p t, and
   *        re-encode the result.
   *
   * Each decoded code point is written back using the shortest UTF-8 form
   * that can hold the transformed value, so an over-long input sequence comes
   * out in its minimal encoding.
   *
   * @tparam T          String-like type. It must be constructible from a
   *                    string literal and provide `size()`, `data()`
   *                    (yielding `const char *`) and `append(const char *, n)`.
   * @tparam Transform  Callable taking a `wint_t` code point and returning the
   *                    replacement code point. Defaults to a plain function
   *                    pointer so the identity default argument below can be
   *                    used without naming the type at the call site.
   *
   * @param s  Input bytes, expected to be UTF-8.
   * @param t  Per-code-point transform; the default returns its argument.
   *
   * @return The transformed, re-encoded string.
   *
   * @throws std::runtime_error("Invalid UTF-8") if a lead byte is not a valid
   *         1-4 byte sequence start, if the input ends in the middle of a
   *         sequence, or if a continuation byte is not of the form 10xxxxxx.
   * @throws std::runtime_error("Invalid code point UTF-8 transformation") if
   *         the value returned by @p t is greater than 0x10ffff.
   */
  template<typename T, typename Transform = wint_t (*)(wint_t)>
  inline T utf8canonical(const T &s, Transform t = [](wint_t c)->wint_t { return c; })
  {
    T sc = "";
    size_t avail = s.size();
    const char *ptr = s.data();

    // Consume one continuation byte and return its six payload bits. Callers
    // check `avail` first, so the read never runs past the input.
    const auto continuation = [&ptr]() -> wint_t
    {
      const unsigned char byte = static_cast<unsigned char>(*ptr++);
      if ((byte & 0xc0) != 0x80)
        throw std::runtime_error("Invalid UTF-8");
      return byte & 0x3f;
    };

    while (avail--)
    {
      // Work on an unsigned copy so the bit tests do not depend on whether
      // plain `char` is signed on the target platform.
      const unsigned char lead = static_cast<unsigned char>(*ptr++);
      wint_t cp = 0;

      if ((lead & 0x80) == 0)
      {
        // 0xxxxxxx: single byte
        cp = lead;
      }
      else if ((lead & 0xe0) == 0xc0)
      {
        // 110xxxxx 10xxxxxx
        if (avail < 1)
          throw std::runtime_error("Invalid UTF-8");
        cp = (lead & 0x1f) << 6;
        cp |= continuation();
        avail -= 1;
      }
      else if ((lead & 0xf0) == 0xe0)
      {
        // 1110xxxx 10xxxxxx 10xxxxxx
        if (avail < 2)
          throw std::runtime_error("Invalid UTF-8");
        cp = (lead & 0xf) << 12;
        cp |= continuation() << 6;
        cp |= continuation();
        avail -= 2;
      }
      else if ((lead & 0xf8) == 0xf0)
      {
        // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        if (avail < 3)
          throw std::runtime_error("Invalid UTF-8");
        cp = (lead & 0x7) << 18;
        cp |= continuation() << 12;
        cp |= continuation() << 6;
        cp |= continuation();
        avail -= 3;
      }
      else
        throw std::runtime_error("Invalid UTF-8");

      cp = t(cp);

      // The output length depends only on the transformed value, not on how
      // many bytes the input sequence used.
      char wbuf[4];
      size_t bytes = 0;
      if (cp <= 0x7f)
      {
        wbuf[0] = static_cast<char>(cp);
        bytes = 1;
      }
      else if (cp <= 0x7ff)
      {
        wbuf[0] = static_cast<char>(0xc0 | (cp >> 6));
        wbuf[1] = static_cast<char>(0x80 | (cp & 0x3f));
        bytes = 2;
      }
      else if (cp <= 0xffff)
      {
        wbuf[0] = static_cast<char>(0xe0 | (cp >> 12));
        wbuf[1] = static_cast<char>(0x80 | ((cp >> 6) & 0x3f));
        wbuf[2] = static_cast<char>(0x80 | (cp & 0x3f));
        bytes = 3;
      }
      else if (cp <= 0x10ffff)
      {
        wbuf[0] = static_cast<char>(0xf0 | (cp >> 18));
        wbuf[1] = static_cast<char>(0x80 | ((cp >> 12) & 0x3f));
        wbuf[2] = static_cast<char>(0x80 | ((cp >> 6) & 0x3f));
        wbuf[3] = static_cast<char>(0x80 | (cp & 0x3f));
        bytes = 4;
      }
      else
        throw std::runtime_error("Invalid code point UTF-8 transformation");

      sc.append(wbuf, bytes);
    }
    return sc;
  }
}
const uint32_t T[512]
Definition: groestl_tables.h:36
t
Definition: console.py:33
const char * s
Definition: minissdp.c:596
static void Transform(hashState *ctx, const uint8_t *input, int msglen)
Definition: groestl.c:171
Various Tools.
Definition: apply_permutation.h:39
T utf8canonical(const T &s, Transform t=[](wint_t c) ->wint_t { return c;})
Definition: utf8.h:39
c
Definition: pymoduletest.py:79