Monero
Loading...
Searching...
No Matches
utf8.h
Go to the documentation of this file.
1// Copyright (c) 2019-2022, The Monero Project
2
3//
4// All rights reserved.
5//
6// Redistribution and use in source and binary forms, with or without modification, are
7// permitted provided that the following conditions are met:
8//
9// 1. Redistributions of source code must retain the above copyright notice, this list of
10// conditions and the following disclaimer.
11//
12// 2. Redistributions in binary form must reproduce the above copyright notice, this list
13// of conditions and the following disclaimer in the documentation and/or other
14// materials provided with the distribution.
15//
16// 3. Neither the name of the copyright holder nor the names of its contributors may be
17// used to endorse or promote products derived from this software without specific
18// prior written permission.
19//
20// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
21// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
22// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
23// THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
27// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
28// THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
30#pragma once
31
32#include <cctype>
33#include <cwchar>
34#include <stdexcept>
35
namespace tools
{
  // Decodes the UTF-8 byte string `s` one code point at a time, passes every
  // code point through `t`, and re-encodes the result as UTF-8.
  //
  // T          String-like type. It must be constructible from "" and provide
  //            size(), data() (yielding const char *) and
  //            append(const char *, size_t).
  // Transform  Callable taking and returning a wint_t code point. It defaults
  //            to a plain function pointer so that the identity default
  //            argument below is actually usable: a default function argument
  //            does not take part in template argument deduction, so without
  //            a default template argument `utf8canonical(s)` cannot compile.
  //
  // Returns the re-encoded string. Each code point is written in the shortest
  // UTF-8 form, so an overlong input sequence comes out in its canonical
  // encoding.
  //
  // Throws std::runtime_error("Invalid UTF-8") when a lead byte is not a valid
  // 1-4 byte lead, when the input ends in the middle of a sequence, or when a
  // trailing byte is not of the form 10xxxxxx. Throws
  // std::runtime_error("Invalid code point UTF-8 transformation") when the
  // (transformed) code point is greater than 0x10ffff.
  template<typename T, typename Transform = wint_t (*)(wint_t)>
  inline T utf8canonical(const T &s, Transform t = [](wint_t c)->wint_t { return c; })
  {
    T sc = "";
    size_t avail = s.size();
    const char *ptr = s.data();
    char wbuf[4];

    while (avail > 0)
    {
      // Work on unsigned bytes so the masks below do not depend on whether
      // plain char is signed.
      const unsigned char lead = static_cast<unsigned char>(*ptr++);
      --avail;

      // The lead byte gives the number of continuation bytes that must follow
      // and carries the most significant payload bits.
      wint_t cp = 0;
      size_t trailing = 0;
      if ((lead & 0x80) == 0)
      {
        cp = lead;
        trailing = 0;
      }
      else if ((lead & 0xe0) == 0xc0)
      {
        cp = lead & 0x1f;
        trailing = 1;
      }
      else if ((lead & 0xf0) == 0xe0)
      {
        cp = lead & 0x0f;
        trailing = 2;
      }
      else if ((lead & 0xf8) == 0xf0)
      {
        cp = lead & 0x07;
        trailing = 3;
      }
      else
        throw std::runtime_error("Invalid UTF-8");

      // Truncated sequence: not enough bytes left in the input.
      if (avail < trailing)
        throw std::runtime_error("Invalid UTF-8");

      for (size_t i = 0; i < trailing; ++i)
      {
        const unsigned char next = static_cast<unsigned char>(*ptr++);
        // Every continuation byte must be 10xxxxxx; otherwise the payload
        // bits would silently produce an unrelated code point.
        if ((next & 0xc0) != 0x80)
          throw std::runtime_error("Invalid UTF-8");
        cp = (cp << 6) | (next & 0x3f);
      }
      avail -= trailing;

      cp = t(cp);

      // Re-encode using the shortest form that can hold the code point.
      size_t bytes = 0;
      if (cp <= 0x7f)
      {
        wbuf[0] = static_cast<char>(cp);
        bytes = 1;
      }
      else if (cp <= 0x7ff)
      {
        wbuf[0] = static_cast<char>(0xc0 | (cp >> 6));
        wbuf[1] = static_cast<char>(0x80 | (cp & 0x3f));
        bytes = 2;
      }
      else if (cp <= 0xffff)
      {
        wbuf[0] = static_cast<char>(0xe0 | (cp >> 12));
        wbuf[1] = static_cast<char>(0x80 | ((cp >> 6) & 0x3f));
        wbuf[2] = static_cast<char>(0x80 | (cp & 0x3f));
        bytes = 3;
      }
      else if (cp <= 0x10ffff)
      {
        wbuf[0] = static_cast<char>(0xf0 | (cp >> 18));
        wbuf[1] = static_cast<char>(0x80 | ((cp >> 12) & 0x3f));
        wbuf[2] = static_cast<char>(0x80 | ((cp >> 6) & 0x3f));
        wbuf[3] = static_cast<char>(0x80 | (cp & 0x3f));
        bytes = 4;
      }
      else
        throw std::runtime_error("Invalid code point UTF-8 transformation");

      sc.append(wbuf, bytes);
    }
    return sc;
  }
}
#define s(x, c)
Definition aesb.c:47
static void Transform(hashState *ctx, const uint8_t *input, int msglen)
Definition groestl.c:171
t
Definition console.py:33
Various Tools.
Definition apply_permutation.h:40
T utf8canonical(const T &s, Transform t=[](wint_t c) ->wint_t { return c;})
Definition utf8.h:39
#define T(x)