Ada 3.3.0
Fast spec-compliant URL parser
Loading...
Searching...
No Matches
unicode.cpp
Go to the documentation of this file.
1#include "ada/common_defs.h"
4#include "ada/unicode.h"
5#include "ada/log.h"
6
8#include "ada_idna.cpp"
10
11#include <algorithm>
12#if ADA_NEON
13#include <arm_neon.h>
14#elif ADA_SSE2
15#include <emmintrin.h>
16#elif ADA_LSX
17#include <lsxintrin.h>
18#endif
19
20#include <ranges>
21
22namespace ada::unicode {
23
24constexpr bool is_tabs_or_newline(char c) noexcept {
25 return c == '\r' || c == '\n' || c == '\t';
26}
27
28constexpr uint64_t broadcast(uint8_t v) noexcept {
29 return 0x101010101010101ull * v;
30}
31
32constexpr bool to_lower_ascii(char* input, size_t length) noexcept {
33 uint64_t broadcast_80 = broadcast(0x80);
34 uint64_t broadcast_Ap = broadcast(128 - 'A');
35 uint64_t broadcast_Zp = broadcast(128 - 'Z' - 1);
36 uint64_t non_ascii = 0;
37 size_t i = 0;
38
39 for (; i + 7 < length; i += 8) {
40 uint64_t word{};
41 memcpy(&word, input + i, sizeof(word));
42 non_ascii |= (word & broadcast_80);
43 word ^=
44 (((word + broadcast_Ap) ^ (word + broadcast_Zp)) & broadcast_80) >> 2;
45 memcpy(input + i, &word, sizeof(word));
46 }
47 if (i < length) {
48 uint64_t word{};
49 memcpy(&word, input + i, length - i);
50 non_ascii |= (word & broadcast_80);
51 word ^=
52 (((word + broadcast_Ap) ^ (word + broadcast_Zp)) & broadcast_80) >> 2;
53 memcpy(input + i, &word, length - i);
54 }
55 return non_ascii == 0;
56}
57#if ADA_NEON
58ada_really_inline bool has_tabs_or_newline(
59 std::string_view user_input) noexcept {
60 // first check for short strings in which case we do it naively.
61 if (user_input.size() < 16) { // slow path
62 return std::ranges::any_of(user_input, is_tabs_or_newline);
63 }
64 // fast path for long strings (expected to be common)
65 size_t i = 0;
78 static uint8_t rnt_array[16] = {1, 0, 0, 0, 0, 0, 0, 0,
79 0, 9, 10, 0, 0, 13, 0, 0};
80 const uint8x16_t rnt = vld1q_u8(rnt_array);
81 // m['0xd', '0xa', '0x9']
82 uint8x16_t running{0};
83 for (; i + 15 < user_input.size(); i += 16) {
84 uint8x16_t word = vld1q_u8((const uint8_t*)user_input.data() + i);
85
86 running = vorrq_u8(running, vceqq_u8(vqtbl1q_u8(rnt, word), word));
87 }
88 if (i < user_input.size()) {
89 uint8x16_t word =
90 vld1q_u8((const uint8_t*)user_input.data() + user_input.length() - 16);
91 running = vorrq_u8(running, vceqq_u8(vqtbl1q_u8(rnt, word), word));
92 }
93 return vmaxvq_u32(vreinterpretq_u32_u8(running)) != 0;
94}
95#elif ADA_SSE2
96ada_really_inline bool has_tabs_or_newline(
97 std::string_view user_input) noexcept {
98 // first check for short strings in which case we do it naively.
99 if (user_input.size() < 16) { // slow path
100 return std::ranges::any_of(user_input, is_tabs_or_newline);
101 }
102 // fast path for long strings (expected to be common)
103 size_t i = 0;
104 const __m128i mask1 = _mm_set1_epi8('\r');
105 const __m128i mask2 = _mm_set1_epi8('\n');
106 const __m128i mask3 = _mm_set1_epi8('\t');
107 // If we supported SSSE3, we could use the algorithm that we use for NEON.
108 __m128i running{0};
109 for (; i + 15 < user_input.size(); i += 16) {
110 __m128i word = _mm_loadu_si128((const __m128i*)(user_input.data() + i));
111 running = _mm_or_si128(
112 _mm_or_si128(running, _mm_or_si128(_mm_cmpeq_epi8(word, mask1),
113 _mm_cmpeq_epi8(word, mask2))),
114 _mm_cmpeq_epi8(word, mask3));
115 }
116 if (i < user_input.size()) {
117 __m128i word = _mm_loadu_si128(
118 (const __m128i*)(user_input.data() + user_input.length() - 16));
119 running = _mm_or_si128(
120 _mm_or_si128(running, _mm_or_si128(_mm_cmpeq_epi8(word, mask1),
121 _mm_cmpeq_epi8(word, mask2))),
122 _mm_cmpeq_epi8(word, mask3));
123 }
124 return _mm_movemask_epi8(running) != 0;
125}
126#elif ADA_LSX
127ada_really_inline bool has_tabs_or_newline(
128 std::string_view user_input) noexcept {
129 // first check for short strings in which case we do it naively.
130 if (user_input.size() < 16) { // slow path
131 return std::ranges::any_of(user_input, is_tabs_or_newline);
132 }
133 // fast path for long strings (expected to be common)
134 size_t i = 0;
135 const __m128i mask1 = __lsx_vrepli_b('\r');
136 const __m128i mask2 = __lsx_vrepli_b('\n');
137 const __m128i mask3 = __lsx_vrepli_b('\t');
138 // If we supported SSSE3, we could use the algorithm that we use for NEON.
139 __m128i running{0};
140 for (; i + 15 < user_input.size(); i += 16) {
141 __m128i word = __lsx_vld((const __m128i*)(user_input.data() + i), 0);
142 running = __lsx_vor_v(
143 __lsx_vor_v(running, __lsx_vor_v(__lsx_vseq_b(word, mask1),
144 __lsx_vseq_b(word, mask2))),
145 __lsx_vseq_b(word, mask3));
146 }
147 if (i < user_input.size()) {
148 __m128i word = __lsx_vld(
149 (const __m128i*)(user_input.data() + user_input.length() - 16), 0);
150 running = __lsx_vor_v(
151 __lsx_vor_v(running, __lsx_vor_v(__lsx_vseq_b(word, mask1),
152 __lsx_vseq_b(word, mask2))),
153 __lsx_vseq_b(word, mask3));
154 }
155 if (__lsx_bz_v(running)) return false;
156 return true;
157}
158#else
159ada_really_inline bool has_tabs_or_newline(
160 std::string_view user_input) noexcept {
161 auto has_zero_byte = [](uint64_t v) {
162 return ((v - 0x0101010101010101) & ~(v) & 0x8080808080808080);
163 };
164 size_t i = 0;
165 uint64_t mask1 = broadcast('\r');
166 uint64_t mask2 = broadcast('\n');
167 uint64_t mask3 = broadcast('\t');
168 uint64_t running{0};
169 for (; i + 7 < user_input.size(); i += 8) {
170 uint64_t word{};
171 memcpy(&word, user_input.data() + i, sizeof(word));
172 uint64_t xor1 = word ^ mask1;
173 uint64_t xor2 = word ^ mask2;
174 uint64_t xor3 = word ^ mask3;
175 running |= has_zero_byte(xor1) | has_zero_byte(xor2) | has_zero_byte(xor3);
176 }
177 if (i < user_input.size()) {
178 uint64_t word{};
179 memcpy(&word, user_input.data() + i, user_input.size() - i);
180 uint64_t xor1 = word ^ mask1;
181 uint64_t xor2 = word ^ mask2;
182 uint64_t xor3 = word ^ mask3;
183 running |= has_zero_byte(xor1) | has_zero_byte(xor2) | has_zero_byte(xor3);
184 }
185 return running;
186}
187#endif
188
189// A forbidden host code point is U+0000 NULL, U+0009 TAB, U+000A LF, U+000D CR,
190// U+0020 SPACE, U+0023 (#), U+002F (/), U+003A (:), U+003C (<), U+003E (>),
191// U+003F (?), U+0040 (@), U+005B ([), U+005C (\‍), U+005D (]), U+005E (^), or
192// U+007C (|).
193constexpr static std::array<uint8_t, 256> is_forbidden_host_code_point_table =
194 []() consteval {
195 std::array<uint8_t, 256> result{};
196 for (uint8_t c : {'\0', '\x09', '\x0a', '\x0d', ' ', '#', '/', ':', '<',
197 '>', '?', '@', '[', '\\', ']', '^', '|'}) {
198 result[c] = true;
199 }
200 return result;
201 }();
202
203ada_really_inline constexpr bool is_forbidden_host_code_point(
204 const char c) noexcept {
205 return is_forbidden_host_code_point_table[uint8_t(c)];
206}
207
208constexpr static std::array<uint8_t, 256> is_forbidden_domain_code_point_table =
209 []() consteval {
210 std::array<uint8_t, 256> result{};
211 for (uint8_t c : {'\0', '\x09', '\x0a', '\x0d', ' ', '#', '/', ':', '<',
212 '>', '?', '@', '[', '\\', ']', '^', '|', '%'}) {
213 result[c] = true;
214 }
215 for (uint8_t c = 0; c <= 32; c++) {
216 result[c] = true;
217 }
218 for (size_t c = 127; c < 255; c++) {
219 result[c] = true;
220 }
221 return result;
222 }();
223
224static_assert(sizeof(is_forbidden_domain_code_point_table) == 256);
225
226ada_really_inline constexpr bool is_forbidden_domain_code_point(
227 const char c) noexcept {
228 return is_forbidden_domain_code_point_table[uint8_t(c)];
229}
230
231ada_really_inline constexpr bool contains_forbidden_domain_code_point(
232 const char* input, size_t length) noexcept {
233 size_t i = 0;
234 uint8_t accumulator{};
235 for (; i + 4 <= length; i += 4) {
236 accumulator |= is_forbidden_domain_code_point_table[uint8_t(input[i])];
237 accumulator |= is_forbidden_domain_code_point_table[uint8_t(input[i + 1])];
238 accumulator |= is_forbidden_domain_code_point_table[uint8_t(input[i + 2])];
239 accumulator |= is_forbidden_domain_code_point_table[uint8_t(input[i + 3])];
240 }
241 for (; i < length; i++) {
242 accumulator |= is_forbidden_domain_code_point_table[uint8_t(input[i])];
243 }
244 return accumulator;
245}
246
247constexpr static std::array<uint8_t, 256>
249 std::array<uint8_t, 256> result{};
250 for (uint8_t c : {'\0', '\x09', '\x0a', '\x0d', ' ', '#', '/', ':', '<',
251 '>', '?', '@', '[', '\\', ']', '^', '|', '%'}) {
252 result[c] = 1;
253 }
254 for (uint8_t c = 'A'; c <= 'Z'; c++) {
255 result[c] = 2;
256 }
257 for (uint8_t c = 0; c <= 32; c++) {
258 result[c] = 1;
259 }
260 for (size_t c = 127; c < 255; c++) {
261 result[c] = 1;
262 }
263 return result;
264 }();
265
266ada_really_inline constexpr uint8_t
267contains_forbidden_domain_code_point_or_upper(const char* input,
268 size_t length) noexcept {
269 size_t i = 0;
270 uint8_t accumulator{};
271 for (; i + 4 <= length; i += 4) {
272 accumulator |=
274 accumulator |=
276 accumulator |=
278 accumulator |=
280 }
281 for (; i < length; i++) {
282 accumulator |=
284 }
285 return accumulator;
286}
287
288// std::isalnum(c) || c == '+' || c == '-' || c == '.') is true for
289constexpr static std::array<bool, 256> is_alnum_plus_table = []() consteval {
290 std::array<bool, 256> result{};
291 for (size_t c = 0; c < 256; c++) {
292 result[c] = (c >= '0' && c <= '9') || (c >= 'a' && c <= 'z') ||
293 (c >= 'A' && c <= 'Z') || c == '+' || c == '-' || c == '.';
294 }
295 return result;
296}();
297
298ada_really_inline constexpr bool is_alnum_plus(const char c) noexcept {
299 return is_alnum_plus_table[uint8_t(c)];
300 // A table is almost surely much faster than the
301 // following under most compilers: return
302 // return (std::isalnum(c) || c == '+' || c == '-' || c == '.');
303}
304
305ada_really_inline constexpr bool is_ascii_hex_digit(const char c) noexcept {
306 return (c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') ||
307 (c >= 'a' && c <= 'f');
308}
309
310ada_really_inline constexpr bool is_ascii_digit(const char c) noexcept {
311 // An ASCII digit is a code point in the range U+0030 (0) to U+0039 (9),
312 // inclusive.
313 return (c >= '0' && c <= '9');
314}
315
316ada_really_inline constexpr bool is_ascii(const char32_t c) noexcept {
317 // If code point is between U+0000 and U+007F inclusive, then return true.
318 return c <= 0x7F;
319}
320
321ada_really_inline constexpr bool is_c0_control_or_space(const char c) noexcept {
322 return (unsigned char)c <= ' ';
323}
324
325ada_really_inline constexpr bool is_ascii_tab_or_newline(
326 const char c) noexcept {
327 return c == '\t' || c == '\n' || c == '\r';
328}
329
330constexpr std::string_view table_is_double_dot_path_segment[] = {
331 "..", "%2e.", ".%2e", "%2e%2e"};
332
333ada_really_inline constexpr bool is_double_dot_path_segment(
334 std::string_view input) noexcept {
335 // This will catch most cases:
336 // The length must be 2,4 or 6.
337 // We divide by two and require
338 // that the result be between 1 and 3 inclusively.
339 uint64_t half_length = uint64_t(input.size()) / 2;
340 if (half_length - 1 > 2) {
341 return false;
342 }
343 // We have a string of length 2, 4 or 6.
344 // We now check the first character:
345 if ((input[0] != '.') && (input[0] != '%')) {
346 return false;
347 }
348 // We are unlikely the get beyond this point.
349 int hash_value = (input.size() + (unsigned)(input[0])) & 3;
350 const std::string_view target = table_is_double_dot_path_segment[hash_value];
351 if (target.size() != input.size()) {
352 return false;
353 }
354 // We almost never get here.
355 // Optimizing the rest is relatively unimportant.
356 auto prefix_equal_unsafe = [](std::string_view a, std::string_view b) {
357 uint16_t A, B;
358 memcpy(&A, a.data(), sizeof(A));
359 memcpy(&B, b.data(), sizeof(B));
360 return A == B;
361 };
362 if (!prefix_equal_unsafe(input, target)) {
363 return false;
364 }
365 for (size_t i = 2; i < input.size(); i++) {
366 char c = input[i];
367 if ((uint8_t((c | 0x20) - 0x61) <= 25 ? (c | 0x20) : c) != target[i]) {
368 return false;
369 }
370 }
371 return true;
372 // The above code might be a bit better than the code below. Compilers
373 // are not stupid and may use the fact that these strings have length 2,4 and
374 // 6 and other tricks.
375 // return input == ".." ||
376 // input == ".%2e" || input == ".%2E" ||
377 // input == "%2e." || input == "%2E." ||
378 // input == "%2e%2e" || input == "%2E%2E" || input == "%2E%2e" || input ==
379 // "%2e%2E";
380}
381
382ada_really_inline constexpr bool is_single_dot_path_segment(
383 std::string_view input) noexcept {
384 return input == "." || input == "%2e" || input == "%2E";
385}
386
387ada_really_inline constexpr bool is_lowercase_hex(const char c) noexcept {
388 return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f');
389}
390
391constexpr static char hex_to_binary_table[] = {
392 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 0, 0, 0, 0, 0, 0, 10, 11,
393 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
394 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 11, 12, 13, 14, 15};
395unsigned constexpr convert_hex_to_binary(const char c) noexcept {
396 return hex_to_binary_table[c - '0'];
397}
398
399std::string percent_decode(const std::string_view input, size_t first_percent) {
400 // next line is for safety only, we expect users to avoid calling
401 // percent_decode when first_percent is outside the range.
402 if (first_percent == std::string_view::npos) {
403 return std::string(input);
404 }
405 std::string dest;
406 dest.reserve(input.length());
407 dest.append(input.substr(0, first_percent));
408 const char* pointer = input.data() + first_percent;
409 const char* end = input.data() + input.size();
410 // Optimization opportunity: if the following code gets
411 // called often, it can be optimized quite a bit.
412 while (pointer < end) {
413 const char ch = pointer[0];
414 size_t remaining = end - pointer - 1;
415 if (ch != '%' || remaining < 2 ||
416 ( // ch == '%' && // It is unnecessary to check that ch == '%'.
417 (!is_ascii_hex_digit(pointer[1]) ||
418 !is_ascii_hex_digit(pointer[2])))) {
419 dest += ch;
420 pointer++;
421 } else {
422 unsigned a = convert_hex_to_binary(pointer[1]);
423 unsigned b = convert_hex_to_binary(pointer[2]);
424 char c = static_cast<char>(a * 16 + b);
425 dest += c;
426 pointer += 3;
427 }
428 }
429 return dest;
430}
431
432std::string percent_encode(const std::string_view input,
433 const uint8_t character_set[]) {
434 auto pointer = std::ranges::find_if(input, [character_set](const char c) {
435 return character_sets::bit_at(character_set, c);
436 });
437 // Optimization: Don't iterate if percent encode is not required
438 if (pointer == input.end()) {
439 return std::string(input);
440 }
441
442 std::string result;
443 result.reserve(input.length()); // in the worst case, percent encoding might
444 // produce 3 characters.
445 result.append(input.substr(0, std::distance(input.begin(), pointer)));
446
447 for (; pointer != input.end(); pointer++) {
448 if (character_sets::bit_at(character_set, *pointer)) {
449 result.append(character_sets::hex + uint8_t(*pointer) * 4, 3);
450 } else {
451 result += *pointer;
452 }
453 }
454
455 return result;
456}
457
458template <bool append>
459bool percent_encode(const std::string_view input, const uint8_t character_set[],
460 std::string& out) {
461 ada_log("percent_encode ", input, " to output string while ",
462 append ? "appending" : "overwriting");
463 auto pointer = std::ranges::find_if(input, [character_set](const char c) {
464 return character_sets::bit_at(character_set, c);
465 });
466 ada_log("percent_encode done checking, moved to ",
467 std::distance(input.begin(), pointer));
468
469 // Optimization: Don't iterate if percent encode is not required
470 if (pointer == input.end()) {
471 ada_log("percent_encode encoding not needed.");
472 return false;
473 }
474 if constexpr (!append) {
475 out.clear();
476 }
477 ada_log("percent_encode appending ", std::distance(input.begin(), pointer),
478 " bytes");
479 // NOLINTNEXTLINE(bugprone-suspicious-stringview-data-usage)
480 out.append(input.data(), std::distance(input.begin(), pointer));
481 ada_log("percent_encode processing ", std::distance(pointer, input.end()),
482 " bytes");
483 for (; pointer != input.end(); pointer++) {
484 if (character_sets::bit_at(character_set, *pointer)) {
485 out.append(character_sets::hex + uint8_t(*pointer) * 4, 3);
486 } else {
487 out += *pointer;
488 }
489 }
490 return true;
491}
492
493bool to_ascii(std::optional<std::string>& out, const std::string_view plain,
494 size_t first_percent) {
495 std::string percent_decoded_buffer;
496 std::string_view input = plain;
497 if (first_percent != std::string_view::npos) {
498 percent_decoded_buffer = unicode::percent_decode(plain, first_percent);
499 input = percent_decoded_buffer;
500 }
501 // input is a non-empty UTF-8 string, must be percent decoded
502 std::string idna_ascii = ada::idna::to_ascii(input);
503 if (idna_ascii.empty() || contains_forbidden_domain_code_point(
504 idna_ascii.data(), idna_ascii.size())) {
505 return false;
506 }
507 out = std::move(idna_ascii);
508 return true;
509}
510
511std::string percent_encode(const std::string_view input,
512 const uint8_t character_set[], size_t index) {
513 std::string out;
514 // NOLINTNEXTLINE(bugprone-suspicious-stringview-data-usage)
515 out.append(input.data(), index);
516 auto pointer = input.begin() + index;
517 for (; pointer != input.end(); pointer++) {
518 if (character_sets::bit_at(character_set, *pointer)) {
519 out.append(character_sets::hex + uint8_t(*pointer) * 4, 3);
520 } else {
521 out += *pointer;
522 }
523 }
524 return out;
525}
526
527} // namespace ada::unicode
Definitions of the character sets used by unicode functions.
Declaration of the character sets used by unicode functions.
Common definitions for cross-platform compiler support.
#define ADA_PUSH_DISABLE_ALL_WARNINGS
Definition common_defs.h:90
#define ADA_POP_DISABLE_WARNINGS
#define ada_really_inline
Definition common_defs.h:81
ada_really_inline constexpr bool bit_at(const uint8_t a[], const uint8_t i)
constexpr char hex[1024]
std::string to_ascii(std::string_view ut8_string)
Includes the declarations for unicode operations.
static constexpr std::array< uint8_t, 256 > is_forbidden_domain_code_point_table
Definition unicode.cpp:208
static constexpr std::array< uint8_t, 256 > is_forbidden_domain_code_point_table_or_upper
Definition unicode.cpp:248
static constexpr char hex_to_binary_table[]
Definition unicode.cpp:391
constexpr uint64_t broadcast(uint8_t v) noexcept
Definition unicode.cpp:28
constexpr std::string_view table_is_double_dot_path_segment[]
Definition unicode.cpp:330
constexpr bool is_tabs_or_newline(char c) noexcept
Definition unicode.cpp:24
static constexpr std::array< uint8_t, 256 > is_forbidden_host_code_point_table
Definition unicode.cpp:193
static constexpr std::array< bool, 256 > is_alnum_plus_table
Definition unicode.cpp:289
tl::expected< result_type, ada::errors > result
Definitions for all unicode specific functions.