1#if ADA_INCLUDE_URL_PATTERN
8namespace ada::url_pattern_helpers {
10std::tuple<std::string, std::vector<std::string>>
11generate_regular_expression_and_name_list(
12 const std::vector<url_pattern_part>& part_list,
13 url_pattern_compile_component_options options) {
18 std::vector<std::string> name_list{};
21 for (
const url_pattern_part& part : part_list) {
23 if (part.type == url_pattern_part_type::FIXED_TEXT) {
25 if (part.modifier == url_pattern_part_modifier::none) {
28 result += escape_regexp_string(part.value);
36 result.append(escape_regexp_string(part.value));
41 result.append(convert_modifier_to_string(part.modifier));
50 name_list.push_back(part.name);
53 std::string regexp_value = part.value;
56 if (part.type == url_pattern_part_type::SEGMENT_WILDCARD) {
59 regexp_value = generate_segment_wildcard_regexp(options);
62 else if (part.type == url_pattern_part_type::FULL_WILDCARD) {
69 if (part.prefix.empty() && part.suffix.empty()) {
71 if (part.modifier == url_pattern_part_modifier::none ||
72 part.modifier == url_pattern_part_modifier::optional) {
74 result +=
"(" + regexp_value +
")" +
75 convert_modifier_to_string(part.modifier);
78 result +=
"((?:" + regexp_value +
")" +
79 convert_modifier_to_string(part.modifier) +
")";
85 if (part.modifier == url_pattern_part_modifier::none ||
86 part.modifier == url_pattern_part_modifier::optional) {
88 result +=
"(?:" + escape_regexp_string(part.prefix) +
"(" + regexp_value +
89 ")" + escape_regexp_string(part.suffix) +
")" +
90 convert_modifier_to_string(part.modifier);
95 ADA_ASSERT_TRUE(part.modifier == url_pattern_part_modifier::zero_or_more ||
96 part.modifier == url_pattern_part_modifier::one_or_more);
108 result.append(escape_regexp_string(part.prefix));
112 result.append(regexp_value);
117 result.append(escape_regexp_string(part.suffix));
120 result.append(escape_regexp_string(part.prefix));
124 result.append(regexp_value);
129 result.append(escape_regexp_string(part.suffix));
134 if (part.modifier == url_pattern_part_modifier::zero_or_more) {
143 return {std::move(result), std::move(name_list)};
/// Reports whether `input` begins like a bracketed IPv6 host.
///
/// Three openings are accepted: a bare `[`, a `{[` pair, or a
/// backslash-escaped `\[`. Inputs shorter than two characters are always
/// rejected, including a lone `[`.
///
/// @param input hostname text to inspect.
/// @return true when one of the three openings is present.
bool is_ipv6_address(std::string_view input) noexcept {
  if (input.size() < 2) {
    return false;
  }

  const char first = input[0];
  if (first == '[') {
    return true;
  }

  // Otherwise the bracket must sit in second position, introduced by either
  // a brace or a backslash.
  const bool introducer = (first == '{') || (first == '\\');
  return introducer && input[1] == '[';
}
161std::string convert_modifier_to_string(url_pattern_part_modifier modifier) {
165 case url_pattern_part_modifier::zero_or_more:
168 case url_pattern_part_modifier::optional:
171 case url_pattern_part_modifier::one_or_more:
179std::string generate_segment_wildcard_regexp(
180 url_pattern_compile_component_options options) {
182 std::string
result =
"[^";
185 result.append(escape_regexp_string(options.get_delimiter()));
189 ada_log(
"generate_segment_wildcard_regexp result: ", result);
193tl::expected<std::string, errors> canonicalize_protocol(
194 std::string_view input) {
195 ada_log(
"canonicalize_protocol called with input=", input);
197 if (input.empty()) [[unlikely]] {
202 if (input.ends_with(
":")) {
203 input.remove_suffix(1);
210 std::string(input) +
"://dummy.test",
nullptr)) {
213 auto protocol = dummy_url->get_protocol();
214 protocol.remove_suffix(1);
215 return std::string(protocol);
218 return tl::unexpected(errors::type_error);
221tl::expected<std::string, errors> canonicalize_username(
222 std::string_view input) {
224 if (input.empty()) [[unlikely]] {
231 if (!url->set_username(input)) {
232 return tl::unexpected(errors::type_error);
235 return std::string(url->get_username());
238tl::expected<std::string, errors> canonicalize_password(
239 std::string_view input) {
241 if (input.empty()) [[unlikely]] {
249 if (!url->set_password(input)) {
250 return tl::unexpected(errors::type_error);
253 return std::string(url->get_password());
256tl::expected<std::string, errors> canonicalize_hostname(
257 std::string_view input) {
258 ada_log(
"canonicalize_hostname input=", input);
260 if (input.empty()) [[unlikely]] {
272 if (!url->set_hostname(input)) {
274 return tl::unexpected(errors::type_error);
277 return std::string(url->get_hostname());
280tl::expected<std::string, errors> canonicalize_ipv6_hostname(
281 std::string_view input) {
282 ada_log(
"canonicalize_ipv6_hostname input=", input);
284 if (std::ranges::any_of(input, [](
char c) {
285 return c !=
'[' && c !=
']' && c !=
':' &&
286 !unicode::is_ascii_hex_digit(c);
288 return tl::unexpected(errors::type_error);
292 auto hostname = std::string(input);
293 unicode::to_lower_ascii(hostname.data(), hostname.size());
297tl::expected<std::string, errors> canonicalize_port(
298 std::string_view port_value) {
300 if (port_value.empty()) [[unlikely]] {
309 if (url->set_port(port_value)) {
311 return std::string(url->get_port());
314 return tl::unexpected(errors::type_error);
317tl::expected<std::string, errors> canonicalize_port_with_protocol(
318 std::string_view port_value, std::string_view protocol) {
320 if (port_value.empty()) [[unlikely]] {
327 if (protocol.empty()) {
329 }
else if (protocol.ends_with(
":")) {
330 protocol.remove_suffix(1);
341 if (url && url->set_port(port_value) && url->has_port()) {
343 return std::string(url->get_port());
347 if (scheme::is_special(protocol) && url->get_port().empty()) {
352 return tl::unexpected(errors::type_error);
355tl::expected<std::string, errors> canonicalize_pathname(
356 std::string_view input) {
358 if (input.empty()) [[unlikely]] {
363 const bool leading_slash = input.starts_with(
"/");
366 const auto modified_value = leading_slash ?
"" :
"/-";
367 const auto full_url =
368 std::string(
"fake://fake-url") + modified_value + std::string(input);
370 const auto pathname = url->get_pathname();
373 return leading_slash ? std::string(pathname)
374 : std::string(pathname.substr(2));
377 return tl::unexpected(errors::type_error);
380tl::expected<std::string, errors> canonicalize_opaque_pathname(
381 std::string_view input) {
383 if (input.empty()) [[unlikely]] {
393 return std::string(url->get_pathname());
396 return tl::unexpected(errors::type_error);
399tl::expected<std::string, errors> canonicalize_search(std::string_view input) {
401 if (input.empty()) [[unlikely]] {
410 url->set_search(input);
411 if (url->has_search()) {
412 const auto search = url->get_search();
413 if (!search.empty()) {
414 return std::string(search.substr(1));
418 return tl::unexpected(errors::type_error);
421tl::expected<std::string, errors> canonicalize_hash(std::string_view input) {
423 if (input.empty()) [[unlikely]] {
432 url->set_hash(input);
434 if (url->has_hash()) {
435 const auto hash = url->get_hash();
437 return std::string(hash.substr(1));
441 return tl::unexpected(errors::type_error);
444tl::expected<std::vector<token>,
errors> tokenize(std::string_view input,
445 token_policy policy) {
446 ada_log(
"tokenize input: ", input);
450 auto tokenizer = Tokenizer(input, policy);
452 while (tokenizer.index < tokenizer.input.size()) {
455 tokenizer.seek_and_get_next_code_point(tokenizer.index);
458 if (tokenizer.code_point ==
'*') {
461 tokenizer.add_token_with_defaults(token_type::ASTERISK);
462 ada_log(
"add ASTERISK token");
468 if (tokenizer.code_point ==
'+' || tokenizer.code_point ==
'?') {
471 tokenizer.add_token_with_defaults(token_type::OTHER_MODIFIER);
477 if (tokenizer.code_point ==
'\\') {
480 if (tokenizer.index == tokenizer.input.size() - 1) {
483 if (
auto error = tokenizer.process_tokenizing_error(
484 tokenizer.next_index, tokenizer.index)) {
485 ada_log(
"process_tokenizing_error failed");
486 return tl::unexpected(*error);
492 auto escaped_index = tokenizer.next_index;
494 tokenizer.get_next_code_point();
497 tokenizer.add_token_with_default_length(
498 token_type::ESCAPED_CHAR, tokenizer.next_index, escaped_index);
499 ada_log(
"add ESCAPED_CHAR token on next_index ", tokenizer.next_index,
500 " with escaped index ", escaped_index);
506 if (tokenizer.code_point ==
'{') {
509 tokenizer.add_token_with_defaults(token_type::OPEN);
510 ada_log(
"add OPEN token");
515 if (tokenizer.code_point ==
'}') {
518 tokenizer.add_token_with_defaults(token_type::CLOSE);
519 ada_log(
"add CLOSE token");
524 if (tokenizer.code_point ==
':') {
526 auto name_position = tokenizer.next_index;
528 auto name_start = name_position;
530 while (name_position < tokenizer.input.size()) {
533 tokenizer.seek_and_get_next_code_point(name_position);
536 bool first_code_point = name_position == name_start;
539 auto valid_code_point =
540 idna::valid_name_code_point(tokenizer.code_point, first_code_point);
541 ada_log(
"tokenizer.code_point=", uint32_t(tokenizer.code_point),
542 " first_code_point=", first_code_point,
543 " valid_code_point=", valid_code_point);
545 if (!valid_code_point)
break;
547 name_position = tokenizer.next_index;
551 if (name_position <= name_start) {
554 if (
auto error = tokenizer.process_tokenizing_error(name_start,
556 ada_log(
"process_tokenizing_error failed");
557 return tl::unexpected(*error);
565 tokenizer.add_token_with_default_length(token_type::NAME, name_position,
571 if (tokenizer.code_point ==
'(') {
575 auto regexp_position = tokenizer.next_index;
577 auto regexp_start = regexp_position;
583 while (regexp_position < tokenizer.input.size()) {
586 tokenizer.seek_and_get_next_code_point(regexp_position);
591 if (!unicode::is_ascii(tokenizer.code_point)) {
594 if (
auto process_error = tokenizer.process_tokenizing_error(
595 regexp_start, tokenizer.index)) {
596 return tl::unexpected(*process_error);
605 if (regexp_position == regexp_start && tokenizer.code_point ==
'?') {
608 if (
auto process_error = tokenizer.process_tokenizing_error(
609 regexp_start, tokenizer.index)) {
610 return tl::unexpected(*process_error);
618 if (tokenizer.code_point ==
'\\') {
620 if (regexp_position == tokenizer.input.size() - 1) {
623 if (
auto process_error = tokenizer.process_tokenizing_error(
624 regexp_start, tokenizer.index)) {
625 return tl::unexpected(*process_error);
632 tokenizer.get_next_code_point();
635 if (!unicode::is_ascii(tokenizer.code_point)) {
638 if (
auto process_error = tokenizer.process_tokenizing_error(
639 regexp_start, tokenizer.index);
640 process_error.has_value()) {
641 return tl::unexpected(*process_error);
648 regexp_position = tokenizer.next_index;
653 if (tokenizer.code_point ==
')') {
659 regexp_position = tokenizer.next_index;
663 }
else if (tokenizer.code_point ==
'(') {
669 if (regexp_position == tokenizer.input.size() - 1) {
672 if (
auto process_error = tokenizer.process_tokenizing_error(
673 regexp_start, tokenizer.index)) {
674 return tl::unexpected(*process_error);
681 auto temporary_position = tokenizer.next_index;
683 tokenizer.get_next_code_point();
685 if (tokenizer.code_point !=
'?') {
688 if (
auto process_error = tokenizer.process_tokenizing_error(
689 regexp_start, tokenizer.index)) {
690 return tl::unexpected(*process_error);
697 tokenizer.next_index = temporary_position;
700 regexp_position = tokenizer.next_index;
709 if (
auto process_error = tokenizer.process_tokenizing_error(
710 regexp_start, tokenizer.index)) {
711 return tl::unexpected(*process_error);
716 auto regexp_length = regexp_position - regexp_start - 1;
718 if (regexp_length == 0) {
721 if (
auto process_error = tokenizer.process_tokenizing_error(
722 regexp_start, tokenizer.index)) {
723 ada_log(
"process_tokenizing_error failed");
724 return tl::unexpected(*process_error);
730 tokenizer.add_token(token_type::REGEXP, regexp_position, regexp_start,
736 tokenizer.add_token_with_defaults(token_type::CHAR);
740 tokenizer.add_token_with_default_length(token_type::END, tokenizer.index,
743 ada_log(
"tokenizer.token_list size is: ", tokenizer.token_list.size());
745 return tokenizer.token_list;
748std::string escape_pattern_string(std::string_view input) {
749 ada_log(
"escape_pattern_string called with input=", input);
750 if (input.empty()) [[unlikely]] {
757 result.reserve(input.size());
760 constexpr auto should_escape = [](
const char c) {
761 return c ==
'+' || c ==
'*' || c ==
'?' || c ==
':' || c ==
'{' ||
762 c ==
'}' || c ==
'(' || c ==
')' || c ==
'\\';
766 for (
const auto& c : input) {
767 if (should_escape(c)) {
780constexpr std::array<uint8_t, 256> escape_regexp_table = []()
consteval {
781 std::array<uint8_t, 256> out{};
782 for (
auto& c : {
'.',
'+',
'*',
'?',
'^',
'$',
'{',
'}',
'(',
')',
'[',
']',
789constexpr bool should_escape_regexp_char(
char c) {
790 return escape_regexp_table[(uint8_t)c];
794std::string escape_regexp_string(std::string_view input) {
799 result.reserve(input.size());
800 for (
const auto& c : input) {
802 if (should_escape_regexp_char(c)) {
803 result.append(std::string(
"\\") + c);
811std::string process_base_url_string(std::string_view input,
812 url_pattern_init::process_type type) {
814 if (type != url_pattern_init::process_type::pattern) {
815 return std::string(input);
818 return escape_pattern_string(input);
821constexpr bool is_absolute_pathname(
822 std::string_view input, url_pattern_init::process_type type)
noexcept {
824 if (input.empty()) [[unlikely]] {
828 if (input.starts_with(
"/"))
return true;
830 if (type == url_pattern_init::process_type::url)
return false;
832 if (input.size() < 2)
return false;
836 return input[1] ==
'/' && (input[0] ==
'\\' || input[0] ==
'{');
839std::string generate_pattern_string(
840 std::vector<url_pattern_part>& part_list,
841 url_pattern_compile_component_options& options) {
846 for (
size_t index = 0; index < part_list.size(); index++) {
848 auto part = part_list[index];
852 std::optional<url_pattern_part> previous_part =
853 index == 0 ? std::nullopt : std::optional(part_list[index - 1]);
856 std::optional<url_pattern_part> next_part =
857 index < part_list.size() - 1 ? std::optional(part_list[index + 1])
860 if (part.type == url_pattern_part_type::FIXED_TEXT) {
862 if (part.modifier == url_pattern_part_modifier::none) {
865 result.append(escape_pattern_string(part.value));
872 result.append(escape_pattern_string(part.value));
877 result.append(convert_modifier_to_string(part.modifier));
882 bool custom_name = !unicode::is_ascii_digit(part.name[0]);
888 bool needs_grouping =
889 !part.suffix.empty() ||
890 (!part.prefix.empty() && part.prefix[0] != options.get_prefix()[0]);
900 if (!needs_grouping && custom_name &&
901 part.type == url_pattern_part_type::SEGMENT_WILDCARD &&
902 part.modifier == url_pattern_part_modifier::none &&
903 next_part.has_value() && next_part->prefix.empty() &&
904 next_part->suffix.empty()) {
906 if (next_part->type == url_pattern_part_type::FIXED_TEXT) {
910 if (idna::valid_name_code_point(next_part->value[0],
false)) {
911 needs_grouping =
true;
915 needs_grouping = !next_part->name.empty() &&
916 unicode::is_ascii_digit(next_part->name[0]);
927 if (!needs_grouping && part.prefix.empty() && previous_part.has_value() &&
928 previous_part->type == url_pattern_part_type::FIXED_TEXT &&
929 !options.get_prefix().empty() &&
930 previous_part->value.at(previous_part->value.size() - 1) ==
931 options.get_prefix()[0]) {
932 needs_grouping =
true;
939 if (needs_grouping) {
945 result.append(escape_pattern_string(part.prefix));
956 if (part.type == url_pattern_part_type::REGEXP) {
960 result.append(part.value);
963 }
else if (part.type == url_pattern_part_type::SEGMENT_WILDCARD &&
970 result.append(generate_segment_wildcard_regexp(options));
973 }
else if (part.type == url_pattern_part_type::FULL_WILDCARD) {
983 (!previous_part.has_value() ||
984 previous_part->type == url_pattern_part_type::FIXED_TEXT ||
985 previous_part->modifier != url_pattern_part_modifier::none ||
986 needs_grouping || !part.prefix.empty())) {
1003 if (part.type == url_pattern_part_type::SEGMENT_WILDCARD && custom_name &&
1004 !part.suffix.empty() &&
1005 idna::valid_name_code_point(part.suffix[0],
false)) {
1011 result.append(escape_pattern_string(part.suffix));
1013 if (needs_grouping)
result.append(
"}");
1016 result.append(convert_modifier_to_string(part.modifier));
#define ADA_ASSERT_TRUE(COND)
bool constexpr is_ascii(std::u32string_view view)
template ada::result< url_aggregator > parse< url_aggregator >(std::string_view input, const url_aggregator *base_url)
tl::expected< result_type, ada::errors > result
Declaration for the URLPattern helpers.