diff --git a/c++/src/kj/compat/url-test.c++ b/c++/src/kj/compat/url-test.c++ index 62f459a4ebc4d808a25cd4c6b1174c4b181c8f7d..d5a2d05627b5e764fbe1ea4ea9649e5e462d3c98 100644 --- a/c++/src/kj/compat/url-test.c++ +++ b/c++/src/kj/compat/url-test.c++ @@ -400,6 +400,58 @@ KJ_TEST("URL for HTTP request") { KJ_EXPECT(url.query[1].name == "corge"); KJ_EXPECT(url.query[1].value == nullptr); } + + { + // '#' is allowed in path components in the HTTP_REQUEST context. + Url url = Url::parse("/foo#bar", Url::HTTP_REQUEST); + KJ_EXPECT(url.toString(Url::HTTP_REQUEST) == "/foo%23bar"); + KJ_EXPECT(url.scheme == nullptr); + KJ_EXPECT(url.host == nullptr); + KJ_EXPECT(url.path.asPtr() == kj::ArrayPtr<const StringPtr>{"foo#bar"}); + KJ_EXPECT(!url.hasTrailingSlash); + KJ_EXPECT(url.query == nullptr); + KJ_EXPECT(url.fragment == nullptr); + } + + { + // '#' is allowed in path components in the HTTP_PROXY_REQUEST context. + Url url = Url::parse("https://capnproto.org/foo#bar", Url::HTTP_PROXY_REQUEST); + KJ_EXPECT(url.toString(Url::HTTP_PROXY_REQUEST) == "https://capnproto.org/foo%23bar"); + KJ_EXPECT(url.scheme == "https"); + KJ_EXPECT(url.host == "capnproto.org"); + KJ_EXPECT(url.path.asPtr() == kj::ArrayPtr<const StringPtr>{"foo#bar"}); + KJ_EXPECT(!url.hasTrailingSlash); + KJ_EXPECT(url.query == nullptr); + KJ_EXPECT(url.fragment == nullptr); + } + + { + // '#' is allowed in query components in the HTTP_REQUEST context. + Url url = Url::parse("/?foo=bar#123", Url::HTTP_REQUEST); + KJ_EXPECT(url.toString(Url::HTTP_REQUEST) == "/?foo=bar%23123"); + KJ_EXPECT(url.scheme == nullptr); + KJ_EXPECT(url.host == nullptr); + KJ_EXPECT(url.path == nullptr); + KJ_EXPECT(url.hasTrailingSlash); + KJ_ASSERT(url.query.size() == 1); + KJ_EXPECT(url.query[0].name == "foo"); + KJ_EXPECT(url.query[0].value == "bar#123"); + KJ_EXPECT(url.fragment == nullptr); + } + + { + // '#' is allowed in query components in the HTTP_PROXY_REQUEST context. 
+ Url url = Url::parse("https://capnproto.org/?foo=bar#123", Url::HTTP_PROXY_REQUEST); + KJ_EXPECT(url.toString(Url::HTTP_PROXY_REQUEST) == "https://capnproto.org/?foo=bar%23123"); + KJ_EXPECT(url.scheme == "https"); + KJ_EXPECT(url.host == "capnproto.org"); + KJ_EXPECT(url.path == nullptr); + KJ_EXPECT(url.hasTrailingSlash); + KJ_ASSERT(url.query.size() == 1); + KJ_EXPECT(url.query[0].name == "foo"); + KJ_EXPECT(url.query[0].value == "bar#123"); + KJ_EXPECT(url.fragment == nullptr); + } } KJ_TEST("parse URL failure") { @@ -413,9 +465,8 @@ KJ_TEST("parse URL failure") { // components not valid in context KJ_EXPECT(Url::tryParse("https://capnproto.org/foo", Url::HTTP_REQUEST) == nullptr); - KJ_EXPECT(Url::tryParse("/foo#bar", Url::HTTP_REQUEST) == nullptr); KJ_EXPECT(Url::tryParse("https://bob:123@capnproto.org/foo", Url::HTTP_PROXY_REQUEST) == nullptr); - KJ_EXPECT(Url::tryParse("https://capnproto.org/foo#bar", Url::HTTP_PROXY_REQUEST) == nullptr); + KJ_EXPECT(Url::tryParse("https://capnproto.org#/foo", Url::HTTP_PROXY_REQUEST) == nullptr); } void parseAndCheckRelative(kj::StringPtr base, kj::StringPtr relative, kj::StringPtr expected, diff --git a/c++/src/kj/compat/url.c++ b/c++/src/kj/compat/url.c++ index baf07d18d729dce09ac0cbc1f733a32dec849bb0..cfbb669e06c3240a981a2c1f8253bb692ecbd73c 100644 --- a/c++/src/kj/compat/url.c++ +++ b/c++/src/kj/compat/url.c++ @@ -31,9 +31,40 @@ namespace { constexpr auto ALPHAS = parse::charRange('a', 'z').orRange('A', 'Z'); constexpr auto DIGITS = parse::charRange('0', '9'); + constexpr auto END_AUTHORITY = parse::anyOfChars("/?#"); -constexpr auto END_PATH_PART = parse::anyOfChars("/?#"); -constexpr auto END_QUERY_PART = parse::anyOfChars("&#"); + +// Authority, path, and query components can typically be terminated by the start of a fragment. +// However, fragments are disallowed in HTTP_REQUEST and HTTP_PROXY_REQUEST contexts. 
As a quirk, we +// allow the fragment start character ('#') to live unescaped in path and query components. We do +// not currently allow it in the authority component, because our parser would reject it as a host +// character anyway. + +const parse::CharGroup_& getEndPathPart(Url::Context context) { // Picks the path-component terminator set for `context`. + static constexpr auto END_PATH_PART_HREF = parse::anyOfChars("/?#"); // '#' included: it starts a fragment. + static constexpr auto END_PATH_PART_REQUEST = parse::anyOfChars("/?"); // '#' omitted, so it is not treated as a terminator. + + switch (context) { + case Url::REMOTE_HREF: return END_PATH_PART_HREF; + case Url::HTTP_PROXY_REQUEST: return END_PATH_PART_REQUEST; + case Url::HTTP_REQUEST: return END_PATH_PART_REQUEST; + } + + KJ_UNREACHABLE; // Only reached if `context` matches none of the cases above. +} + +const parse::CharGroup_& getEndQueryPart(Url::Context context) { // Picks the query-component terminator set for `context`. + static constexpr auto END_QUERY_PART_HREF = parse::anyOfChars("&#"); // '#' included: it starts a fragment. + static constexpr auto END_QUERY_PART_REQUEST = parse::anyOfChars("&"); // '#' omitted, so it is not treated as a terminator. + + switch (context) { + case Url::REMOTE_HREF: return END_QUERY_PART_HREF; + case Url::HTTP_PROXY_REQUEST: return END_QUERY_PART_REQUEST; + case Url::HTTP_REQUEST: return END_QUERY_PART_REQUEST; + } + + KJ_UNREACHABLE; // Only reached if `context` matches none of the cases above. +} constexpr auto SCHEME_CHARS = ALPHAS.orGroup(DIGITS).orAny("+-."); constexpr auto NOT_SCHEME_CHARS = SCHEME_CHARS.invert(); @@ -139,6 +170,9 @@ Maybe<Url> Url::tryParse(StringPtr text, Context context, Options options) { result.options = options; bool err = false; // tracks percent-decoding errors + auto& END_PATH_PART = getEndPathPart(context); + auto& END_QUERY_PART = getEndQueryPart(context); + if (context == HTTP_REQUEST) { if (!text.startsWith("/")) { return nullptr; @@ -251,6 +285,9 @@ Maybe<Url> Url::tryParseRelative(StringPtr text) const { result.options = options; bool err = false; // tracks percent-decoding errors + auto& END_PATH_PART = getEndPathPart(Url::REMOTE_HREF); + auto& END_QUERY_PART = getEndQueryPart(Url::REMOTE_HREF); + // scheme { bool gotScheme = false;