Unverified Commit d08ec534 authored by Kenton Varda's avatar Kenton Varda Committed by GitHub

Merge pull request #650 from capnproto/harris/use-path-percent-encode-set-in-url-parser

Implement various WHATWG URL percent encoding functions
parents b6ad7f33 c2ff2eb3
......@@ -198,11 +198,11 @@ KJ_TEST("parse / stringify URL") {
}
{
auto url = parseAndCheck("https://foo:1234@capnproto.org");
auto url = parseAndCheck("https://$foo&:12+,34@capnproto.org");
KJ_EXPECT(url.scheme == "https");
auto& user = KJ_ASSERT_NONNULL(url.userInfo);
KJ_EXPECT(user.username == "foo");
KJ_EXPECT(KJ_ASSERT_NONNULL(user.password) == "1234");
KJ_EXPECT(user.username == "$foo&");
KJ_EXPECT(KJ_ASSERT_NONNULL(user.password) == "12+,34");
KJ_EXPECT(url.host == "capnproto.org");
KJ_EXPECT(url.path == nullptr);
KJ_EXPECT(!url.hasTrailingSlash);
......@@ -221,6 +221,11 @@ KJ_TEST("parse / stringify URL") {
KJ_EXPECT(url.fragment == nullptr);
}
{
auto url = parseAndCheck("https://capnproto.org/foo%2Fbar/baz");
KJ_EXPECT(url.path.asPtr() == kj::ArrayPtr<const StringPtr>({"foo/bar", "baz"}));
}
parseAndCheck("https://capnproto.org/foo/bar?", "https://capnproto.org/foo/bar");
parseAndCheck("https://capnproto.org/foo/bar?#", "https://capnproto.org/foo/bar#");
parseAndCheck("https://capnproto.org/foo/bar#");
......@@ -241,6 +246,15 @@ KJ_TEST("URL percent encoding") {
parseAndCheck(
"https://b b: bcd@capnproto.org/f o?b r=b z#q x",
"https://b%20b:%20bcd@capnproto.org/f%20o?b+r=b+z#q%20x");
parseAndCheck(
"https://capnproto.org/foo?bar=baz#@?#^[\\]{|}",
"https://capnproto.org/foo?bar=baz#@?#^[\\]{|}");
// All permissible non-alphanumeric, non-separator path characters.
parseAndCheck(
"https://capnproto.org/!$&'()*+,-.:;=@[]^_|~",
"https://capnproto.org/!$&'()*+,-.:;=@[]^_|~");
}
KJ_TEST("URL relative paths") {
......@@ -368,7 +382,7 @@ KJ_TEST("parse relative URL") {
"http://capnproto.org/grault");
parseAndCheckRelative("https://capnproto.org/foo/bar?baz=qux#corge",
"/http:/grault",
"https://capnproto.org/http%3A/grault");
"https://capnproto.org/http:/grault");
parseAndCheckRelative("https://capnproto.org/",
"/foo/../bar",
"https://capnproto.org/bar");
......
......@@ -375,10 +375,10 @@ String Url::toString(Context context) const {
if (context == REMOTE_HREF) {
KJ_IF_MAYBE(user, userInfo) {
chars.addAll(encodeUriComponent(user->username));
chars.addAll(encodeUriUserInfo(user->username));
KJ_IF_MAYBE(pass, user->password) {
chars.add(':');
chars.addAll(encodeUriComponent(*pass));
chars.addAll(encodeUriUserInfo(*pass));
}
chars.add('@');
}
......@@ -407,7 +407,7 @@ String Url::toString(Context context) const {
continue;
}
chars.add('/');
chars.addAll(encodeUriComponent(pathPart));
chars.addAll(encodeUriPath(pathPart));
}
if (hasTrailingSlash || (path.size() == 0 && context == HTTP_REQUEST)) {
chars.add('/');
......@@ -427,7 +427,7 @@ String Url::toString(Context context) const {
if (context == REMOTE_HREF) {
KJ_IF_MAYBE(f, fragment) {
chars.add('#');
chars.addAll(encodeUriComponent(*f));
chars.addAll(encodeUriFragment(*f));
}
}
......
......@@ -273,12 +273,18 @@ KJ_TEST("hex encoding/decoding") {
expectRes(decodeHex("1234xbf2"), bytes, true);
}
constexpr char RFC2396_FRAGMENT_SET_DIFF[] = "#$&+,/:;=?@[\\]^{|}";
// These are the characters reserved in RFC 2396, but not in the fragment percent encode set.
KJ_TEST("URI encoding/decoding") {
KJ_EXPECT(encodeUriComponent("foo") == "foo");
KJ_EXPECT(encodeUriComponent("foo bar") == "foo%20bar");
KJ_EXPECT(encodeUriComponent("\xab\xba") == "%AB%BA");
KJ_EXPECT(encodeUriComponent(StringPtr("foo\0bar", 7)) == "foo%00bar");
KJ_EXPECT(encodeUriComponent(RFC2396_FRAGMENT_SET_DIFF) ==
"%23%24%26%2B%2C%2F%3A%3B%3D%3F%40%5B%5C%5D%5E%7B%7C%7D");
// Encode characters reserved by application/x-www-form-urlencoded, but not by RFC 2396.
KJ_EXPECT(encodeUriComponent("'foo'! (~)") == "'foo'!%20(~)");
......@@ -304,6 +310,32 @@ KJ_TEST("URI encoding/decoding") {
}
}
KJ_TEST("URL component encoding") {
// Exercises the three WHATWG-style encoders with the same five inputs each: plain ASCII, a space,
// non-ASCII bytes, an embedded NUL, and the RFC2396_FRAGMENT_SET_DIFF punctuation string.
// Fragment encoder: every character of RFC2396_FRAGMENT_SET_DIFF is expected to pass through
// unescaped.
KJ_EXPECT(encodeUriFragment("foo") == "foo");
KJ_EXPECT(encodeUriFragment("foo bar") == "foo%20bar");
KJ_EXPECT(encodeUriFragment("\xab\xba") == "%AB%BA");
KJ_EXPECT(encodeUriFragment(StringPtr("foo\0bar", 7)) == "foo%00bar");
KJ_EXPECT(encodeUriFragment(RFC2396_FRAGMENT_SET_DIFF) == RFC2396_FRAGMENT_SET_DIFF);
// Path component encoder: of the punctuation string, only '#', '/', '?', '\', '{' and '}' are
// expected to be escaped.
KJ_EXPECT(encodeUriPath("foo") == "foo");
KJ_EXPECT(encodeUriPath("foo bar") == "foo%20bar");
KJ_EXPECT(encodeUriPath("\xab\xba") == "%AB%BA");
KJ_EXPECT(encodeUriPath(StringPtr("foo\0bar", 7)) == "foo%00bar");
KJ_EXPECT(encodeUriPath(RFC2396_FRAGMENT_SET_DIFF) == "%23$&+,%2F:;=%3F@[%5C]^%7B|%7D");
// Userinfo encoder: of the punctuation string, only '$', '&', '+' and ',' are expected to pass
// through unescaped.
KJ_EXPECT(encodeUriUserInfo("foo") == "foo");
KJ_EXPECT(encodeUriUserInfo("foo bar") == "foo%20bar");
KJ_EXPECT(encodeUriUserInfo("\xab\xba") == "%AB%BA");
KJ_EXPECT(encodeUriUserInfo(StringPtr("foo\0bar", 7)) == "foo%00bar");
KJ_EXPECT(encodeUriUserInfo(RFC2396_FRAGMENT_SET_DIFF) ==
"%23$&+,%2F%3A%3B%3D%3F%40%5B%5C%5D%5E%7B%7C%7D");
// NOTE: None of these functions have explicit decode equivalents.
}
KJ_TEST("application/x-www-form-urlencoded encoding/decoding") {
KJ_EXPECT(encodeWwwForm("foo") == "foo");
KJ_EXPECT(encodeWwwForm("foo bar") == "foo+bar");
......
......@@ -390,7 +390,9 @@ EncodingResult<Array<byte>> decodeHex(ArrayPtr<const char> text) {
String encodeUriComponent(ArrayPtr<const byte> bytes) {
Vector<char> result(bytes.size() + 1);
for (byte b: bytes) {
if (('A' <= b && b <= 'Z') || ('a' <= b && b <= 'z') || ('0' <= b && b <= '9') ||
if (('A' <= b && b <= 'Z') ||
('a' <= b && b <= 'z') ||
('0' <= b && b <= '9') ||
b == '-' || b == '_' || b == '.' || b == '!' || b == '~' || b == '*' || b == '\'' ||
b == '(' || b == ')') {
result.add(b);
......@@ -404,10 +406,68 @@ String encodeUriComponent(ArrayPtr<const byte> bytes) {
return String(result.releaseAsArray());
}
String encodeUriFragment(ArrayPtr<const byte> bytes) {
  // Percent-encodes `bytes` for use as a URL fragment and returns the NUL-terminated result.
  //
  // Bytes outside the WHATWG fragment percent-encode set are copied through unchanged; everything
  // else is written as '%' followed by two hex digits taken from HEX_DIGITS_URI.
  //
  // Quirk: '%' itself is always escaped even though it is not a member of the WHATWG set. The
  // input is percent-DECODED data and the output is decoded again with decodeUriComponent(), so
  // a literal '%' left unescaped would be misread as the start of an escape sequence (or change
  // the decoded value) on the way back.

  Vector<char> result(bytes.size() + 1);
  for (byte b: bytes) {
    if (('?' <= b && b <= '_') ||  // covers A-Z
        ('a' <= b && b <= '~') ||  // covers a-z
        ('&' <= b && b <= ';') ||  // covers 0-9; starts at '&' so that '%' is excluded
        b == '!' || b == '=' || b == '#' || b == '$') {
      result.add(b);
    } else {
      result.add('%');
      result.add(HEX_DIGITS_URI[b/16]);
      result.add(HEX_DIGITS_URI[b%16]);
    }
  }
  result.add('\0');
  return String(result.releaseAsArray());
}
String encodeUriPath(ArrayPtr<const byte> bytes) {
  // Percent-encodes a single URL path component (not a whole path) and returns the
  // NUL-terminated result.
  //
  // Bytes outside the WHATWG path percent-encode set are copied through unchanged, except that
  // '/' and '\' are also escaped so that a component containing a separator cannot be mistaken
  // for two components. Everything else is written as '%' followed by two hex digits taken from
  // HEX_DIGITS_URI.
  //
  // Quirk: '%' itself is always escaped even though it is not a member of the WHATWG set. The
  // input is a percent-DECODED component and the output is decoded again with
  // decodeUriComponent(), so a component such as "a%2Fb" must be emitted as "a%252Fb"; emitting
  // it verbatim would make it decode to "a/b".

  Vector<char> result(bytes.size() + 1);
  for (byte b: bytes) {
    if (('@' <= b && b <= '[') ||  // covers A-Z
        ('a' <= b && b <= 'z') ||
        ('0' <= b && b <= ';') ||  // covers 0-9
        ('&' <= b && b <= '.') ||  // starts at '&' so that '%' is excluded
        b == '_' || b == '!' || b == '=' || b == ']' ||
        b == '^' || b == '|' || b == '~' || b == '$') {
      result.add(b);
    } else {
      result.add('%');
      result.add(HEX_DIGITS_URI[b/16]);
      result.add(HEX_DIGITS_URI[b%16]);
    }
  }
  result.add('\0');
  return String(result.releaseAsArray());
}
String encodeUriUserInfo(ArrayPtr<const byte> bytes) {
  // Percent-encodes a URL userinfo component (a username or a password) and returns the
  // NUL-terminated result.
  //
  // Alphanumerics and the punctuation outside the WHATWG userinfo percent-encode set are copied
  // through unchanged; everything else is written as '%' followed by two hex digits taken from
  // HEX_DIGITS_URI.
  //
  // Quirk: '%' itself is always escaped even though it is not a member of the WHATWG set. The
  // input is percent-DECODED data and the output is decoded again with decodeUriComponent(), so
  // a literal '%' left unescaped would corrupt the value on the way back.

  Vector<char> result(bytes.size() + 1);
  for (byte b: bytes) {
    if (('A' <= b && b <= 'Z') ||
        ('a' <= b && b <= 'z') ||
        ('0' <= b && b <= '9') ||
        ('&' <= b && b <= '.') ||  // starts at '&' so that '%' is excluded
        b == '_' || b == '!' || b == '~' || b == '$') {
      result.add(b);
    } else {
      result.add('%');
      result.add(HEX_DIGITS_URI[b/16]);
      result.add(HEX_DIGITS_URI[b%16]);
    }
  }
  result.add('\0');
  return String(result.releaseAsArray());
}
String encodeWwwForm(ArrayPtr<const byte> bytes) {
Vector<char> result(bytes.size() + 1);
for (byte b: bytes) {
if (('A' <= b && b <= 'Z') || ('a' <= b && b <= 'z') || ('0' <= b && b <= '9') ||
if (('A' <= b && b <= 'Z') ||
('a' <= b && b <= 'z') ||
('0' <= b && b <= '9') ||
b == '-' || b == '_' || b == '.' || b == '*') {
result.add(b);
} else if (b == ' ') {
......
......@@ -130,6 +130,33 @@ EncodingResult<String> decodeUriComponent(ArrayPtr<const char> text);
//
// See https://tools.ietf.org/html/rfc2396#section-2.3
String encodeUriFragment(ArrayPtr<const byte> bytes);
String encodeUriFragment(ArrayPtr<const char> bytes);
// Encode URL fragment components using the fragment percent encode set defined by the WHATWG URL
// specification. Use decodeUriComponent() to decode.
//
// NOTE(review): '%' is not a member of this set, so whether a literal '%' in the input survives a
// round trip through decodeUriComponent() depends on the implementation escaping it anyway --
// confirm against the definition.
//
// See https://url.spec.whatwg.org/#fragment-percent-encode-set
String encodeUriPath(ArrayPtr<const byte> bytes);
String encodeUriPath(ArrayPtr<const char> bytes);
// Encode URL path components (not entire paths!) using the path percent encode set defined by the
// WHATWG URL specification. Use decodeUriComponent() to decode.
//
// Quirk: This percent-encodes '/' and '\' characters as well, which are not actually in the set
// defined by the WHATWG URL spec. Since a conforming URL implementation will only ever call this
// function on individual path components, and never entire paths, augmenting the character set to
// include these separators allows this function to be used to implement a URL class that stores
// its path components in either percent-encoded OR percent-decoded form.
//
// NOTE(review): '%' is not a member of this set either; for percent-decoded input to round-trip
// through decodeUriComponent() the implementation would have to escape it too -- confirm against
// the definition.
//
// See https://url.spec.whatwg.org/#path-percent-encode-set
String encodeUriUserInfo(ArrayPtr<const byte> bytes);
String encodeUriUserInfo(ArrayPtr<const char> bytes);
// Encode URL userinfo components using the userinfo percent encode set defined by the WHATWG URL
// specification. Use decodeUriComponent() to decode.
//
// NOTE(review): '%' is not a member of this set, so whether a literal '%' in a username or
// password survives a round trip through decodeUriComponent() depends on the implementation
// escaping it anyway -- confirm against the definition.
//
// See https://url.spec.whatwg.org/#userinfo-percent-encode-set
String encodeWwwForm(ArrayPtr<const byte> bytes);
String encodeWwwForm(ArrayPtr<const char> bytes);
EncodingResult<String> decodeWwwForm(ArrayPtr<const char> text);
......@@ -215,6 +242,16 @@ inline EncodingResult<String> decodeUriComponent(ArrayPtr<const char> text) {
return { String(result.releaseAsChars()), result.hadErrors };
}
inline String encodeUriFragment(ArrayPtr<const char> text) {
  // Character-array convenience overload: view the characters as bytes and defer to the
  // byte-oriented encoder.
  const auto rawBytes = text.asBytes();
  return encodeUriFragment(rawBytes);
}
inline String encodeUriPath(ArrayPtr<const char> text) {
  // Character-array convenience overload: view the characters as bytes and defer to the
  // byte-oriented encoder.
  const auto rawBytes = text.asBytes();
  return encodeUriPath(rawBytes);
}
inline String encodeUriUserInfo(ArrayPtr<const char> text) {
  // Character-array convenience overload: view the characters as bytes and defer to the
  // byte-oriented encoder.
  const auto rawBytes = text.asBytes();
  return encodeUriUserInfo(rawBytes);
}
inline String encodeWwwForm(ArrayPtr<const char> text) {
  // Character-array convenience overload: view the characters as bytes and defer to the
  // byte-oriented encoder.
  const auto rawBytes = text.asBytes();
  return encodeWwwForm(rawBytes);
}
......@@ -278,6 +315,18 @@ inline EncodingResult<String> decodeUriComponent(const char (&text)[s]) {
return decodeUriComponent(arrayPtr(text, s-1));
}
template <size_t s>
inline String encodeUriFragment(const char (&text)[s]) {
  // Character-array overload (e.g. string literals): encode all but the array's final element,
  // which for a string literal is the terminating NUL.
  const auto withoutLast = arrayPtr(text, s - 1);
  return encodeUriFragment(withoutLast);
}
template <size_t s>
inline String encodeUriPath(const char (&text)[s]) {
  // Character-array overload (e.g. string literals): encode all but the array's final element,
  // which for a string literal is the terminating NUL.
  const auto withoutLast = arrayPtr(text, s - 1);
  return encodeUriPath(withoutLast);
}
template <size_t s>
inline String encodeUriUserInfo(const char (&text)[s]) {
  // Character-array overload (e.g. string literals): encode all but the array's final element,
  // which for a string literal is the terminating NUL.
  const auto withoutLast = arrayPtr(text, s - 1);
  return encodeUriUserInfo(withoutLast);
}
template <size_t s>
inline String encodeWwwForm(const char (&text)[s]) {
  // Character-array overload (e.g. string literals): encode all but the array's final element,
  // which for a string literal is the terminating NUL.
  const auto withoutLast = arrayPtr(text, s - 1);
  return encodeWwwForm(withoutLast);
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment