Commit 084f5526 authored by Harris Hancock

Implement URL fragment, path, and userinfo component encode functions

According to the WHATWG URL spec, each component of a URL gets its own percent-encode set -- something we've been getting wrong this whole time.

In terms of reserved characters, the fragment set is a subset of the path set, which is a subset of the userinfo set, which is a subset of RFC 2396's reserved set.
parent b6ad7f33
...@@ -273,12 +273,18 @@ KJ_TEST("hex encoding/decoding") { ...@@ -273,12 +273,18 @@ KJ_TEST("hex encoding/decoding") {
expectRes(decodeHex("1234xbf2"), bytes, true); expectRes(decodeHex("1234xbf2"), bytes, true);
} }
constexpr char RFC2396_FRAGMENT_SET_DIFF[] = "#$&+,/:;=?@[\\]^{|}";
// These are the characters reserved in RFC 2396, but not in the fragment percent-encode set. The
// tests below expect encodeUriComponent() to escape every one of them and encodeUriFragment() to
// pass every one of them through unchanged.
KJ_TEST("URI encoding/decoding") { KJ_TEST("URI encoding/decoding") {
KJ_EXPECT(encodeUriComponent("foo") == "foo"); KJ_EXPECT(encodeUriComponent("foo") == "foo");
KJ_EXPECT(encodeUriComponent("foo bar") == "foo%20bar"); KJ_EXPECT(encodeUriComponent("foo bar") == "foo%20bar");
KJ_EXPECT(encodeUriComponent("\xab\xba") == "%AB%BA"); KJ_EXPECT(encodeUriComponent("\xab\xba") == "%AB%BA");
KJ_EXPECT(encodeUriComponent(StringPtr("foo\0bar", 7)) == "foo%00bar"); KJ_EXPECT(encodeUriComponent(StringPtr("foo\0bar", 7)) == "foo%00bar");
KJ_EXPECT(encodeUriComponent(RFC2396_FRAGMENT_SET_DIFF) ==
"%23%24%26%2B%2C%2F%3A%3B%3D%3F%40%5B%5C%5D%5E%7B%7C%7D");
// Encode characters reserved by application/x-www-form-urlencoded, but not by RFC 2396. // Encode characters reserved by application/x-www-form-urlencoded, but not by RFC 2396.
KJ_EXPECT(encodeUriComponent("'foo'! (~)") == "'foo'!%20(~)"); KJ_EXPECT(encodeUriComponent("'foo'! (~)") == "'foo'!%20(~)");
...@@ -304,6 +310,32 @@ KJ_TEST("URI encoding/decoding") { ...@@ -304,6 +310,32 @@ KJ_TEST("URI encoding/decoding") {
} }
} }
KJ_TEST("URL component encoding") {
// Runs the fragment, path and userinfo encoders over the same five inputs: plain ASCII, a string
// containing a space, two bytes above 0x7F, a string with an embedded NUL, and
// RFC2396_FRAGMENT_SET_DIFF. The first four are expected to encode identically in all three; only
// the last input distinguishes the encoders from one another.

// Fragment: every character of RFC2396_FRAGMENT_SET_DIFF is expected to pass through unescaped.
KJ_EXPECT(encodeUriFragment("foo") == "foo");
KJ_EXPECT(encodeUriFragment("foo bar") == "foo%20bar");
KJ_EXPECT(encodeUriFragment("\xab\xba") == "%AB%BA");
KJ_EXPECT(encodeUriFragment(StringPtr("foo\0bar", 7)) == "foo%00bar");
KJ_EXPECT(encodeUriFragment(RFC2396_FRAGMENT_SET_DIFF) == RFC2396_FRAGMENT_SET_DIFF);
// Path component: of the DIFF characters, '#', '/', '?', backslash, '{' and '}' are expected to be
// escaped; the rest pass through.
KJ_EXPECT(encodeUriPath("foo") == "foo");
KJ_EXPECT(encodeUriPath("foo bar") == "foo%20bar");
KJ_EXPECT(encodeUriPath("\xab\xba") == "%AB%BA");
KJ_EXPECT(encodeUriPath(StringPtr("foo\0bar", 7)) == "foo%00bar");
KJ_EXPECT(encodeUriPath(RFC2396_FRAGMENT_SET_DIFF) == "%23$&+,%2F:;=%3F@[%5C]^%7B|%7D");
// Userinfo: of the DIFF characters, only '$', '&', '+' and ',' are expected to survive unescaped.
KJ_EXPECT(encodeUriUserInfo("foo") == "foo");
KJ_EXPECT(encodeUriUserInfo("foo bar") == "foo%20bar");
KJ_EXPECT(encodeUriUserInfo("\xab\xba") == "%AB%BA");
KJ_EXPECT(encodeUriUserInfo(StringPtr("foo\0bar", 7)) == "foo%00bar");
KJ_EXPECT(encodeUriUserInfo(RFC2396_FRAGMENT_SET_DIFF) ==
"%23$&+,%2F%3A%3B%3D%3F%40%5B%5C%5D%5E%7B%7C%7D");
// NOTE: None of these functions have explicit decode equivalents.
}
KJ_TEST("application/x-www-form-urlencoded encoding/decoding") { KJ_TEST("application/x-www-form-urlencoded encoding/decoding") {
KJ_EXPECT(encodeWwwForm("foo") == "foo"); KJ_EXPECT(encodeWwwForm("foo") == "foo");
KJ_EXPECT(encodeWwwForm("foo bar") == "foo+bar"); KJ_EXPECT(encodeWwwForm("foo bar") == "foo+bar");
......
...@@ -390,7 +390,9 @@ EncodingResult<Array<byte>> decodeHex(ArrayPtr<const char> text) { ...@@ -390,7 +390,9 @@ EncodingResult<Array<byte>> decodeHex(ArrayPtr<const char> text) {
String encodeUriComponent(ArrayPtr<const byte> bytes) { String encodeUriComponent(ArrayPtr<const byte> bytes) {
Vector<char> result(bytes.size() + 1); Vector<char> result(bytes.size() + 1);
for (byte b: bytes) { for (byte b: bytes) {
if (('A' <= b && b <= 'Z') || ('a' <= b && b <= 'z') || ('0' <= b && b <= '9') || if (('A' <= b && b <= 'Z') ||
('a' <= b && b <= 'z') ||
('0' <= b && b <= '9') ||
b == '-' || b == '_' || b == '.' || b == '!' || b == '~' || b == '*' || b == '\'' || b == '-' || b == '_' || b == '.' || b == '!' || b == '~' || b == '*' || b == '\'' ||
b == '(' || b == ')') { b == '(' || b == ')') {
result.add(b); result.add(b);
...@@ -404,10 +406,68 @@ String encodeUriComponent(ArrayPtr<const byte> bytes) { ...@@ -404,10 +406,68 @@ String encodeUriComponent(ArrayPtr<const byte> bytes) {
return String(result.releaseAsArray()); return String(result.releaseAsArray());
} }
String encodeUriFragment(ArrayPtr<const byte> bytes) {
  // Percent-encodes `bytes` for use as a URL fragment and returns the NUL-terminated result.
  //
  // Bytes that pass through untouched fall into three contiguous ASCII runs plus '!' and '='.
  // Everything else -- bytes at or below 0x20, '"', '<', '>', '`', and bytes above '~' -- is
  // written as '%' followed by two digits taken from HEX_DIGITS_URI.
  Vector<char> encoded(bytes.size() + 1);

  for (byte octet: bytes) {
    const bool inUpperRun = octet >= '?' && octet <= '_';  // '?', '@', A-Z, '[' through '_'
    const bool inLowerRun = octet >= 'a' && octet <= '~';  // a-z, '{', '|', '}', '~'
    const bool inDigitRun = octet >= '#' && octet <= ';';  // '#' through '/', 0-9, ':', ';'
    const bool passThrough =
        inUpperRun || inLowerRun || inDigitRun || octet == '!' || octet == '=';

    if (!passThrough) {
      encoded.add('%');
      encoded.add(HEX_DIGITS_URI[octet / 16]);
      encoded.add(HEX_DIGITS_URI[octet % 16]);
      continue;
    }

    encoded.add(octet);
  }

  encoded.add('\0');
  return String(encoded.releaseAsArray());
}
String encodeUriPath(ArrayPtr<const byte> bytes) {
  // Percent-encodes `bytes` for use as a single URL path component and returns the
  // NUL-terminated result.
  //
  // Pass-through bytes are a handful of individual punctuation characters plus four contiguous
  // ASCII runs. Anything else -- including '/', backslash, '#', '?', '{' and '}' -- is written as
  // '%' followed by two digits taken from HEX_DIGITS_URI.
  Vector<char> encoded(bytes.size() + 1);

  for (byte octet: bytes) {
    bool passThrough;
    switch (octet) {
      case '_':
      case '!':
      case '=':
      case ']':
      case '^':
      case '|':
      case '~':
        // Isolated characters that do not sit inside any of the runs below.
        passThrough = true;
        break;
      default:
        passThrough = (octet >= '@' && octet <= '[') ||  // '@', A-Z, '['
                      (octet >= 'a' && octet <= 'z') ||
                      (octet >= '0' && octet <= ';') ||  // 0-9, ':', ';'
                      (octet >= '$' && octet <= '.');    // '$' through '.'
        break;
    }

    if (passThrough) {
      encoded.add(octet);
    } else {
      encoded.add('%');
      encoded.add(HEX_DIGITS_URI[octet / 16]);
      encoded.add(HEX_DIGITS_URI[octet % 16]);
    }
  }

  encoded.add('\0');
  return String(encoded.releaseAsArray());
}
String encodeUriUserInfo(ArrayPtr<const byte> bytes) {
  // Percent-encodes `bytes` for use in the userinfo part of a URL and returns the
  // NUL-terminated result.
  //
  // Only ASCII letters, digits, the run '$' through '.', and the characters '_', '!' and '~' are
  // copied verbatim. Every other byte is written as '%' followed by two digits taken from
  // HEX_DIGITS_URI.
  Vector<char> encoded(bytes.size() + 1);

  for (byte octet: bytes) {
    bool passThrough;
    switch (octet) {
      case '_':
      case '!':
      case '~':
        passThrough = true;
        break;
      default:
        passThrough = (octet >= 'A' && octet <= 'Z') ||
                      (octet >= 'a' && octet <= 'z') ||
                      (octet >= '0' && octet <= '9') ||
                      (octet >= '$' && octet <= '.');  // '$' through '.'
        break;
    }

    if (passThrough) {
      encoded.add(octet);
    } else {
      encoded.add('%');
      encoded.add(HEX_DIGITS_URI[octet / 16]);
      encoded.add(HEX_DIGITS_URI[octet % 16]);
    }
  }

  encoded.add('\0');
  return String(encoded.releaseAsArray());
}
String encodeWwwForm(ArrayPtr<const byte> bytes) { String encodeWwwForm(ArrayPtr<const byte> bytes) {
Vector<char> result(bytes.size() + 1); Vector<char> result(bytes.size() + 1);
for (byte b: bytes) { for (byte b: bytes) {
if (('A' <= b && b <= 'Z') || ('a' <= b && b <= 'z') || ('0' <= b && b <= '9') || if (('A' <= b && b <= 'Z') ||
('a' <= b && b <= 'z') ||
('0' <= b && b <= '9') ||
b == '-' || b == '_' || b == '.' || b == '*') { b == '-' || b == '_' || b == '.' || b == '*') {
result.add(b); result.add(b);
} else if (b == ' ') { } else if (b == ' ') {
......
...@@ -130,6 +130,33 @@ EncodingResult<String> decodeUriComponent(ArrayPtr<const char> text); ...@@ -130,6 +130,33 @@ EncodingResult<String> decodeUriComponent(ArrayPtr<const char> text);
// //
// See https://tools.ietf.org/html/rfc2396#section-2.3 // See https://tools.ietf.org/html/rfc2396#section-2.3
String encodeUriFragment(ArrayPtr<const byte> bytes);
String encodeUriFragment(ArrayPtr<const char> bytes);
// Encode URL fragment components using the fragment percent encode set defined by the WHATWG URL
// specification. Use decodeUriComponent() to decode.
//
// Bytes outside the encode set are copied through unchanged; every other byte is emitted as '%'
// followed by two hex digits (e.g. a space becomes "%20"). The char overload forwards to the byte
// overload.
//
// See https://url.spec.whatwg.org/#fragment-percent-encode-set
String encodeUriPath(ArrayPtr<const byte> bytes);
String encodeUriPath(ArrayPtr<const char> bytes);
// Encode URL path components (not entire paths!) using the path percent encode set defined by the
// WHATWG URL specification. Use decodeUriComponent() to decode.
//
// Bytes outside the encode set are copied through unchanged; every other byte is emitted as '%'
// followed by two hex digits (e.g. a space becomes "%20"). The char overload forwards to the byte
// overload.
//
// Quirk: This percent-encodes '/' and '\' characters as well, which are not actually in the set
// defined by the WHATWG URL spec. Since a conforming URL implementation will only ever call this
// function on individual path components, and never entire paths, augmenting the character set to
// include these separators allows this function to be used to implement a URL class that stores
// its path components in either percent-encoded OR percent-decoded form.
//
// See https://url.spec.whatwg.org/#path-percent-encode-set
String encodeUriUserInfo(ArrayPtr<const byte> bytes);
String encodeUriUserInfo(ArrayPtr<const char> bytes);
// Encode URL userinfo components using the userinfo percent encode set defined by the WHATWG URL
// specification. Use decodeUriComponent() to decode.
//
// Bytes outside the encode set are copied through unchanged; every other byte is emitted as '%'
// followed by two hex digits (e.g. a space becomes "%20"). The char overload forwards to the byte
// overload.
//
// See https://url.spec.whatwg.org/#userinfo-percent-encode-set
String encodeWwwForm(ArrayPtr<const byte> bytes); String encodeWwwForm(ArrayPtr<const byte> bytes);
String encodeWwwForm(ArrayPtr<const char> bytes); String encodeWwwForm(ArrayPtr<const char> bytes);
EncodingResult<String> decodeWwwForm(ArrayPtr<const char> text); EncodingResult<String> decodeWwwForm(ArrayPtr<const char> text);
...@@ -215,6 +242,16 @@ inline EncodingResult<String> decodeUriComponent(ArrayPtr<const char> text) { ...@@ -215,6 +242,16 @@ inline EncodingResult<String> decodeUriComponent(ArrayPtr<const char> text) {
return { String(result.releaseAsChars()), result.hadErrors }; return { String(result.releaseAsChars()), result.hadErrors };
} }
inline String encodeUriFragment(ArrayPtr<const char> text) {
  // View the characters as bytes and hand off to the byte-oriented overload.
  auto rawBytes = text.asBytes();
  return encodeUriFragment(rawBytes);
}
inline String encodeUriPath(ArrayPtr<const char> text) {
  // View the characters as bytes and hand off to the byte-oriented overload.
  auto rawBytes = text.asBytes();
  return encodeUriPath(rawBytes);
}
inline String encodeUriUserInfo(ArrayPtr<const char> text) {
  // View the characters as bytes and hand off to the byte-oriented overload.
  auto rawBytes = text.asBytes();
  return encodeUriUserInfo(rawBytes);
}
inline String encodeWwwForm(ArrayPtr<const char> text) { inline String encodeWwwForm(ArrayPtr<const char> text) {
return encodeWwwForm(text.asBytes()); return encodeWwwForm(text.asBytes());
} }
...@@ -278,6 +315,18 @@ inline EncodingResult<String> decodeUriComponent(const char (&text)[s]) { ...@@ -278,6 +315,18 @@ inline EncodingResult<String> decodeUriComponent(const char (&text)[s]) {
return decodeUriComponent(arrayPtr(text, s-1)); return decodeUriComponent(arrayPtr(text, s-1));
} }
template <size_t s>
inline String encodeUriFragment(const char (&text)[s]) {
  // Pass along all but the array's final element (presumably a string literal's NUL terminator).
  const size_t length = s - 1;
  return encodeUriFragment(arrayPtr(text, length));
}
template <size_t s>
inline String encodeUriPath(const char (&text)[s]) {
  // Pass along all but the array's final element (presumably a string literal's NUL terminator).
  const size_t length = s - 1;
  return encodeUriPath(arrayPtr(text, length));
}
template <size_t s>
inline String encodeUriUserInfo(const char (&text)[s]) {
  // Pass along all but the array's final element (presumably a string literal's NUL terminator).
  const size_t length = s - 1;
  return encodeUriUserInfo(arrayPtr(text, length));
}
template <size_t s>
inline String encodeWwwForm(const char (&text)[s]) { inline String encodeWwwForm(const char (&text)[s]) {
return encodeWwwForm(arrayPtr(text, s - 1)); return encodeWwwForm(arrayPtr(text, s - 1));
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment