Prefer regex to manual parsing in parseURL:

Although `parseURL` used a regex to pull the authority out of the URL
being parsed, it performed manual parsing of the hostname and port.

This commit rolls the parsing of the username and password, if any,
directly into the regex. The hostname can be a name, an IPv4 or an
IPv6 address.

Fixes #2751
This commit is contained in:
John Freeman
2018-11-07 11:26:48 -08:00
committed by Nik Bougalis
parent c1a02440dc
commit dc4d76f626
3 changed files with 224 additions and 44 deletions

View File

@@ -70,6 +70,8 @@ struct parsedURL
explicit parsedURL() = default;
std::string scheme;
std::string username;
std::string password;
std::string domain;
boost::optional<std::uint16_t> port;
std::string path;

View File

@@ -94,9 +94,19 @@ bool parseUrl (parsedURL& pUrl, std::string const& strUrl)
// scheme://username:password@hostname:port/rest
static boost::regex reUrl (
"(?i)\\`\\s*"
"([[:alpha:]][-+.[:alpha:][:digit:]]*):" //scheme
"//([^/]+)?" // hostname
"(/.*)?" // path and parameters
// required scheme
"([[:alpha:]][-+.[:alpha:][:digit:]]*):"
// We choose to support only URIs whose `hier-part` has the form
// `"//" authority path-abempty`.
"//"
// optional userinfo
"(?:([^/]*?)(?::([^/]*?))?@)?"
// optional host
"([^/]*?)"
// optional port
"(?::([[:digit:]]+))?"
// optional path
"(/.*)?"
"\\s*?\\'");
boost::smatch smMatch;
@@ -106,29 +116,22 @@ bool parseUrl (parsedURL& pUrl, std::string const& strUrl)
{
pUrl.scheme = smMatch[1];
boost::algorithm::to_lower (pUrl.scheme);
pUrl.path = smMatch[3];
pUrl.domain = smMatch[2];
// now consider the domain/port fragment
auto colonPos = pUrl.domain.find_last_of(':');
if (colonPos != std::string::npos)
pUrl.username = smMatch[2];
pUrl.password = smMatch[3];
const std::string domain = smMatch[4];
// We need to use Endpoint to parse the domain to
// strip surrounding brackets from IPv6 addresses,
// e.g. [::1] => ::1.
const auto result {beast::IP::Endpoint::from_string_checked (domain)};
pUrl.domain = result.second
? result.first.address().to_string()
: domain;
const std::string port = smMatch[5];
if (!port.empty())
{
// use Endpoint class to see if this thing looks
// like an IP addr...
auto result {beast::IP::Endpoint::from_string_checked (pUrl.domain)};
if (result.second)
{
pUrl.domain = result.first.address().to_string();
pUrl.port = result.first.port();
}
else // otherwise we are DNS name + port
{
pUrl.port = beast::lexicalCast <std::uint16_t> (
pUrl.domain.substr(colonPos+1));
pUrl.domain = pUrl.domain.substr(0, colonPos);
}
pUrl.port = beast::lexicalCast <std::uint16_t> (port);
}
//else, the whole thing is domain, not port
pUrl.path = smMatch[6];
}
return bMatch;

View File

@@ -63,27 +63,202 @@ public:
{
testcase ("parseUrl");
parsedURL pUrl;
// Expected passes.
{
parsedURL pUrl;
BEAST_EXPECT(parseUrl (pUrl, "scheme://"));
BEAST_EXPECT(pUrl.scheme == "scheme");
BEAST_EXPECT(pUrl.username.empty());
BEAST_EXPECT(pUrl.password.empty());
BEAST_EXPECT(pUrl.domain.empty());
BEAST_EXPECT(! pUrl.port);
// RFC 3986:
// > In general, a URI that uses the generic syntax for authority
// with an empty path should be normalized to a path of "/".
// Do we want to normalize paths?
BEAST_EXPECT(pUrl.path.empty());
}
{
parsedURL pUrl;
BEAST_EXPECT(parseUrl (pUrl, "scheme:///"));
BEAST_EXPECT(pUrl.scheme == "scheme");
BEAST_EXPECT(pUrl.username.empty());
BEAST_EXPECT(pUrl.password.empty());
BEAST_EXPECT(pUrl.domain.empty());
BEAST_EXPECT(! pUrl.port);
BEAST_EXPECT(pUrl.path == "/");
}
{
parsedURL pUrl;
BEAST_EXPECT(parseUrl (pUrl, "lower://domain"));
BEAST_EXPECT(pUrl.scheme == "lower");
BEAST_EXPECT(pUrl.username.empty());
BEAST_EXPECT(pUrl.password.empty());
BEAST_EXPECT(pUrl.domain == "domain");
BEAST_EXPECT(! pUrl.port);
BEAST_EXPECT(pUrl.path.empty());
}
{
parsedURL pUrl;
BEAST_EXPECT(parseUrl (pUrl, "UPPER://domain:234/"));
BEAST_EXPECT(pUrl.scheme == "upper");
BEAST_EXPECT(pUrl.username.empty());
BEAST_EXPECT(pUrl.password.empty());
BEAST_EXPECT(pUrl.domain == "domain");
BEAST_EXPECT(*pUrl.port == 234);
BEAST_EXPECT(pUrl.path == "/");
}
{
parsedURL pUrl;
BEAST_EXPECT(parseUrl (pUrl, "Mixed://domain/path"));
BEAST_EXPECT(pUrl.scheme == "mixed");
BEAST_EXPECT(pUrl.username.empty());
BEAST_EXPECT(pUrl.password.empty());
BEAST_EXPECT(pUrl.domain == "domain");
BEAST_EXPECT(! pUrl.port);
BEAST_EXPECT(pUrl.path == "/path");
}
{
parsedURL pUrl;
BEAST_EXPECT(parseUrl (pUrl, "scheme://[::1]:123/path"));
BEAST_EXPECT(pUrl.scheme == "scheme");
BEAST_EXPECT(pUrl.username.empty());
BEAST_EXPECT(pUrl.password.empty());
BEAST_EXPECT(pUrl.domain == "::1");
BEAST_EXPECT(*pUrl.port == 123);
BEAST_EXPECT(pUrl.path == "/path");
}
{
parsedURL pUrl;
BEAST_EXPECT(parseUrl (pUrl, "scheme://user:pass@domain:123/abc:321"));
BEAST_EXPECT(pUrl.scheme == "scheme");
BEAST_EXPECT(pUrl.username == "user");
BEAST_EXPECT(pUrl.password == "pass");
BEAST_EXPECT(pUrl.domain == "domain");
BEAST_EXPECT(*pUrl.port == 123);
BEAST_EXPECT(pUrl.path == "/abc:321");
}
{
parsedURL pUrl;
BEAST_EXPECT(parseUrl (pUrl, "scheme://user@domain:123/abc:321"));
BEAST_EXPECT(pUrl.scheme == "scheme");
BEAST_EXPECT(pUrl.username == "user");
BEAST_EXPECT(pUrl.password.empty());
BEAST_EXPECT(pUrl.domain == "domain");
BEAST_EXPECT(*pUrl.port == 123);
BEAST_EXPECT(pUrl.path == "/abc:321");
}
{
parsedURL pUrl;
BEAST_EXPECT(parseUrl (pUrl, "scheme://:pass@domain:123/abc:321"));
BEAST_EXPECT(pUrl.scheme == "scheme");
BEAST_EXPECT(pUrl.username.empty());
BEAST_EXPECT(pUrl.password == "pass");
BEAST_EXPECT(pUrl.domain == "domain");
BEAST_EXPECT(*pUrl.port == 123);
BEAST_EXPECT(pUrl.path == "/abc:321");
}
{
parsedURL pUrl;
BEAST_EXPECT(parseUrl (pUrl, "scheme://domain:123/abc:321"));
BEAST_EXPECT(pUrl.scheme == "scheme");
BEAST_EXPECT(pUrl.username.empty());
BEAST_EXPECT(pUrl.password.empty());
BEAST_EXPECT(pUrl.domain == "domain");
BEAST_EXPECT(*pUrl.port == 123);
BEAST_EXPECT(pUrl.path == "/abc:321");
}
{
parsedURL pUrl;
BEAST_EXPECT(parseUrl (pUrl, "scheme://user:pass@domain/abc:321"));
BEAST_EXPECT(pUrl.scheme == "scheme");
BEAST_EXPECT(pUrl.username == "user");
BEAST_EXPECT(pUrl.password == "pass");
BEAST_EXPECT(pUrl.domain == "domain");
BEAST_EXPECT(! pUrl.port);
BEAST_EXPECT(pUrl.path == "/abc:321");
}
{
parsedURL pUrl;
BEAST_EXPECT(parseUrl (pUrl, "scheme://user@domain/abc:321"));
BEAST_EXPECT(pUrl.scheme == "scheme");
BEAST_EXPECT(pUrl.username == "user");
BEAST_EXPECT(pUrl.password.empty());
BEAST_EXPECT(pUrl.domain == "domain");
BEAST_EXPECT(! pUrl.port);
BEAST_EXPECT(pUrl.path == "/abc:321");
}
{
parsedURL pUrl;
BEAST_EXPECT(parseUrl (pUrl, "scheme://:pass@domain/abc:321"));
BEAST_EXPECT(pUrl.scheme == "scheme");
BEAST_EXPECT(pUrl.username.empty());
BEAST_EXPECT(pUrl.password == "pass");
BEAST_EXPECT(pUrl.domain == "domain");
BEAST_EXPECT(! pUrl.port);
BEAST_EXPECT(pUrl.path == "/abc:321");
}
{
parsedURL pUrl;
BEAST_EXPECT(parseUrl (pUrl, "scheme://domain/abc:321"));
BEAST_EXPECT(pUrl.scheme == "scheme");
BEAST_EXPECT(pUrl.username.empty());
BEAST_EXPECT(pUrl.password.empty());
BEAST_EXPECT(pUrl.domain == "domain");
BEAST_EXPECT(! pUrl.port);
BEAST_EXPECT(pUrl.path == "/abc:321");
}
{
parsedURL pUrl;
BEAST_EXPECT(parseUrl (pUrl, "scheme:///path/to/file"));
BEAST_EXPECT(pUrl.scheme == "scheme");
BEAST_EXPECT(pUrl.username.empty());
BEAST_EXPECT(pUrl.password.empty());
BEAST_EXPECT(pUrl.domain.empty());
BEAST_EXPECT(! pUrl.port);
BEAST_EXPECT(pUrl.path == "/path/to/file");
}
{
parsedURL pUrl;
BEAST_EXPECT(parseUrl (
pUrl, "scheme://user:pass@domain/path/with/an@sign"));
BEAST_EXPECT(pUrl.scheme == "scheme");
BEAST_EXPECT(pUrl.username == "user");
BEAST_EXPECT(pUrl.password == "pass");
BEAST_EXPECT(pUrl.domain == "domain");
BEAST_EXPECT(! pUrl.port);
BEAST_EXPECT(pUrl.path == "/path/with/an@sign");
}
{
parsedURL pUrl;
BEAST_EXPECT(parseUrl (
pUrl, "scheme://domain/path/with/an@sign"));
BEAST_EXPECT(pUrl.scheme == "scheme");
BEAST_EXPECT(pUrl.username.empty());
BEAST_EXPECT(pUrl.password.empty());
BEAST_EXPECT(pUrl.domain == "domain");
BEAST_EXPECT(! pUrl.port);
BEAST_EXPECT(pUrl.path == "/path/with/an@sign");
}
{
parsedURL pUrl;
BEAST_EXPECT(parseUrl (pUrl, "scheme://:999/"));
BEAST_EXPECT(pUrl.scheme == "scheme");
BEAST_EXPECT(pUrl.username.empty());
BEAST_EXPECT(pUrl.password.empty());
BEAST_EXPECT(pUrl.domain.empty());
BEAST_EXPECT(*pUrl.port == 999);
BEAST_EXPECT(pUrl.path == "/");
}
BEAST_EXPECT(parseUrl (pUrl, "lower://domain"));
BEAST_EXPECT(pUrl.scheme == "lower");
BEAST_EXPECT(pUrl.domain == "domain");
BEAST_EXPECT(! pUrl.port);
BEAST_EXPECT(pUrl.path == "");
BEAST_EXPECT(parseUrl (pUrl, "UPPER://domain:234/"));
BEAST_EXPECT(pUrl.scheme == "upper");
BEAST_EXPECT(*pUrl.port == 234);
BEAST_EXPECT(pUrl.path == "/");
BEAST_EXPECT(parseUrl (pUrl, "Mixed://domain/path"));
BEAST_EXPECT(pUrl.scheme == "mixed");
BEAST_EXPECT(pUrl.path == "/path");
BEAST_EXPECT(parseUrl (pUrl, "scheme://[::1]:123/path"));
BEAST_EXPECT(*pUrl.port == 123);
BEAST_EXPECT(pUrl.domain == "::1");
BEAST_EXPECT(parseUrl(pUrl, "nodomain:///path/path/path"));
BEAST_EXPECT(pUrl.scheme == "nodomain");
BEAST_EXPECT(pUrl.domain.empty());
BEAST_EXPECT(pUrl.path == "/path/path/path");
// Expected fails.
{
parsedURL pUrl;
BEAST_EXPECT(! parseUrl (pUrl, ""));
BEAST_EXPECT(! parseUrl (pUrl, "nonsense"));
BEAST_EXPECT(! parseUrl (pUrl, "://"));
BEAST_EXPECT(! parseUrl (pUrl, ":///"));
}
}
void testToString ()