reimu/htmlcxx/html/Uri.cc
2018-07-10 13:54:56 +03:00

574 lines
15 KiB
C++

#include "Uri.h"
#include "wincstring.h"
#include <strstream>
#include <cstdlib>
#include <cassert>
#include "tld.h"
//#define DEBUG
#include "debug.h"
using namespace std;
using namespace htmlcxx;
/**
 * Structure to store various schemes and their default ports.
 * One row of the scheme lookup table defined below.
 */
struct schemes_t {
/** The name of the scheme */
const char *name;
/** The default port for the scheme */
unsigned int default_port;
};
/* Some WWW schemes and their default ports; this is basically /etc/services */
/* This will become global when the protocol abstraction comes */
/* As the schemes are searched by a linear search, */
/* they are sorted by their expected frequency */
/* The last entry has a NULL name and terminates the table. The lookup loops */
/* in this file stop at it and return 0, so its 0xFFFF value is not returned */
/* by them. */
static schemes_t schemes[] =
{
{"http", Uri::URI_HTTP_DEFAULT_PORT},
{"ftp", Uri::URI_FTP_DEFAULT_PORT},
{"https", Uri::URI_HTTPS_DEFAULT_PORT},
{"gopher", Uri::URI_GOPHER_DEFAULT_PORT},
{"ldap", Uri::URI_LDAP_DEFAULT_PORT},
{"nntp", Uri::URI_NNTP_DEFAULT_PORT},
{"snews", Uri::URI_SNEWS_DEFAULT_PORT},
{"imap", Uri::URI_IMAP_DEFAULT_PORT},
{"pop", Uri::URI_POP_DEFAULT_PORT},
{"sip", Uri::URI_SIP_DEFAULT_PORT},
{"rtsp", Uri::URI_RTSP_DEFAULT_PORT},
{"wais", Uri::URI_WAIS_DEFAULT_PORT},
/* the two z39.50 schemes share the WAIS default port */
{"z39.50r", Uri::URI_WAIS_DEFAULT_PORT},
{"z39.50s", Uri::URI_WAIS_DEFAULT_PORT},
{"prospero", Uri::URI_PROSPERO_DEFAULT_PORT},
{"nfs", Uri::URI_NFS_DEFAULT_PORT},
{"tip", Uri::URI_TIP_DEFAULT_PORT},
{"acap", Uri::URI_ACAP_DEFAULT_PORT},
{"telnet", Uri::URI_TELNET_DEFAULT_PORT},
{"ssh", Uri::URI_SSH_DEFAULT_PORT},
{ NULL, 0xFFFF } /* unknown port */
};
static unsigned int port_of_Scheme(const char *scheme_str)
{
schemes_t *scheme;
if (scheme_str) {
for (scheme = schemes; scheme->name != NULL; ++scheme) {
if (strcasecmp(scheme_str, scheme->name) == 0) {
return scheme->default_port;
}
}
}
return 0;
}
/* We have a lookup table (uri_delims) that we can index by character and it
 * tells us if the character is one of the interesting delimiters. Note that
 * we even get compares for NUL for free -- it's just another delimiter.
 */
#define T_COLON 0x01 /* ':' */
#define T_SLASH 0x02 /* '/' */
#define T_QUESTION 0x04 /* '?' */
#define T_HASH 0x08 /* '#' */
#define T_NUL 0x80 /* '\0' */
/* the uri_delims.h file is autogenerated by gen_uri_delims.c */
/* this file is automatically generated by gen_uri_delims, do not edit */
/* Indexed by byte value. The only non-zero entries are: */
/*   index 0 ('\0') -> T_NUL,   index 35 ('#') -> T_HASH, */
/*   index 47 ('/') -> T_SLASH, index 58 (':') -> T_COLON, */
/*   index 63 ('?') -> T_QUESTION. */
static const unsigned char uri_delims[256] = {
T_NUL,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,T_HASH,0,0,0,0,
0,0,0,0,0,0,0,T_SLASH,0,0,0,0,0,0,0,0,0,0,T_COLON,0,
0,0,0,T_QUESTION,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
};
/* it works like this:
if ((uri_delims[ch] & NOTEND_foobar) == 0) {
then we're not at a delimiter for foobar
}
*/
/* Note that we optimize the scheme scanning here, we cheat and let the
* compiler know that it doesn't have to do the & masking.
*/
#define NOTEND_SCHEME (0xff)
#define NOTEND_HOSTINFO (T_SLASH | T_QUESTION | T_HASH | T_NUL)
#define NOTEND_PATH (T_QUESTION | T_HASH | T_NUL)
static size_t wwwPrefixOffset(const std::string& hostname);
/**
 * Construct an empty Uri: every string component is empty, no query or
 * fragment is marked as present, and the port is 0.
 */
Uri::Uri()
    : mScheme(),
      mUser(),
      mPassword(),
      mHostname(),
      mPath(),
      mQuery(),
      mFragment(),
      mExistsQuery(false),
      mExistsFragment(false),
      mPort(0)
{
}
/**
 * Construct a Uri by parsing uri_str.
 *
 * All components start out empty (port 0) and are then filled in by init(),
 * so any exception raised by init() propagates out of this constructor.
 *
 * @param uri_str textual URI to parse
 */
Uri::Uri(const string &uri_str)
    : mScheme(),
      mUser(),
      mPassword(),
      mHostname(),
      mPath(),
      mQuery(),
      mFragment(),
      mExistsQuery(false),
      mExistsFragment(false),
      mPort(0)
{
    this->init(uri_str);
}
/**
 * Parse uri_str and store its components in this object.
 *
 * Recognised shapes:
 *   - scheme://[user[:password]@]host[:port][path][?query][#fragment]
 *   - anything else (leading '/', first character not a letter, or no
 *     "://" after the scheme candidate) is treated as
 *     [path][?query][#fragment].
 *
 * Details visible in the code below:
 *   - An empty uri_str leaves the object untouched.
 *   - The path ends at the first '?' or '#'. The query runs from '?' to the
 *     first '#', and the fragment is everything after that '#'.
 *   - The host info ends at the first '/', '?' or '#'. Inside it the LAST
 *     '@' separates user info from the host, the first ':' of the user info
 *     separates user from password, and the first ':' of the host part
 *     introduces the port.
 *   - With "host:" and nothing after the colon, mPort is set to the scheme's
 *     default port. With no colon at all, mPort is not modified.
 *   - Fields that are not present in uri_str are not reset.
 *
 * @param uri_str textual URI to parse
 * @throws Exception when the text after "host:" is not fully consumed by
 *         strtol() as a base-10 number
 */
void Uri::init(const string &uri_str)
{
    DEBUGP("Parsing uri %s\n", uri_str.c_str());
    if(uri_str.empty()) return;
    const char *uri = uri_str.c_str();
    const char *s;
    const char *s1;
    const char *hostinfo;
    char *endstr;
    /* We assume the processor has a branch predictor like most --
     * it assumes forward branches are untaken and backwards are taken. That's
     * the reason for the gotos. -djg
     */
    if (uri[0] == '/') {
deal_with_path:
        DEBUGP("Dealing with path\n");
        /* we expect uri to point to first character of path ... remember
         * that the path could be empty -- http://foobar?query for example
         */
        s = uri;
        while ((uri_delims[*(unsigned char *)s] & NOTEND_PATH) == 0) {
            ++s;
        }
        if (s != uri) {
            mPath.assign(uri, s - uri);
            DEBUGP("Path is %s\n", mPath.c_str());
        }
        if (*s == 0) {
            return;
        }
        if (*s == '?') {
            ++s;
            s1 = strchr(s, '#');
            if (s1) {
                mFragment.assign(s1 + 1);
                mExistsFragment = true;
                DEBUGP("Fragment is %s\n", mFragment.c_str());
                mQuery.assign(s, s1 - s);
                mExistsQuery = true;
                DEBUGP("Query is %s\n", mQuery.c_str());
            }
            else {
                mQuery.assign(s);
                mExistsQuery = true;
                DEBUGP("Query is %s\n", mQuery.c_str());
            }
            return;
        }
        /* otherwise it's a fragment */
        mFragment.assign(s + 1);
        mExistsFragment = true;
        DEBUGP("Fragment is %s\n", mFragment.c_str());
        return;
    }
    DEBUGP("Dealing with scheme\n");
    /* find the scheme: */
    /* The cast matters: passing a negative char (any byte >= 0x80 where char
     * is signed) to isalpha() is undefined behaviour.
     */
    if (!isalpha(static_cast<unsigned char>(*uri))) goto deal_with_path;
    s = uri;
    while ((uri_delims[*(unsigned char *)s] & NOTEND_SCHEME) == 0) {
        ++s;
    }
    /* scheme must be non-empty and followed by :// */
    if (s == uri || s[0] != ':' || s[1] != '/' || s[2] != '/') {
        goto deal_with_path; /* backwards predicted taken! */
    }
    mScheme.assign(uri, s - uri);
    DEBUGP("Scheme is %s\n", mScheme.c_str());
    s += 3;
    DEBUGP("Finding hostinfo\n");
    hostinfo = s;
    DEBUGP("Hostinfo is %s\n", hostinfo);
    while ((uri_delims[*(unsigned char *)s] & NOTEND_HOSTINFO) == 0) {
        ++s;
    }
    uri = s; /* whatever follows hostinfo is start of uri */
    // mHostinfo.assign(hostinfo, uri - hostinfo);
    /* If there's a username:password@host:port, the @ we want is the last @...
     * too bad there's no memrchr()... For the C purists, note that hostinfo
     * is definitely not the first character of the original uri so therefore
     * &hostinfo[-1] < &hostinfo[0] ... and this loop is valid C.
     */
    do {
        --s;
    } while (s >= hostinfo && *s != '@');
    if (s < hostinfo) {
        /* again we want the common case to be fall through */
deal_with_host:
        DEBUGP("Dealing with host\n");
        /* We expect hostinfo to point to the first character of
         * the hostname. If there's a port it is the first colon.
         */
        s = (char *)memchr(hostinfo, ':', uri - hostinfo);
        if (s == NULL) {
            /* we expect the common case to have no port */
            mHostname.assign(hostinfo, uri - hostinfo);
            DEBUGP("Hostname is %s\n", mHostname.c_str());
            goto deal_with_path;
        }
        mHostname.assign(hostinfo, s - hostinfo);
        DEBUGP("Hostname is %s\n", mHostname.c_str());
        ++s;
        if (uri != s) {
            mPortStr.assign(s, uri - s);
            /* NOTE(review): strtol() also accepts leading whitespace and a
             * sign, and the result is stored without a range check --
             * confirm whether such ports should be rejected.
             */
            mPort = strtol(mPortStr.c_str(), &endstr, 10);
            if (*endstr == '\0') {
                goto deal_with_path;
            }
            /* Invalid characters after ':' found */
            DEBUGP("Throwing invalid url exception\n");
            throw Exception("Invalid character after ':'");
        }
        /* "host:" with an empty port: fall back to the scheme default */
        this->mPort = port_of_Scheme(mScheme.c_str());
        goto deal_with_path;
    }
    /* first colon delimits username:password */
    s1 = (char *)memchr(hostinfo, ':', s - hostinfo);
    if (s1) {
        mUser.assign(hostinfo, s1 - hostinfo);
        ++s1;
        mPassword.assign(s1, s - s1);
    }
    else {
        mUser.assign(hostinfo, s - hostinfo);
    }
    hostinfo = s + 1;
    goto deal_with_host;
}
Uri::~Uri() {
}
string Uri::scheme() const { return mScheme; }
void Uri::scheme(string scheme) {
mScheme = scheme;
}
string Uri::user() const { return mUser; }
void Uri::user(string user) {
mUser = user;
}
string Uri::password() const { return mPassword; }
void Uri::password(string password) {
mPassword = password;
}
string Uri::hostname() const { return mHostname; }
void Uri::hostname(string hostname) {
mHostname = hostname;
}
string Uri::path() const { return mPath; }
void Uri::path(string path) {
mPath = path;
}
bool Uri::existsFragment() const { return mExistsFragment; }
void Uri::existsFragment(bool existsFragment) {
mExistsFragment = existsFragment;
}
bool Uri::existsQuery() const { return mExistsQuery; }
void Uri::existsQuery(bool existsQuery) {
mExistsQuery = existsQuery;
}
string Uri::query() const { return mQuery; }
void Uri::query(string query) {
mQuery = query;
}
string Uri::fragment() const { return mFragment; }
void Uri::fragment(string fragment) {
mFragment = fragment;
}
unsigned int Uri::port() const { return mPort; }
void Uri::port(unsigned int port) { mPort = port; }
/* NULL-terminated lists consulted by Uri::unparse() when the */
/* REMOVE_DEFAULT_FILENAMES flag is set: a path that ends in one of the */
/* extensions, immediately preceded by one of the file names, has that */
/* name and extension cut off. */
static const char *default_filenames[] = { "index", "default", NULL };
static const char *default_extensions[] = { ".html", ".htm", ".php", ".shtml", ".asp", ".cgi", NULL };
static unsigned short default_port_for_scheme(const char *scheme_str)
{
schemes_t *scheme;
if (scheme_str == NULL)
return 0;
for (scheme = schemes; scheme->name != NULL; ++scheme)
if (strcasecmp(scheme_str, scheme->name) == 0)
return scheme->default_port;
return 0;
}
/**
 * Resolve this URI against base and return the result.
 *
 * - If this URI has a scheme it is returned as is, except that an empty
 *   path becomes "/".
 * - Otherwise the result starts as a copy of base (empty path replaced by
 *   "/") and is then adjusted:
 *     - empty path here: the query and fragment are taken from this URI when
 *       it has a query; only the fragment is taken when it has just a
 *       fragment; otherwise base is returned unchanged;
 *     - path starting with '/': path, query and fragment replace base's;
 *     - any other path: it is appended after the last '/' of base's path,
 *       and query and fragment replace base's.
 *
 * "." and ".." segments are not collapsed.
 *
 * @param base URI used to supply the missing components
 * @return the resolved URI
 */
Uri Uri::absolute(const Uri &base) const
{
    // Already absolute: only make sure the path is not empty.
    if (!mScheme.empty())
    {
        Uri self(*this);
        if (self.mPath.empty()) self.mPath = "/";
        return self;
    }

    Uri resolved(base);
    if (resolved.mPath.empty()) resolved.mPath = "/";

    if (mPath.empty())
    {
        if (!mExistsQuery)
        {
            // Fragment-only reference (or nothing at all): keep base's query.
            if (mExistsFragment)
            {
                resolved.mFragment = mFragment;
                resolved.mExistsFragment = mExistsFragment;
            }
            return resolved;
        }
        // Query-only reference: base's path is kept, query/fragment replaced below.
    }
    else if (mPath[0] == '/')
    {
        resolved.mPath = mPath;
    }
    else
    {
        // Relative path: replace everything after the last '/' of the base path.
        string merged(resolved.mPath);
        const string::size_type lastSlash = merged.rfind("/");
        if (lastSlash != string::npos) merged.erase(lastSlash + 1);
        merged += mPath;
        resolved.mPath = merged;
    }

    resolved.mQuery = mQuery;
    resolved.mExistsQuery = mExistsQuery;
    resolved.mFragment = mFragment;
    resolved.mExistsFragment = mExistsFragment;
    return resolved;
}
/**
 * Build the textual form of this URI.
 *
 * Output order: scheme "://", host name, ":" port, path, "?" query,
 * "#" fragment. User and password are not emitted.
 *
 * flags is a bit mask of:
 *   - REMOVE_SCHEME: omit the scheme and its "://".
 *   - REMOVE_WWW_PREFIX: skip a leading "www." / "wwwN." of the host name.
 *   - REMOVE_DEFAULT_FILENAMES: cut a trailing default file name ("index",
 *     "default") plus default extension (".html", ...) from the path.
 *   - REMOVE_TRAILING_BAR: drop a final '/' unless the path is only "/".
 *   - REMOVE_QUERY: omit the query entirely.
 *   - REMOVE_QUERY_VALUES: keep parameter names and '&', drop everything
 *     from each '=' up to the next '&'.
 *   - REMOVE_FRAGMENT: omit the fragment.
 *
 * The port is emitted only when a port string was parsed and it is not the
 * default port of a non-empty scheme.
 *
 * @param flags bit mask described above
 * @return the assembled string
 */
string Uri::unparse(int flags ) const
{
    string ret;
    ret.reserve(mScheme.length() + mUser.length() + mPassword.length() + mHostname.length() + mPath.length() + mQuery.length() + mFragment.length() + mPortStr.length());

    DEBUGP("Unparsing scheme\n");
    if(!(Uri::REMOVE_SCHEME & flags) && !mScheme.empty()) {
        ret += mScheme;
        ret += "://";
    }

    DEBUGP("Unparsing hostname\n");
    if(!mHostname.empty()) {
        size_t offset = 0;
        if((flags & Uri::REMOVE_WWW_PREFIX) && mHostname.length() > 3) {
            offset = wwwPrefixOffset(mHostname);
        }
        ret += (mHostname.c_str() + offset);
    }

    DEBUGP("Unparsing port\n");
    if (!mPortStr.empty() && !(!mScheme.empty() && mPort == default_port_for_scheme(mScheme.c_str())))
    {
        ret += ':';
        ret += mPortStr;
    }

    DEBUGP("Unparsing path\n");
    if(!mPath.empty())
    {
        // Work on a std::string copy: no manual buffer to leak, and every
        // suffix comparison below is length-checked so it never looks before
        // the start of the path.
        string path(mPath);
        if(flags & Uri::REMOVE_DEFAULT_FILENAMES) {
            // Length of the default extension the path ends with (0 = none).
            size_t extLen = 0;
            for(const char **ext = default_extensions; *ext != NULL; ++ext) {
                const size_t len = strlen(*ext);
                if(path.length() >= len && path.compare(path.length() - len, len, *ext) == 0) {
                    extLen = len;
                    break;
                }
            }
            if(extLen != 0) {
                // NOTE(review): the name is matched as a suffix of the stem,
                // so "/myindex.html" is cut to "/my" -- confirm whether a
                // preceding '/' should be required.
                const size_t stemEnd = path.length() - extLen;
                for(const char **name = default_filenames; *name != NULL; ++name) {
                    const size_t len = strlen(*name);
                    if(stemEnd >= len && path.compare(stemEnd - len, len, *name) == 0) {
                        path.erase(stemEnd - len); //cut filename
                        break;
                    }
                }
            }
        }
        if(flags & Uri::REMOVE_TRAILING_BAR) {
            //do not remove if path is only the bar
            if(path.length() > 1 && path[path.length() - 1] == '/') {
                path.erase(path.length() - 1);
            }
        }
        ret += path;
    }

    DEBUGP("Unparsing query\n");
    if(!(flags & Uri::REMOVE_QUERY) && mExistsQuery) {
        ret += '?';
        if(flags & Uri::REMOVE_QUERY_VALUES) {
            // Copy names and '&' separators; skip from each '=' to the next '&'.
            bool insideValue = false;
            for(const char *ptr = mQuery.c_str(); *ptr; ++ptr) {
                if(*ptr == '=') {
                    insideValue = true;
                }
                if(*ptr == '&') {
                    insideValue = false;
                }
                if(!insideValue) {
                    ret += *ptr;
                }
            }
        } else {
            ret += mQuery;
        }
    }

    DEBUGP("Unparsing fragment\n");
    if(!(flags & Uri::REMOVE_FRAGMENT) && mExistsFragment)
    {
        ret += '#';
        ret += mFragment;
    }
    return ret;
}
/**
 * Return the length of a leading "www." or "www<digit>." prefix.
 *
 * The "www" part is matched case-insensitively.
 *
 * @param hostname host name to inspect
 * @return 4 for "www.", 5 for "www" + one decimal digit + ".", 0 otherwise
 */
static size_t wwwPrefixOffset(const std::string& hostname)
{
    const std::string::size_type len = hostname.length();
    if(strncasecmp("www", hostname.c_str(), 3) != 0)
    {
        return 0;
    }
    if(len > 3 && hostname[3] == '.')
    {
        return 4;
    }
    // Explicit range test instead of isdigit(): passing a negative char
    // (byte >= 0x80 where char is signed) to isdigit() is undefined behaviour.
    if(len > 4 && hostname[3] >= '0' && hostname[3] <= '9' && hostname[4] == '.')
    {
        return 5;
    }
    return 0;
}
/**
 * Return the tail of the host name made of its TLD suffix plus up to
 * maxDepth dot-separated labels in front of it.
 *
 * The scan starts at the position given by tldOffset() counted from the end
 * of the host name and walks backwards, counting '.' characters, until
 * maxDepth dots were seen or the end of the "www" prefix (as reported by
 * wwwPrefixOffset()) is reached. A '.' at the resulting position is skipped.
 *
 * NOTE(review): tldOffset() comes from tld.h; it is presumably the length of
 * the TLD suffix including its leading dot -- confirm.
 *
 * @param maxDepth maximum number of labels to keep in front of the suffix
 * @return the selected tail of the host name
 */
std::string Uri::canonicalHostname(unsigned int maxDepth) const
{
    size_t prefixOffset = wwwPrefixOffset(mHostname);
    size_t suffixOffset = tldOffset(mHostname.c_str());
    unsigned int depth = 0;
    string::const_iterator canonicalStart = mHostname.begin() + prefixOffset;
    string::const_iterator ptr = mHostname.begin();
    ptr += mHostname.length() - suffixOffset;
    while (depth < maxDepth && ptr > canonicalStart)
    {
        --ptr;
        if (*ptr == '.') ++depth;
    }
    // ptr may still be end() (empty host name, or no suffix and no label
    // consumed); dereferencing end() is not allowed, so check first.
    if (ptr != mHostname.end() && *ptr == '.') ++ptr;
    return string(ptr, mHostname.end());
}
/** Value of one hexadecimal digit (0-9, A-F, a-f), or -1 if c is not one. */
static int uriHexDigitValue(unsigned char c)
{
    if (c >= '0' && c <= '9') return c - '0';
    if (c >= 'A' && c <= 'F') return c - 'A' + 10;
    if (c >= 'a' && c <= 'f') return c - 'a' + 10;
    return -1;
}

/**
 * Decode percent-escapes ("%XX") in uri.
 *
 * - "%" followed by two hexadecimal digits (upper or lower case) is replaced
 *   by the byte with that value.
 * - "%" followed by at least one character but not by two hexadecimal
 *   digits is dropped; the characters after it are processed normally.
 * - "%" as the very last character is copied unchanged.
 * - Processing stops at the first NUL byte of uri.
 *
 * @param uri text to decode
 * @return the decoded text
 */
std::string Uri::decode(const std::string &uri)
{
    //Note from RFC1630: "Sequences which start with a percent sign
    //but are not followed by two hexadecimal characters (0-9,A-F) are reserved
    //for future extension"
    //RFC 3986 section 2.1 makes lower-case a-f equivalent to A-F, so both
    //are accepted here.
    const unsigned char *ptr = (const unsigned char *)uri.c_str();
    string ret;
    ret.reserve(uri.length());
    for (; *ptr; ++ptr)
    {
        if (*ptr == '%' && *(ptr + 1))
        {
            // ptr[1] is not NUL, so ptr[2] is at worst the terminator.
            const int high = uriHexDigitValue(*(ptr + 1));
            const int low = uriHexDigitValue(*(ptr + 2));
            if (high < 0 || low < 0) continue; // malformed escape: '%' is dropped
            ret += (char)((high << 4) | low);
            ptr += 2;
            continue;
        }
        ret += *ptr;
    }
    return ret;
}
//This vector is generated by safechars.py. Please do not edit by hand.
static const char safe[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
/**
 * Percent-encode every byte of uri that the safe[] table does not mark as
 * safe.
 *
 * Each such byte is written as '%' followed by exactly two upper-case
 * hexadecimal digits, so bytes below 0x10 get a leading zero (for example
 * "%0A"). Processing stops at the first NUL byte of uri.
 *
 * @param uri text to encode
 * @return the encoded text
 */
std::string Uri::encode(const std::string &uri)
{
    static const char hexDigits[] = "0123456789ABCDEF";
    string ret;
    const unsigned char *ptr = (const unsigned char *)uri.c_str();
    ret.reserve(uri.length());
    for (; *ptr ; ++ptr)
    {
        if (safe[*ptr])
        {
            ret += *ptr;
        }
        else
        {
            ret += '%';
            ret += hexDigits[*ptr >> 4];
            ret += hexDigits[*ptr & 0x0F];
        }
    }
    return ret;
}