#include "Uri.h" #include "wincstring.h" #include #include #include #include "tld.h" //#define DEBUG #include "debug.h" using namespace std; using namespace htmlcxx; /** Structure to store various schemes and their default ports */ struct schemes_t { /** The name of the scheme */ const char *name; /** The default port for the scheme */ unsigned int default_port; }; /* Some WWW schemes and their default ports; this is basically /etc/services */ /* This will become global when the protocol abstraction comes */ /* As the schemes are searched by a linear search, */ /* they are sorted by their expected frequency */ static schemes_t schemes[] = { {"http", Uri::URI_HTTP_DEFAULT_PORT}, {"ftp", Uri::URI_FTP_DEFAULT_PORT}, {"https", Uri::URI_HTTPS_DEFAULT_PORT}, {"gopher", Uri::URI_GOPHER_DEFAULT_PORT}, {"ldap", Uri::URI_LDAP_DEFAULT_PORT}, {"nntp", Uri::URI_NNTP_DEFAULT_PORT}, {"snews", Uri::URI_SNEWS_DEFAULT_PORT}, {"imap", Uri::URI_IMAP_DEFAULT_PORT}, {"pop", Uri::URI_POP_DEFAULT_PORT}, {"sip", Uri::URI_SIP_DEFAULT_PORT}, {"rtsp", Uri::URI_RTSP_DEFAULT_PORT}, {"wais", Uri::URI_WAIS_DEFAULT_PORT}, {"z39.50r", Uri::URI_WAIS_DEFAULT_PORT}, {"z39.50s", Uri::URI_WAIS_DEFAULT_PORT}, {"prospero", Uri::URI_PROSPERO_DEFAULT_PORT}, {"nfs", Uri::URI_NFS_DEFAULT_PORT}, {"tip", Uri::URI_TIP_DEFAULT_PORT}, {"acap", Uri::URI_ACAP_DEFAULT_PORT}, {"telnet", Uri::URI_TELNET_DEFAULT_PORT}, {"ssh", Uri::URI_SSH_DEFAULT_PORT}, { NULL, 0xFFFF } /* unknown port */ }; static unsigned int port_of_Scheme(const char *scheme_str) { schemes_t *scheme; if (scheme_str) { for (scheme = schemes; scheme->name != NULL; ++scheme) { if (strcasecmp(scheme_str, scheme->name) == 0) { return scheme->default_port; } } } return 0; } /* We have a apr_table_t that we can index by character and it tells us if the * character is one of the interesting delimiters. Note that we even get * compares for NUL for free -- it's just another delimiter. */ #define T_COLON 0x01 /* ':' */ #define T_SLASH 0x02 /* '/' */ #define T_QUESTION 0x04 /* '?' */ #define T_HASH 0x08 /* '#' */ #define T_NUL 0x80 /* '\0' */ /* the uri_delims.h file is autogenerated by gen_uri_delims.c */ /* this file is automatically generated by gen_uri_delims, do not edit */ static const unsigned char uri_delims[256] = { T_NUL,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,T_HASH,0,0,0,0, 0,0,0,0,0,0,0,T_SLASH,0,0,0,0,0,0,0,0,0,0,T_COLON,0, 0,0,0,T_QUESTION,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 }; /* it works like this: if (uri_delims[ch] & NOTEND_foobar) { then we're not at a delimiter for foobar } */ /* Note that we optimize the scheme scanning here, we cheat and let the * compiler know that it doesn't have to do the & masking. */ #define NOTEND_SCHEME (0xff) #define NOTEND_HOSTINFO (T_SLASH | T_QUESTION | T_HASH | T_NUL) #define NOTEND_PATH (T_QUESTION | T_HASH | T_NUL) static size_t wwwPrefixOffset(const std::string& hostname); Uri::Uri() : mScheme(), mUser(), mPassword(), mHostname(), mPath(), mQuery(), mFragment(), mExistsQuery(false), mExistsFragment(false), mPort(0) {} Uri::Uri(const string &uri_str) : mScheme(), mUser(), mPassword(), mHostname(), mPath(), mQuery(), mFragment(), mExistsQuery(false), mExistsFragment(false), mPort(0) { init(uri_str); } void Uri::init(const string &uri_str) { DEBUGP("Parsing uri %s\n", uri_str.c_str()); if(uri_str.empty()) return; const char *uri = uri_str.c_str(); const char *s; const char *s1; const char *hostinfo; char *endstr; /* We assume the processor has a branch predictor like most -- * it assumes forward branches are untaken and backwards are taken. That's * the reason for the gotos. -djg */ if (uri[0] == '/') { deal_with_path: DEBUGP("Dealing with path\n"); /* we expect uri to point to first character of path ... remember * that the path could be empty -- http://foobar?query for example */ s = uri; while ((uri_delims[*(unsigned char *)s] & NOTEND_PATH) == 0) { ++s; } if (s != uri) { mPath.assign(uri, s - uri); DEBUGP("Path is %s\n", mPath.c_str()); } if (*s == 0) { return; } if (*s == '?') { ++s; s1 = strchr(s, '#'); if (s1) { mFragment.assign(s1 + 1); mExistsFragment = true; DEBUGP("Fragment is %s\n", mFragment.c_str()); mQuery.assign(s, s1 - s); mExistsQuery = true; DEBUGP("Query is %s\n", mQuery.c_str()); } else { mQuery.assign(s); mExistsQuery = true; DEBUGP("Query is %s\n", mQuery.c_str()); } return; } /* otherwise it's a fragment */ mFragment.assign(s + 1); mExistsFragment = true; DEBUGP("Fragment is %s\n", mFragment.c_str()); return; } DEBUGP("Dealing with scheme\n"); /* find the scheme: */ if (!isalpha(*uri)) goto deal_with_path; s = uri; while ((uri_delims[*(unsigned char *)s] & NOTEND_SCHEME) == 0) { ++s; } /* scheme must be non-empty and followed by :// */ if (s == uri || s[0] != ':' || s[1] != '/' || s[2] != '/') { goto deal_with_path; /* backwards predicted taken! */ } mScheme.assign(uri, s - uri); DEBUGP("Scheme is %s\n", mScheme.c_str()); s += 3; DEBUGP("Finding hostinfo\n"); hostinfo = s; DEBUGP("Hostinfo is %s\n", hostinfo); while ((uri_delims[*(unsigned char *)s] & NOTEND_HOSTINFO) == 0) { ++s; } uri = s; /* whatever follows hostinfo is start of uri */ // mHostinfo.assign(hostinfo, uri - hostinfo); /* If there's a username:password@host:port, the @ we want is the last @... * too bad there's no memrchr()... For the C purists, note that hostinfo * is definately not the first character of the original uri so therefore * &hostinfo[-1] < &hostinfo[0] ... and this loop is valid C. */ do { --s; } while (s >= hostinfo && *s != '@'); if (s < hostinfo) { /* again we want the common case to be fall through */ deal_with_host: DEBUGP("Dealing with host\n"); /* We expect hostinfo to point to the first character of * the hostname. If there's a port it is the first colon. */ s = (char *)memchr(hostinfo, ':', uri - hostinfo); if (s == NULL) { /* we expect the common case to have no port */ mHostname.assign(hostinfo, uri - hostinfo); DEBUGP("Hostname is %s\n", mHostname.c_str()); goto deal_with_path; } mHostname.assign(hostinfo, s - hostinfo); DEBUGP("Hostname is %s\n", mHostname.c_str()); ++s; if (uri != s) { mPortStr.assign(s, uri - s); mPort = strtol(mPortStr.c_str(), &endstr, 10); if (*endstr == '\0') { goto deal_with_path; } /* Invalid characters after ':' found */ DEBUGP("Throwing invalid url exception\n"); throw Exception("Invalid character after ':'"); } this->mPort = port_of_Scheme(mScheme.c_str()); goto deal_with_path; } /* first colon delimits username:password */ s1 = (char *)memchr(hostinfo, ':', s - hostinfo); if (s1) { mUser.assign(hostinfo, s1 - hostinfo); ++s1; mPassword.assign(s1, s - s1); } else { mUser.assign(hostinfo, s - hostinfo); } hostinfo = s + 1; goto deal_with_host; } Uri::~Uri() { } string Uri::scheme() const { return mScheme; } void Uri::scheme(string scheme) { mScheme = scheme; } string Uri::user() const { return mUser; } void Uri::user(string user) { mUser = user; } string Uri::password() const { return mPassword; } void Uri::password(string password) { mPassword = password; } string Uri::hostname() const { return mHostname; } void Uri::hostname(string hostname) { mHostname = hostname; } string Uri::path() const { return mPath; } void Uri::path(string path) { mPath = path; } bool Uri::existsFragment() const { return mExistsFragment; } void Uri::existsFragment(bool existsFragment) { mExistsFragment = existsFragment; } bool Uri::existsQuery() const { return mExistsQuery; } void Uri::existsQuery(bool existsQuery) { mExistsQuery = existsQuery; } string Uri::query() const { return mQuery; } void Uri::query(string query) { mQuery = query; } string Uri::fragment() const { return mFragment; } void Uri::fragment(string fragment) { mFragment = fragment; } unsigned int Uri::port() const { return mPort; } void Uri::port(unsigned int port) { mPort = port; } static const char *default_filenames[] = { "index", "default", NULL }; static const char *default_extensions[] = { ".html", ".htm", ".php", ".shtml", ".asp", ".cgi", NULL }; static unsigned short default_port_for_scheme(const char *scheme_str) { schemes_t *scheme; if (scheme_str == NULL) return 0; for (scheme = schemes; scheme->name != NULL; ++scheme) if (strcasecmp(scheme_str, scheme->name) == 0) return scheme->default_port; return 0; } Uri Uri::absolute(const Uri &base) const { if (mScheme.empty()) { Uri root(base); if (root.mPath.empty()) root.mPath = "/"; if (mPath.empty()) { if (mExistsQuery) { root.mQuery = mQuery; root.mExistsQuery = mExistsQuery; root.mFragment = mFragment; root.mExistsFragment = mExistsFragment; } else if (mExistsFragment) { root.mFragment = mFragment; root.mExistsFragment = mExistsFragment; } } else if (mPath[0] == '/') { root.mPath = mPath; root.mQuery = mQuery; root.mExistsQuery = mExistsQuery; root.mFragment = mFragment; root.mExistsFragment = mExistsFragment; } else { string path(root.mPath); string::size_type find; find = path.rfind("/"); if (find != string::npos) path.erase(find+1); path += mPath; root.mPath = path; root.mQuery = mQuery; root.mExistsQuery = mExistsQuery; root.mFragment = mFragment; root.mExistsFragment = mExistsFragment; } return root; } if (mPath.empty()) { Uri root(*this); root.mPath = "/"; return root; } return *this; } string Uri::unparse(int flags ) const { string ret; ret.reserve(mScheme.length() + mUser.length() + mPassword.length() + mHostname.length() + mPath.length() + mQuery.length() + mFragment.length() + mPortStr.length()); DEBUGP("Unparsing scheme\n"); if(!(Uri::REMOVE_SCHEME & flags)) { if(!mScheme.empty()) { ret += mScheme; ret += "://"; } } DEBUGP("Unparsing hostname\n"); if(!mHostname.empty()) { size_t offset = 0; if(flags & Uri::REMOVE_WWW_PREFIX && mHostname.length() > 3) { offset = wwwPrefixOffset(mHostname); } ret += (mHostname.c_str() + offset); } DEBUGP("Unparsing port\n"); if (!mPortStr.empty() && !(!mScheme.empty() && mPort == default_port_for_scheme(mScheme.c_str()))) { ret += ':'; ret += mPortStr; } DEBUGP("Unparsing path\n"); if(!mPath.empty()) { char *buf = new char[mPath.length() + 1]; memcpy(buf, mPath.c_str(), mPath.length() + 1); if(flags & Uri::REMOVE_DEFAULT_FILENAMES) { const char **ptr = default_extensions; char *end = buf + mPath.length(); size_t offset = 0; while(*ptr != NULL) { size_t len = strlen(*ptr); if((strcmp(end - len, *ptr)) == 0) { offset = len; break; } ++ptr; } if(offset == 0) goto remove_bar; ptr = default_filenames; bool found = false; while(*ptr != NULL) { size_t len = strlen(*ptr); if(strncmp(end - offset - len, *ptr, len) == 0) { offset += len; found = true; break; } ++ptr; } if(found) { *(end - offset) = 0; //cut filename } } remove_bar: if(flags & Uri::REMOVE_TRAILING_BAR) { if(strlen(buf) > 1 && buf[strlen(buf) - 1] == '/') { //do not remove if path is only the bar buf[strlen(buf) - 1] = 0; } } ret += buf; delete [] buf; } DEBUGP("Unparsing query\n"); if(!(flags & Uri::REMOVE_QUERY) && mExistsQuery) { ret += '?'; if(flags & Uri::REMOVE_QUERY_VALUES) { const char *ptr = mQuery.c_str(); bool inside = false; while(*ptr) { if(*ptr == '=') { inside = true; } if(*ptr == '&') { inside = false; } if(inside) { ++ptr; } else { ret += *ptr; ++ptr; } } } else { ret += mQuery; } } DEBUGP("Unparsing fragment\n"); if(!(flags & Uri::REMOVE_FRAGMENT) && mExistsFragment) { ret += '#'; ret += mFragment; } return ret; } static size_t wwwPrefixOffset(const std::string& hostname) { string::size_type len = hostname.length(); if(strncasecmp("www", hostname.c_str(), 3) == 0) { if(len > 3 && hostname[3] == '.') { return 4; } if(len > 4 && isdigit(hostname[3]) && hostname[4] == '.') { return 5; } } return 0; } std::string Uri::canonicalHostname(unsigned int maxDepth) const { size_t prefixOffset = wwwPrefixOffset(mHostname); size_t suffixOffset = tldOffset(mHostname.c_str()); unsigned int depth = 0; string::const_iterator canonicalStart = mHostname.begin() + prefixOffset; string::const_iterator ptr = mHostname.begin(); ptr += mHostname.length() - suffixOffset; while (depth < maxDepth && ptr > canonicalStart) { --ptr; if (*ptr == '.') ++depth; } if (*ptr == '.') ++ptr; return string(ptr, mHostname.end()); } std::string Uri::decode(const std::string &uri) { //Note from RFC1630: "Sequences which start with a percent sign //but are not followed by two hexadecimal characters (0-9,A-F) are reserved //for future extension" const unsigned char *ptr = (const unsigned char *)uri.c_str(); string ret; ret.reserve(uri.length()); for (; *ptr; ++ptr) { if (*ptr == '%') { if (*(ptr + 1)) { char a = *(ptr + 1); char b = *(ptr + 2); if (!((a >= 0x30 && a < 0x40) || (a >= 0x41 && a < 0x47))) continue; if (!((b >= 0x30 && b < 0x40) || (b >= 0x41 && b < 0x47))) continue; char buf[3]; buf[0] = a; buf[1] = b; buf[2] = 0; ret += (char)strtoul(buf, NULL, 16); ptr += 2; continue; } } ret += *ptr; } return ret; } //This vector is generated by safechars.py. Please do not edit by hand. static const char safe[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; std::string Uri::encode(const std::string &uri) { string ret; const unsigned char *ptr = (const unsigned char *)uri.c_str(); ret.reserve(uri.length()); for (; *ptr ; ++ptr) { if (!safe[*ptr]) { char buf[5]; memset(buf, 0, 5); snprintf(buf, 5, "%%%X", (*ptr)); ret.append(buf); } else { ret += *ptr; } } return ret; }