546 lines
11 KiB
C++
546 lines
11 KiB
C++
#include <algorithm>
|
||
#include <cctype>
|
||
#include <cstring>
|
||
#include <strstream>
|
||
#include "Uri.h"
|
||
|
||
#include "utils.h"
|
||
|
||
using namespace std;
|
||
namespace htmlcxx {
|
||
namespace HTML {
|
||
|
||
bool detect_utf8(const char *begin, int size)
|
||
{
|
||
const char *ptr;
|
||
const char *end = begin+size;
|
||
const char *signature = "";
|
||
char previous_byte = 0;
|
||
unsigned count_bad_utf = 0;
|
||
unsigned count_good_utf = 0;
|
||
|
||
if (!strncmp(begin, signature, 3)) return true;
|
||
|
||
for (ptr = begin; ptr != end; ++ptr)
|
||
{
|
||
if ((*ptr & 0xC0) == 0x80)
|
||
{
|
||
if ((previous_byte & 0xC0) == 0xC0)
|
||
{
|
||
count_good_utf ++;
|
||
}
|
||
else if ((previous_byte & 0x80) == 0x00)
|
||
{
|
||
count_bad_utf ++;
|
||
}
|
||
}
|
||
else if ((previous_byte & 0xC0) == 0xC0)
|
||
{
|
||
count_bad_utf ++;
|
||
}
|
||
|
||
previous_byte = *ptr;
|
||
}
|
||
|
||
return count_good_utf > count_bad_utf;
|
||
}
|
||
|
||
string single_blank(const string &str) {
|
||
|
||
unsigned int count = 0;
|
||
bool first_space = true;
|
||
const char *ptr = str.c_str();
|
||
|
||
string ret(str.length(), ' ');
|
||
|
||
// Skip space at beginning
|
||
while (isspace(*ptr)) ++ptr;
|
||
|
||
while (*ptr)
|
||
{
|
||
if (isspace(*ptr))
|
||
{
|
||
if (first_space)
|
||
{
|
||
first_space = false;
|
||
ret[count++] = ' ';
|
||
}
|
||
}
|
||
else
|
||
{
|
||
first_space = true;
|
||
ret[count++] = *ptr;
|
||
}
|
||
|
||
++ptr;
|
||
}
|
||
|
||
// Trim space at the end
|
||
string::size_type a;
|
||
a = ret.find_last_not_of(' ', count);
|
||
if (a != string::npos)
|
||
ret.erase(a+1);
|
||
else
|
||
{
|
||
a = 0;
|
||
ret.erase(a);
|
||
}
|
||
|
||
return ret;
|
||
}
|
||
|
||
string strip_comments(const string &str) {
|
||
|
||
string ret;
|
||
ret.reserve(str.size());
|
||
|
||
const char *ptr = str.c_str();
|
||
const char *end = ptr + str.length();
|
||
|
||
bool inside_comment = false;
|
||
while(1) {
|
||
if(!inside_comment) {
|
||
if(ptr + 4 < end) {
|
||
if(*ptr == '<' && *(ptr+1) == '!' && *(ptr+2) =='-' && *(ptr + 3) == '-' && isspace(*(ptr + 4))) {
|
||
inside_comment = true;
|
||
}
|
||
}
|
||
} else {
|
||
if(ptr + 2 < end) {
|
||
if(*ptr == '-' && *(ptr+1) == '-' && *(ptr+2) == '>' ) {
|
||
inside_comment = false;
|
||
ptr += 3;
|
||
}
|
||
}
|
||
}
|
||
if(ptr == end) break;
|
||
if(!inside_comment) ret += *ptr;
|
||
ptr++;
|
||
}
|
||
|
||
ret.resize(ret.size());
|
||
|
||
return ret;
|
||
|
||
}
|
||
|
||
static struct {
|
||
const char *str;
|
||
unsigned char chr;
|
||
} entities[] = {
|
||
/* 00 */
|
||
{ "quot", 34 },
|
||
{ "amp", 38 },
|
||
{ "lt", 60 },
|
||
{ "gt", 62 },
|
||
{ "nbsp", ' ' },
|
||
{ "iexcl", 161 },
|
||
{ "cent", 162 },
|
||
{ "pound", 163 },
|
||
{ "curren", 164 },
|
||
{ "yen", 165 },
|
||
/* 10 */
|
||
{ "brvbar", 166 },
|
||
{ "sect", 167 },
|
||
{ "uml", 168 },
|
||
{ "copy", 169 },
|
||
{ "ordf", 170 },
|
||
{ "laquo", 171 },
|
||
{ "not", 172 },
|
||
{ "shy", 173 },
|
||
{ "reg", 174 },
|
||
{ "macr", 175 },
|
||
/* 20 */
|
||
{ "deg", 176 },
|
||
{ "plusmn", 177 },
|
||
{ "sup2", 178 },
|
||
{ "sup3", 179 },
|
||
{ "acute", 180 },
|
||
{ "micro", 181 },
|
||
{ "para", 182 },
|
||
{ "middot", 183 },
|
||
{ "cedil", 184 },
|
||
{ "sup1", 185 },
|
||
/* 30 */
|
||
{ "ordm", 186 },
|
||
{ "raquo", 187 },
|
||
{ "frac14", 188 },
|
||
{ "frac12", 189 },
|
||
{ "frac34", 190 },
|
||
{ "iquest", 191 },
|
||
{ "Agrave", 192 },
|
||
{ "Aacute", 193 },
|
||
{ "Acirc", 194 },
|
||
{ "Atilde", 195 },
|
||
/* 40 */
|
||
{ "Auml", 196 },
|
||
{ "ring", 197 },
|
||
{ "AElig", 198 },
|
||
{ "Ccedil", 199 },
|
||
{ "Egrave", 200 },
|
||
{ "Eacute", 201 },
|
||
{ "Ecirc", 202 },
|
||
{ "Euml", 203 },
|
||
{ "Igrave", 204 },
|
||
{ "Iacute", 205 },
|
||
/* 50 */
|
||
{ "Icirc", 206 },
|
||
{ "Iuml", 207 },
|
||
{ "ETH", 208 },
|
||
{ "Ntilde", 209 },
|
||
{ "Ograve", 210 },
|
||
{ "Oacute", 211 },
|
||
{ "Ocirc", 212 },
|
||
{ "Otilde", 213 },
|
||
{ "Ouml", 214 },
|
||
{ "times", 215 },
|
||
/* 60 */
|
||
{ "Oslash", 216 },
|
||
{ "Ugrave", 217 },
|
||
{ "Uacute", 218 },
|
||
{ "Ucirc", 219 },
|
||
{ "Uuml", 220 },
|
||
{ "Yacute", 221 },
|
||
{ "THORN", 222 },
|
||
{ "szlig", 223 },
|
||
{ "agrave", 224 },
|
||
{ "aacute", 225 },
|
||
/* 70 */
|
||
{ "acirc", 226 },
|
||
{ "atilde", 227 },
|
||
{ "auml", 228 },
|
||
{ "aring", 229 },
|
||
{ "aelig", 230 },
|
||
{ "ccedil", 231 },
|
||
{ "egrave", 232 },
|
||
{ "eacute", 233 },
|
||
{ "ecirc", 234 },
|
||
{ "euml", 235 },
|
||
/* 80 */
|
||
{ "igrave", 236 },
|
||
{ "iacute", 237 },
|
||
{ "icirc", 238 },
|
||
{ "iuml", 239 },
|
||
{ "ieth", 240 },
|
||
{ "ntilde", 241 },
|
||
{ "ograve", 242 },
|
||
{ "oacute", 243 },
|
||
{ "ocirc", 244 },
|
||
{ "otilde", 245 },
|
||
/* 90 */
|
||
{ "ouml", 246 },
|
||
{ "divide", 247 },
|
||
{ "oslash", 248 },
|
||
{ "ugrave", 249 },
|
||
{ "uacute", 250 },
|
||
{ "ucirc", 251 },
|
||
{ "uuml", 252 },
|
||
{ "yacute", 253 },
|
||
{ "thorn", 254 },
|
||
{ "yuml", 255 },
|
||
/* 100 */
|
||
{ NULL, 0 },
|
||
};
|
||
|
||
string decode_entities(const string &str)
|
||
{
|
||
unsigned int count = 0;
|
||
const char *ptr = str.c_str();
|
||
const char *end;
|
||
|
||
string ret(str);
|
||
string entity;
|
||
|
||
ptr = strchr(ptr, '&');
|
||
if (ptr == NULL) return ret;
|
||
|
||
count += static_cast<unsigned int>(ptr - str.c_str());
|
||
|
||
// printf("url_init: %s\n", str.c_str());
|
||
while (*ptr)
|
||
{
|
||
if (*ptr == '&' && ((end = strchr(ptr, ';')) != NULL))
|
||
{
|
||
entity.assign(ptr + 1, end);
|
||
// printf("Entity: %d %s\n", entity.length(), entity.c_str());
|
||
if (!entity.empty() && entity[0] == '#')
|
||
{
|
||
entity.erase(0, 1);
|
||
int chr = atoi(entity.c_str());
|
||
if (chr > 0 && chr <= UCHAR_MAX)
|
||
{
|
||
ret[count++] = chr;
|
||
}
|
||
ptr = end + 1;
|
||
}
|
||
else
|
||
{
|
||
bool found = false;
|
||
for (int i = 0; entities[i].str != NULL; i++)
|
||
{
|
||
if (entity == entities[i].str)
|
||
{
|
||
found = true;
|
||
ret[count++] = entities[i].chr;
|
||
ptr = end + 1;
|
||
break;
|
||
}
|
||
}
|
||
|
||
if (!found)
|
||
{
|
||
ret[count++] = *ptr++;
|
||
}
|
||
}
|
||
}
|
||
else
|
||
{
|
||
ret[count++] = *ptr++;
|
||
}
|
||
}
|
||
|
||
ret.erase(count);
|
||
|
||
// printf("url_end: %s\n", ret.c_str());
|
||
return ret;
|
||
}
|
||
|
||
string get_attribute(const string& tag, const string& attr) {
|
||
string val;
|
||
string low_tag(tag);
|
||
string low_attr(attr);
|
||
|
||
transform(low_attr.begin(), low_attr.end(), low_attr.begin(), ::tolower);
|
||
transform(low_tag.begin(), low_tag.end(), low_tag.begin(), ::tolower);
|
||
|
||
string::size_type a;
|
||
a = low_tag.find(low_attr);
|
||
if (a == string::npos)
|
||
return val;
|
||
|
||
a += attr.length();
|
||
while (a < tag.length() && isspace(tag[a])) a++;
|
||
if (a == tag.length() || tag[a] != '=')
|
||
return val;
|
||
a++;
|
||
while (a < tag.length() && isspace(tag[a])) a++;
|
||
if (a == tag.length())
|
||
return val;
|
||
|
||
if (tag[a] == '"') {
|
||
string::size_type b = tag.find('"', a+1);
|
||
if (b == string::npos) return val;
|
||
val = tag.substr(a+1, b-a-1);
|
||
} else if (tag[a] == '\'') {
|
||
string::size_type b = tag.find('\'', a+1);
|
||
if (b == string::npos) return val;
|
||
val = tag.substr(a+1, b-a-1);
|
||
} else {
|
||
while (a < tag.length() && !isspace(tag[a]) && tag[a] != '>') {
|
||
val += tag[a++];
|
||
}
|
||
}
|
||
|
||
return val;
|
||
}
|
||
|
||
string normalize_slashs(const string &url)
|
||
{
|
||
const int NONE = 0;
|
||
const int LASTSLASH = 1;
|
||
const int LASTDOTSLASH = 2;
|
||
const int LASTDOTDOTSLASH = 3;
|
||
int state = NONE;
|
||
const char *question_dash;
|
||
const char *question;
|
||
const char *dash;
|
||
unsigned int count = 0;
|
||
const char *ptr = url.c_str();
|
||
string ret(url);
|
||
|
||
question = strchr(ptr, '?');
|
||
dash = strchr(ptr, '#');
|
||
if (question &&(!dash || question < dash)) question_dash = question;
|
||
else question_dash = dash;
|
||
if (question_dash == 0) question_dash = url.c_str() + url.length();
|
||
|
||
const char *problem;
|
||
const char *problem1 = strstr(ptr, "//");
|
||
const char *problem2 = strstr(ptr, "/.");
|
||
|
||
if (problem1 && (!problem2 || problem1 < problem2)) problem = problem1;
|
||
else problem = problem2;
|
||
|
||
if (problem && problem < question_dash)
|
||
{
|
||
ptr = problem;
|
||
count = static_cast<unsigned int>(ptr - url.c_str());
|
||
while (*ptr && ptr < question_dash)
|
||
{
|
||
switch (state)
|
||
{
|
||
case LASTSLASH:
|
||
if (*ptr == '/')
|
||
{
|
||
++ptr;
|
||
state = LASTSLASH;
|
||
}
|
||
else if (*ptr == '.')
|
||
{
|
||
++ptr;
|
||
state = LASTDOTSLASH;
|
||
}
|
||
else
|
||
{
|
||
ret[count++] = *ptr;
|
||
++ptr;
|
||
state = NONE;
|
||
}
|
||
break;
|
||
case LASTDOTSLASH:
|
||
if (*ptr == '/')
|
||
{
|
||
++ptr;
|
||
state = LASTSLASH;
|
||
}
|
||
else if (*ptr == '.')
|
||
{
|
||
++ptr;
|
||
state = LASTDOTDOTSLASH;
|
||
}
|
||
else
|
||
{
|
||
ret[count++] = '.';
|
||
ret[count++] = *ptr;
|
||
++ptr;
|
||
state = NONE;
|
||
}
|
||
break;
|
||
case LASTDOTDOTSLASH:
|
||
if (*ptr == '/')
|
||
{
|
||
const char *last_slash = ret.c_str() + count - 2;
|
||
while (last_slash >= ret.c_str() && *last_slash != '/')
|
||
--last_slash;
|
||
if (last_slash >= ret.c_str())
|
||
count = static_cast<unsigned int>(last_slash - ret.c_str() + 1);
|
||
++ptr;
|
||
state = LASTSLASH;
|
||
}
|
||
else
|
||
{
|
||
ret[count++] = '.';
|
||
ret[count++] = '.';
|
||
ret[count++] = *ptr;
|
||
++ptr;
|
||
state = NONE;
|
||
}
|
||
break;
|
||
default:
|
||
if (*ptr == '/')
|
||
{
|
||
ret[count++] = *ptr;
|
||
++ptr;
|
||
state = LASTSLASH;
|
||
}
|
||
else
|
||
{
|
||
ret[count++] = *ptr;
|
||
++ptr;
|
||
state = NONE;
|
||
}
|
||
}
|
||
}
|
||
|
||
if (question_dash)
|
||
{
|
||
while (*ptr)
|
||
{
|
||
ret[count++] = *ptr;
|
||
++ptr;
|
||
}
|
||
}
|
||
|
||
ret.erase(count);
|
||
}
|
||
|
||
return ret;
|
||
}
|
||
|
||
string convert_link(const string& relative, const Uri& root)
|
||
{
|
||
string url(relative);
|
||
|
||
url = HTML::decode_entities(url);
|
||
|
||
string::size_type a;
|
||
a = 0;
|
||
while ((a = url.find_first_of(" \r\n", a)) != string::npos)
|
||
{
|
||
switch (url[a])
|
||
{
|
||
case ' ':
|
||
url.replace(a, 1, "%20");
|
||
break;
|
||
case '\r':
|
||
url.erase(a, 1);
|
||
break;
|
||
case '\n':
|
||
url.erase(a, 1);
|
||
break;
|
||
}
|
||
}
|
||
|
||
Uri uri;
|
||
try
|
||
{
|
||
Uri rel(url);
|
||
uri = rel.absolute(root);
|
||
uri.path(normalize_slashs(uri.path()));
|
||
}
|
||
catch (Uri::Exception)
|
||
{
|
||
return string();
|
||
}
|
||
|
||
return uri.unparse(Uri::REMOVE_FRAGMENT);
|
||
}
|
||
|
||
string __serialize_gml(const tree<HTML::Node> &tr, tree<HTML::Node>::iterator it, tree<HTML::Node>::iterator end, unsigned int parent_id, unsigned int& label) {
|
||
|
||
using namespace std;
|
||
ostrstream ret;
|
||
tree<HTML::Node>::sibling_iterator sib = tr.begin(it);
|
||
while(sib != tr.end(it)) {
|
||
ret << "node [ id " << ++label << "\n label \"" << label << "\"\n]\n";
|
||
ret << "edge [ \n source " << parent_id << "\n target " << label << "\n]" << endl;
|
||
ret << __serialize_gml(tr, sib, end, label, label);
|
||
++sib;
|
||
}
|
||
ret << ends;
|
||
string str = ret.str();
|
||
ret.freeze(0);
|
||
return str;
|
||
}
|
||
|
||
|
||
string serialize_gml(const tree<HTML::Node> &tr) {
|
||
|
||
using namespace std;
|
||
|
||
tree<HTML::Node>::pre_order_iterator it = tr.begin();
|
||
tree<HTML::Node>::pre_order_iterator end = tr.end();
|
||
|
||
string ret;
|
||
ret += "graph [";
|
||
ret += "directed 1\n";
|
||
ret += "node [ id 0\n label \"0\"\n ]\n";
|
||
unsigned int label = 0;
|
||
ret += __serialize_gml(tr, it, end, 0, label);
|
||
ret += "]";
|
||
return ret;
|
||
|
||
}
|
||
|
||
}//namespace html
|
||
}//namespace htmlcxx
|