reimu/htmlcxx/html/utils.cc
2018-07-10 13:54:56 +03:00

546 lines
11 KiB
C++
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#include <algorithm>
#include <cctype>
#include <cstring>
#include <sstream>
#include <strstream>
#include "Uri.h"
#include "utils.h"
using namespace std;
namespace htmlcxx {
namespace HTML {
// Heuristically decides whether the buffer [begin, begin + size) looks like
// UTF-8 encoded text.
//
// Returns true immediately when the buffer starts with the UTF-8 byte-order
// mark. Otherwise adjacent byte pairs are scored: a continuation byte
// (10xxxxxx) that follows a lead/high byte (11xxxxxx) counts as "good"; a
// continuation byte that follows a plain ASCII byte, or a lead byte that is
// not followed by a continuation byte, counts as "bad". The result is true
// when good pairs outnumber bad ones, so pure ASCII input yields false.
//
// begin  start of the buffer; it does not have to be NUL terminated.
// size   number of bytes available at begin; NULL, zero or negative input
//        yields false.
bool detect_utf8(const char *begin, int size)
{
    // The BOM is spelled out byte by byte so that it stays visible in the
    // source and cannot be lost by an editor that strips invisible characters.
    static const unsigned char utf8_bom[] = { 0xEF, 0xBB, 0xBF };

    if (begin == NULL || size <= 0) return false;

    // Only look at the signature when the buffer really holds three bytes;
    // memcmp (unlike strncmp) never depends on a terminating NUL.
    if (size >= 3 && memcmp(begin, utf8_bom, sizeof(utf8_bom)) == 0) return true;

    unsigned count_bad_utf = 0;
    unsigned count_good_utf = 0;
    unsigned char previous_byte = 0;
    const char *const end = begin + size;

    for (const char *ptr = begin; ptr != end; ++ptr)
    {
        const unsigned char current_byte = static_cast<unsigned char>(*ptr);
        const bool previous_is_lead = (previous_byte & 0xC0) == 0xC0;

        if ((current_byte & 0xC0) == 0x80)
        {
            // Continuation byte: fine after a lead byte, wrong after ASCII.
            // (A continuation after another continuation is not scored.)
            if (previous_is_lead)
                ++count_good_utf;
            else if ((previous_byte & 0x80) == 0x00)
                ++count_bad_utf;
        }
        else if (previous_is_lead)
        {
            // A lead byte must be followed by a continuation byte.
            ++count_bad_utf;
        }
        previous_byte = current_byte;
    }

    return count_good_utf > count_bad_utf;
}
// Collapses every run of whitespace in str into a single ' ' and removes
// leading and trailing whitespace.
//
// Whitespace is whatever isspace() accepts (tabs, newlines, ...); every run is
// replaced by one plain space. The input is read as a C string, so processing
// stops at the first NUL character.
//
// Returns the normalized copy; the result is empty when str contains only
// whitespace.
string single_blank(const string &str)
{
    string ret;
    ret.reserve(str.length());

    // Set when whitespace was seen after some output; the blank is emitted
    // lazily, right before the next visible character, so trailing whitespace
    // never reaches the result.
    bool pending_blank = false;

    for (const char *ptr = str.c_str(); *ptr; ++ptr)
    {
        // isspace() requires a value representable as unsigned char; passing a
        // negative char (any non-ASCII byte) is undefined behaviour.
        if (isspace(static_cast<unsigned char>(*ptr)))
        {
            pending_blank = !ret.empty();
            continue;
        }

        if (pending_blank)
        {
            ret += ' ';
            pending_blank = false;
        }
        ret += *ptr;
    }

    return ret;
}
// Returns a copy of str with HTML comments removed.
//
// A comment starts at "<!--" followed by a whitespace character and ends
// after the next "-->". Text such as "<!--x-->" (no whitespace after the
// opener) is therefore left untouched. An unterminated comment swallows the
// rest of the input.
string strip_comments(const string &str)
{
    string ret;
    ret.reserve(str.size());

    const char *ptr = str.c_str();
    const char *const end = ptr + str.length();
    bool inside_comment = false;

    while (ptr != end)
    {
        // Compare remaining lengths instead of forming ptr + k, which could
        // point past the end of the buffer.
        if (inside_comment)
        {
            if (end - ptr >= 3 && ptr[0] == '-' && ptr[1] == '-' && ptr[2] == '>')
            {
                inside_comment = false;
                ptr += 3;
            }
            else
            {
                ++ptr;
            }
            // Re-examine the new position from the top of the loop: the text
            // right after "-->" may itself open another comment.
            continue;
        }

        if (end - ptr >= 5
            && ptr[0] == '<' && ptr[1] == '!' && ptr[2] == '-' && ptr[3] == '-'
            && isspace(static_cast<unsigned char>(ptr[4])))
        {
            inside_comment = true;
            // Skip the opener itself; the whitespace that follows it cannot be
            // part of a "-->" terminator.
            ptr += 4;
            continue;
        }

        ret += *ptr++;
    }

    return ret;
}
// Named character entities understood by decode_entities(), each mapped to a
// single byte. Apart from "nbsp" the values are the ISO-8859-1 (Latin-1) code
// points; "nbsp" is mapped to an ordinary space (32), not to 160. The list is
// terminated by an entry whose name is NULL.
static const struct {
    const char *str;
    unsigned char chr;
} entities[] = {
    /* 00 */
    { "quot", 34 },
    { "amp", 38 },
    { "lt", 60 },
    { "gt", 62 },
    { "nbsp", ' ' },
    { "iexcl", 161 },
    { "cent", 162 },
    { "pound", 163 },
    { "curren", 164 },
    { "yen", 165 },
    /* 10 */
    { "brvbar", 166 },
    { "sect", 167 },
    { "uml", 168 },
    { "copy", 169 },
    { "ordf", 170 },
    { "laquo", 171 },
    { "not", 172 },
    { "shy", 173 },
    { "reg", 174 },
    { "macr", 175 },
    /* 20 */
    { "deg", 176 },
    { "plusmn", 177 },
    { "sup2", 178 },
    { "sup3", 179 },
    { "acute", 180 },
    { "micro", 181 },
    { "para", 182 },
    { "middot", 183 },
    { "cedil", 184 },
    { "sup1", 185 },
    /* 30 */
    { "ordm", 186 },
    { "raquo", 187 },
    { "frac14", 188 },
    { "frac12", 189 },
    { "frac34", 190 },
    { "iquest", 191 },
    { "Agrave", 192 },
    { "Aacute", 193 },
    { "Acirc", 194 },
    { "Atilde", 195 },
    /* 40 */
    { "Auml", 196 },
    { "Aring", 197 },
    { "AElig", 198 },
    { "Ccedil", 199 },
    { "Egrave", 200 },
    { "Eacute", 201 },
    { "Ecirc", 202 },
    { "Euml", 203 },
    { "Igrave", 204 },
    { "Iacute", 205 },
    /* 50 */
    { "Icirc", 206 },
    { "Iuml", 207 },
    { "ETH", 208 },
    { "Ntilde", 209 },
    { "Ograve", 210 },
    { "Oacute", 211 },
    { "Ocirc", 212 },
    { "Otilde", 213 },
    { "Ouml", 214 },
    { "times", 215 },
    /* 60 */
    { "Oslash", 216 },
    { "Ugrave", 217 },
    { "Uacute", 218 },
    { "Ucirc", 219 },
    { "Uuml", 220 },
    { "Yacute", 221 },
    { "THORN", 222 },
    { "szlig", 223 },
    { "agrave", 224 },
    { "aacute", 225 },
    /* 70 */
    { "acirc", 226 },
    { "atilde", 227 },
    { "auml", 228 },
    { "aring", 229 },
    { "aelig", 230 },
    { "ccedil", 231 },
    { "egrave", 232 },
    { "eacute", 233 },
    { "ecirc", 234 },
    { "euml", 235 },
    /* 80 */
    { "igrave", 236 },
    { "iacute", 237 },
    { "icirc", 238 },
    { "iuml", 239 },
    { "eth", 240 },
    { "ntilde", 241 },
    { "ograve", 242 },
    { "oacute", 243 },
    { "ocirc", 244 },
    { "otilde", 245 },
    /* 90 */
    { "ouml", 246 },
    { "divide", 247 },
    { "oslash", 248 },
    { "ugrave", 249 },
    { "uacute", 250 },
    { "ucirc", 251 },
    { "uuml", 252 },
    { "yacute", 253 },
    { "thorn", 254 },
    { "yuml", 255 },
    /* 100 */
    { NULL, 0 },
};

// Looks name up in the entities table (case sensitive).
// Returns the mapped byte value, or -1 when the name is unknown.
static int lookup_named_entity(const string &name)
{
    for (int i = 0; entities[i].str != NULL; ++i)
    {
        if (name == entities[i].str) return entities[i].chr;
    }
    return -1;
}

// Parses the payload of a numeric character reference, i.e. the text between
// '&' and ';' including the leading '#': "#65" (decimal) or "#x41" / "#X41"
// (hexadecimal).
// Returns the value when the payload is well formed and lies in
// 1..UCHAR_MAX, and -1 otherwise.
static int decode_numeric_reference(const string &entity)
{
    string::size_type pos = 1; // skip the '#'
    int base = 10;
    if (pos < entity.size() && (entity[pos] == 'x' || entity[pos] == 'X'))
    {
        base = 16;
        ++pos;
    }
    if (pos >= entity.size()) return -1; // "#" or "#x" without digits

    long value = 0;
    for (; pos < entity.size(); ++pos)
    {
        const unsigned char ch = static_cast<unsigned char>(entity[pos]);
        int digit;
        if (isdigit(ch))
            digit = ch - '0';
        else if (base == 16 && isxdigit(ch))
            digit = tolower(ch) - 'a' + 10;
        else
            return -1;

        value = value * base + digit;
        // Bail out as soon as the value leaves the single-byte range; this
        // also keeps arbitrarily long digit strings from overflowing.
        if (value > UCHAR_MAX) return -1;
    }
    return value > 0 ? static_cast<int>(value) : -1;
}

// Replaces character references in str by the single byte they stand for.
//
// Handled forms are the named entities listed in the entities table
// ("&amp;"), decimal references ("&#233;") and hexadecimal references
// ("&#xE9;"), as long as the resulting value fits into one byte (1..255).
// Anything else - unknown names, malformed or out-of-range numeric
// references, a '&' without a nearby ';' - is copied to the output unchanged.
//
// The input is read as a C string: if it contains a '&', processing stops at
// the first NUL character. A string without any '&' is returned as is.
string decode_entities(const string &str)
{
    // Upper bound for the text between '&' and ';'. Every table entry and every
    // decodable numeric reference is far shorter; bounding the search keeps a
    // stray '&' from rescanning the rest of the input each time.
    static const string::size_type max_entity_length = 32;

    const char *const begin = str.c_str();
    string ret(str);

    const char *ptr = strchr(begin, '&');
    if (ptr == NULL) return ret;

    const char *const stop = ptr + strlen(ptr);
    // ret already holds everything before the first '&'; decoding happens in
    // place, which is safe because the output never outgrows the input.
    string::size_type count = static_cast<string::size_type>(ptr - begin);
    string entity;

    while (ptr != stop)
    {
        const char *semicolon = NULL;
        int decoded = -1;

        if (*ptr == '&')
        {
            string::size_type window = static_cast<string::size_type>(stop - ptr - 1);
            if (window > max_entity_length + 1) window = max_entity_length + 1;
            semicolon = static_cast<const char *>(memchr(ptr + 1, ';', window));
        }

        if (semicolon != NULL)
        {
            entity.assign(ptr + 1, semicolon);
            if (!entity.empty() && entity[0] == '#')
                decoded = decode_numeric_reference(entity);
            else
                decoded = lookup_named_entity(entity);
        }

        if (decoded >= 0)
        {
            ret[count++] = static_cast<char>(decoded);
            ptr = semicolon + 1;
        }
        else
        {
            // Not a reference we can decode: keep the character as is.
            ret[count++] = *ptr++;
        }
    }

    ret.erase(count);
    return ret;
}
// Extracts the value of attribute attr from the text of an HTML tag.
//
// tag   tag text, e.g. "<a href=\"x.html\" class=y>".
// attr  attribute name; matched case-insensitively.
//
// A candidate only counts as the attribute name when it sits at the start of
// tag or directly after whitespace or a quote character, and when it is
// followed (after optional whitespace) by '='. Other occurrences - for
// example "src" inside "data-src", or the name appearing inside another
// attribute's value - are skipped and the search continues.
//
// The value may be enclosed in double quotes, single quotes, or be unquoted
// (then it ends at the first whitespace or '>'). Returns an empty string when
// the attribute is absent, has no value, or its opening quote is never closed.
//
// Note: this is a textual scan, not a full attribute parser; a quoted value
// that itself contains ` name=` can still be mistaken for the attribute.
string get_attribute(const string& tag, const string& attr)
{
    string val;
    if (attr.empty()) return val;

    // Lower-cased copies are used for searching only; values are taken from the
    // original tag so that their case is preserved. tolower()/isspace() need
    // unsigned char values, hence the casts.
    string low_tag(tag);
    string low_attr(attr);
    for (string::size_type i = 0; i < low_tag.length(); ++i)
        low_tag[i] = static_cast<char>(tolower(static_cast<unsigned char>(low_tag[i])));
    for (string::size_type i = 0; i < low_attr.length(); ++i)
        low_attr[i] = static_cast<char>(tolower(static_cast<unsigned char>(low_attr[i])));

    const string::size_type len = tag.length();
    string::size_type a = string::npos; // index just past the '=' of the match
    string::size_type from = 0;
    string::size_type hit;

    while ((hit = low_tag.find(low_attr, from)) != string::npos)
    {
        from = hit + 1;

        if (hit > 0)
        {
            const unsigned char before = static_cast<unsigned char>(tag[hit - 1]);
            if (!isspace(before) && before != '"' && before != '\'')
                continue; // tail of a longer name such as "data-src"
        }

        string::size_type after = hit + low_attr.length();
        while (after < len && isspace(static_cast<unsigned char>(tag[after]))) ++after;
        if (after < len && tag[after] == '=')
        {
            a = after + 1;
            break;
        }
    }

    if (a == string::npos) return val;

    while (a < len && isspace(static_cast<unsigned char>(tag[a]))) ++a;
    if (a == len) return val;

    if (tag[a] == '"' || tag[a] == '\'')
    {
        // Quoted value: everything up to the matching closing quote.
        const string::size_type b = tag.find(tag[a], a + 1);
        if (b == string::npos) return val;
        val = tag.substr(a + 1, b - a - 1);
    }
    else
    {
        // Unquoted value: runs until whitespace or the end of the tag.
        string::size_type b = a;
        while (b < len && !isspace(static_cast<unsigned char>(tag[b])) && tag[b] != '>') ++b;
        val = tag.substr(a, b - a);
    }
    return val;
}
// Given that buf[count - 1] is the '/' that ends the output written so far,
// removes the path segment in front of it. Returns the new output length, or
// count unchanged when there is no earlier '/' to fall back to.
static string::size_type drop_last_segment(const string &buf, string::size_type count)
{
    if (count < 2) return count;
    const string::size_type slash = buf.rfind('/', count - 2);
    return slash == string::npos ? count : slash + 1;
}

// Normalizes the path part of url: runs of '/' are collapsed into one, "/./"
// segments are removed and "/../" segments remove the segment before them.
// A trailing "/." is reduced to "/", a trailing "/.." removes the preceding
// segment as well (so "/a/b/.." becomes "/a/").
//
// Only the text in front of the first '?' or '#' is touched; the query and
// fragment are copied through unchanged. Note that every "//" in front of
// them is collapsed, so this is meant for paths, not for complete URLs that
// still carry "scheme://".
//
// Returns the normalized copy; when nothing needs fixing the input is
// returned unchanged.
string normalize_slashs(const string &url)
{
    enum State
    {
        NONE,           // last character was ordinary
        LASTSLASH,      // just saw "/"
        LASTDOTSLASH,   // just saw "/."  (the dot is not written yet)
        LASTDOTDOTSLASH // just saw "/.." (the dots are not written yet)
    };

    const char *const begin = url.c_str();
    string ret(url);

    // The path ends at the first '?' or '#', whichever comes first.
    const char *const question = strchr(begin, '?');
    const char *const dash = strchr(begin, '#');
    const char *path_end;
    if (question && (!dash || question < dash)) path_end = question;
    else path_end = dash;
    if (path_end == NULL) path_end = begin + url.length();

    // Everything before the first "//" or "/." is already normal.
    const char *const problem1 = strstr(begin, "//");
    const char *const problem2 = strstr(begin, "/.");
    const char *problem;
    if (problem1 && (!problem2 || problem1 < problem2)) problem = problem1;
    else problem = problem2;
    if (problem == NULL || problem >= path_end) return ret;

    // ret is rewritten in place from the first problem on; the output can
    // never get ahead of the input because pending dots are only written
    // after the corresponding input characters have been consumed.
    const char *ptr = problem;
    string::size_type count = static_cast<string::size_type>(ptr - begin);
    State state = NONE;

    while (*ptr && ptr < path_end)
    {
        const char ch = *ptr++;
        switch (state)
        {
        case LASTSLASH:
            if (ch == '/')
            {
                // "//": drop the duplicate slash.
            }
            else if (ch == '.')
            {
                state = LASTDOTSLASH;
            }
            else
            {
                ret[count++] = ch;
                state = NONE;
            }
            break;

        case LASTDOTSLASH:
            if (ch == '/')
            {
                // "/./": the segment refers to the current directory.
                state = LASTSLASH;
            }
            else if (ch == '.')
            {
                state = LASTDOTDOTSLASH;
            }
            else
            {
                // An ordinary name that merely starts with a dot.
                ret[count++] = '.';
                ret[count++] = ch;
                state = NONE;
            }
            break;

        case LASTDOTDOTSLASH:
            if (ch == '/')
            {
                // "/../": step back over the previous segment.
                count = drop_last_segment(ret, count);
                state = LASTSLASH;
            }
            else
            {
                // An ordinary name that merely starts with two dots.
                ret[count++] = '.';
                ret[count++] = '.';
                ret[count++] = ch;
                state = NONE;
            }
            break;

        default:
            ret[count++] = ch;
            state = (ch == '/') ? LASTSLASH : NONE;
            break;
        }
    }

    // The path ended in "/..": it still has to remove its parent segment.
    // (A pending single dot from a trailing "/." is simply dropped.)
    if (state == LASTDOTDOTSLASH) count = drop_last_segment(ret, count);

    // Copy query string / fragment verbatim.
    while (*ptr) ret[count++] = *ptr++;

    ret.erase(count);
    return ret;
}
string convert_link(const string& relative, const Uri& root)
{
string url(relative);
url = HTML::decode_entities(url);
string::size_type a;
a = 0;
while ((a = url.find_first_of(" \r\n", a)) != string::npos)
{
switch (url[a])
{
case ' ':
url.replace(a, 1, "%20");
break;
case '\r':
url.erase(a, 1);
break;
case '\n':
url.erase(a, 1);
break;
}
}
Uri uri;
try
{
Uri rel(url);
uri = rel.absolute(root);
uri.path(normalize_slashs(uri.path()));
}
catch (Uri::Exception)
{
return string();
}
return uri.unparse(Uri::REMOVE_FRAGMENT);
}
// Recursive worker of serialize_gml(): emits GML "node" and "edge" records
// for every child of it, depth first.
//
// tr         tree being serialized.
// it         node whose children are written.
// end        passed on unchanged to the recursive calls.
// parent_id  GML id already assigned to it; used as the edge source.
// label      running id counter shared by the whole traversal; it is
//            incremented once for every node written.
//
// Returns the GML text for the complete subtree below it (empty when it has
// no children).
string __serialize_gml(const tree<HTML::Node> &tr, tree<HTML::Node>::iterator it, tree<HTML::Node>::iterator end, unsigned int parent_id, unsigned int& label) {
    std::ostringstream ret;
    for (tree<HTML::Node>::sibling_iterator sib = tr.begin(it); sib != tr.end(it); ++sib) {
        // Take the next id in a statement of its own: modifying and reading
        // label inside one chained << expression has no defined evaluation
        // order before C++17.
        ++label;
        const unsigned int id = label;
        ret << "node [ id " << id << "\n label \"" << id << "\"\n]\n";
        ret << "edge [ \n source " << parent_id << "\n target " << id << "\n]\n";
        // The children of sib hang below the node that was just written.
        ret << __serialize_gml(tr, sib, end, id, label);
    }
    return ret.str();
}
string serialize_gml(const tree<HTML::Node> &tr) {
using namespace std;
tree<HTML::Node>::pre_order_iterator it = tr.begin();
tree<HTML::Node>::pre_order_iterator end = tr.end();
string ret;
ret += "graph [";
ret += "directed 1\n";
ret += "node [ id 0\n label \"0\"\n ]\n";
unsigned int label = 0;
ret += __serialize_gml(tr, it, end, 0, label);
ret += "]";
return ret;
}
}//namespace html
}//namespace htmlcxx