406 lines
8.3 KiB
C++
406 lines
8.3 KiB
C++
#include <cctype>
|
|
#include <cstring>
|
|
/*#if !defined(WIN32) || defined(__MINGW32__)
|
|
#include <strings.h>
|
|
#endif*/
|
|
|
|
//#define DEBUG
|
|
//#include "debug.h"
|
|
|
|
// One entry of the literal-mode element table.
//   len      - number of characters in str
//   str      - lower-case element name
//   is_cdata - flag stored with each entry (1 for script/style/xmp/plaintext,
//              0 for textarea)
struct literal_tag
{
    int len;
    const char* str;
    int is_cdata;
};

// Elements whose body is treated as literal text until the matching end tag.
// The table is terminated by an all-zero sentinel entry, so it can be walked
// with `for (i = 0; literal_mode_elem[i].len; ++i)`.
static literal_tag literal_mode_elem[] =
{
    { 6, "script",    1 },
    { 5, "style",     1 },
    { 3, "xmp",       1 },
    { 9, "plaintext", 1 },
    { 8, "textarea",  0 },
    { 0, 0,           0 }
};
|
|
|
|
template <typename _Iterator>
|
|
void htmlcxx::HTML::ParserSax::parse(_Iterator begin, _Iterator end)
|
|
{
|
|
// std::cerr << "Parsing iterator" << std::endl;
|
|
parse(begin, end, typename std::iterator_traits<_Iterator>::iterator_category());
|
|
}
|
|
|
|
template <typename _Iterator>
|
|
void htmlcxx::HTML::ParserSax::parse(_Iterator &begin, _Iterator &end, std::forward_iterator_tag)
|
|
{
|
|
typedef _Iterator iterator;
|
|
// std::cerr << "Parsing forward_iterator" << std::endl;
|
|
mCdata = false;
|
|
mpLiteral = 0;
|
|
mCurrentOffset = 0;
|
|
this->beginParsing();
|
|
|
|
// DEBUGP("Parsed text\n");
|
|
|
|
while (begin != end)
|
|
{
|
|
*begin; // This is for the multi_pass to release the buffer
|
|
iterator c(begin);
|
|
|
|
while (c != end)
|
|
{
|
|
// For some tags, the text inside it is considered literal and is
|
|
// only closed for its </TAG> counterpart
|
|
while (mpLiteral)
|
|
{
|
|
// DEBUGP("Treating literal %s\n", mpLiteral);
|
|
while (c != end && *c != '<') ++c;
|
|
|
|
if (c == end) {
|
|
if (c != begin) this->parseContent(begin, c);
|
|
goto DONE;
|
|
}
|
|
|
|
iterator end_text(c);
|
|
++c;
|
|
|
|
if (*c == '/')
|
|
{
|
|
++c;
|
|
const char *l = mpLiteral;
|
|
while (*l && ::tolower(*c) == *l)
|
|
{
|
|
++c;
|
|
++l;
|
|
}
|
|
|
|
// FIXME: Mozilla stops when it sees a /plaintext. Check
|
|
// other browsers and decide what to do
|
|
if (!*l && strcmp(mpLiteral, "plaintext"))
|
|
{
|
|
// matched all and is not tag plaintext
|
|
while (isspace(*c)) ++c;
|
|
|
|
if (*c == '>')
|
|
{
|
|
++c;
|
|
if (begin != end_text)
|
|
this->parseContent(begin, end_text);
|
|
mpLiteral = 0;
|
|
c = end_text;
|
|
begin = c;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
else if (*c == '!')
|
|
{
|
|
// we may find a comment and we should support it
|
|
iterator e(c);
|
|
++e;
|
|
|
|
if (e != end && *e == '-' && ++e != end && *e == '-')
|
|
{
|
|
// DEBUGP("Parsing comment\n");
|
|
++e;
|
|
c = this->skipHtmlComment(e, end);
|
|
}
|
|
|
|
//if (begin != end_text)
|
|
//this->parseContent(begin, end_text, end);
|
|
|
|
//this->parseComment(end_text, c, end);
|
|
|
|
// continue from the end of the comment
|
|
//begin = c;
|
|
}
|
|
}
|
|
|
|
if (*c == '<')
|
|
{
|
|
iterator d(c);
|
|
++d;
|
|
if (d != end)
|
|
{
|
|
if (isalpha(*d))
|
|
{
|
|
// beginning of tag
|
|
if (begin != c)
|
|
this->parseContent(begin, c);
|
|
|
|
// DEBUGP("Parsing beginning of tag\n");
|
|
d = this->skipHtmlTag(d, end);
|
|
this->parseHtmlTag(c, d);
|
|
|
|
// continue from the end of the tag
|
|
c = d;
|
|
begin = c;
|
|
break;
|
|
}
|
|
|
|
if (*d == '/')
|
|
{
|
|
if (begin != c)
|
|
this->parseContent(begin, c);
|
|
|
|
iterator e(d);
|
|
++e;
|
|
if (e != end && isalpha(*e))
|
|
{
|
|
// end of tag
|
|
// DEBUGP("Parsing end of tag\n");
|
|
d = this->skipHtmlTag(d, end);
|
|
this->parseHtmlTag(c, d);
|
|
}
|
|
else
|
|
{
|
|
// not a conforming end of tag, treat as comment
|
|
// as Mozilla does
|
|
// DEBUGP("Non conforming end of tag\n");
|
|
d = this->skipHtmlTag(d, end);
|
|
this->parseComment(c, d);
|
|
}
|
|
|
|
// continue from the end of the tag
|
|
c = d;
|
|
begin = c;
|
|
break;
|
|
}
|
|
|
|
if (*d == '!')
|
|
{
|
|
// comment
|
|
if (begin != c)
|
|
this->parseContent(begin, c);
|
|
|
|
iterator e(d);
|
|
++e;
|
|
|
|
if (e != end && *e == '-' && ++e != end && *e == '-')
|
|
{
|
|
// DEBUGP("Parsing comment\n");
|
|
++e;
|
|
d = this->skipHtmlComment(e, end);
|
|
}
|
|
else
|
|
{
|
|
d = this->skipHtmlTag(d, end);
|
|
}
|
|
|
|
this->parseComment(c, d);
|
|
|
|
// continue from the end of the comment
|
|
c = d;
|
|
begin = c;
|
|
break;
|
|
}
|
|
|
|
if (*d == '?' || *d == '%')
|
|
{
|
|
// something like <?xml or <%VBSCRIPT
|
|
if (begin != c)
|
|
this->parseContent(begin, c);
|
|
|
|
d = this->skipHtmlTag(d, end);
|
|
|
|
this->parseComment(c, d);
|
|
|
|
// continue from the end of the comment
|
|
c = d;
|
|
begin = c;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
c++;
|
|
}
|
|
|
|
// There may be some text in the end of the document
|
|
if (begin != c)
|
|
{
|
|
this->parseContent(begin, c);
|
|
begin = c;
|
|
}
|
|
}
|
|
|
|
DONE:
|
|
this->endParsing();
|
|
return;
|
|
}
|
|
|
|
template <typename _Iterator>
|
|
void htmlcxx::HTML::ParserSax::parseComment(_Iterator b, _Iterator c)
|
|
{
|
|
// DEBUGP("Creating comment node %s\n", std::string(b, c).c_str());
|
|
htmlcxx::HTML::Node com_node;
|
|
//FIXME: set_tagname shouldn't be needed, but first I must check
|
|
//legacy code
|
|
std::string comment(b, c);
|
|
com_node.tagName(comment);
|
|
com_node.text(comment);
|
|
com_node.offset(mCurrentOffset);
|
|
com_node.length((unsigned int)comment.length());
|
|
com_node.isTag(false);
|
|
com_node.isComment(true);
|
|
|
|
mCurrentOffset += com_node.length();
|
|
|
|
// Call callback method
|
|
this->foundComment(com_node);
|
|
}
|
|
|
|
template <typename _Iterator>
|
|
void htmlcxx::HTML::ParserSax::parseContent(_Iterator b, _Iterator c)
|
|
{
|
|
// DEBUGP("Creating text node %s\n", (std::string(b, c)).c_str());
|
|
htmlcxx::HTML::Node txt_node;
|
|
//FIXME: set_tagname shouldn't be needed, but first I must check
|
|
//legacy code
|
|
std::string text(b, c);
|
|
txt_node.tagName(text);
|
|
txt_node.text(text);
|
|
txt_node.offset(mCurrentOffset);
|
|
txt_node.length((unsigned int)text.length());
|
|
txt_node.isTag(false);
|
|
txt_node.isComment(false);
|
|
|
|
mCurrentOffset += txt_node.length();
|
|
|
|
// Call callback method
|
|
this->foundText(txt_node);
|
|
}
|
|
|
|
template <typename _Iterator>
|
|
void htmlcxx::HTML::ParserSax::parseHtmlTag(_Iterator b, _Iterator c)
|
|
{
|
|
_Iterator name_begin(b);
|
|
++name_begin;
|
|
bool is_end_tag = (*name_begin == '/');
|
|
if (is_end_tag) ++name_begin;
|
|
|
|
_Iterator name_end(name_begin);
|
|
while (name_end != c && isalnum(*name_end))
|
|
{
|
|
++name_end;
|
|
}
|
|
|
|
std::string name(name_begin, name_end);
|
|
// DEBUGP("Found %s tag %s\n", is_end_tag ? "closing" : "opening", name.c_str());
|
|
|
|
if (!is_end_tag)
|
|
{
|
|
std::string::size_type tag_len = name.length();
|
|
for (int i = 0; literal_mode_elem[i].len; ++i)
|
|
{
|
|
if (tag_len == literal_mode_elem[i].len)
|
|
{
|
|
#if defined(WIN32) && !defined(__MINGW32__)
|
|
if (!_stricmp(name.c_str(), literal_mode_elem[i].str))
|
|
#else
|
|
if (!strcasecmp(name.c_str(), literal_mode_elem[i].str))
|
|
#endif
|
|
{
|
|
mpLiteral = literal_mode_elem[i].str;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
htmlcxx::HTML::Node tag_node;
|
|
//by now, length is just the size of the tag
|
|
std::string text(b, c);
|
|
tag_node.length(static_cast<unsigned int>(text.length()));
|
|
tag_node.tagName(name);
|
|
tag_node.text(text);
|
|
tag_node.offset(mCurrentOffset);
|
|
tag_node.isTag(true);
|
|
tag_node.isComment(false);
|
|
|
|
mCurrentOffset += tag_node.length();
|
|
|
|
this->foundTag(tag_node, is_end_tag);
|
|
}
|
|
|
|
template <typename _Iterator>
|
|
_Iterator
|
|
htmlcxx::HTML::ParserSax::skipHtmlComment(_Iterator c, _Iterator end)
|
|
{
|
|
while ( c != end ) {
|
|
if (*c++ == '-' && c != end && *c == '-')
|
|
{
|
|
_Iterator d(c);
|
|
while (++c != end && isspace(*c));
|
|
if (c == end || *c++ == '>') break;
|
|
c = d;
|
|
}
|
|
}
|
|
|
|
return c;
|
|
}
|
|
|
|
namespace htmlcxx { namespace HTML {

// Returns the position of the first occurrence of `quote` in [c, end), or
// `end` when there is none. Generic version: linear scan over any iterator.
template <typename _Iterator>
static inline
_Iterator find_next_quote(_Iterator c, _Iterator end, char quote)
{
    while (c != end && *c != quote) ++c;
    return c;
}

// Specialization for raw character pointers: same contract as the generic
// version, implemented with memchr.
template <>
inline
const char *find_next_quote(const char *c, const char *end, char quote)
{
    // Nothing to search; also keeps a possibly null pointer away from memchr.
    if (c == end) return end;

    const char *d = static_cast<const char*>(
        memchr(c, quote, static_cast<std::size_t>(end - c)));

    return d ? d : end;
}

}}
|
|
|
|
template <typename _Iterator>
|
|
_Iterator htmlcxx::HTML::ParserSax::skipHtmlTag(_Iterator c, _Iterator end)
|
|
{
|
|
while (c != end && *c != '>')
|
|
{
|
|
if (*c != '=')
|
|
{
|
|
++c;
|
|
}
|
|
else
|
|
{ // found an attribute
|
|
++c;
|
|
while (c != end && isspace(*c)) ++c;
|
|
|
|
if (c == end) break;
|
|
|
|
if (*c == '\"' || *c == '\'')
|
|
{
|
|
_Iterator save(c);
|
|
char quote = *c++;
|
|
c = find_next_quote(c, end, quote);
|
|
// while (c != end && *c != quote) ++c;
|
|
// c = static_cast<char*>(memchr(c, quote, end - c));
|
|
if (c != end)
|
|
{
|
|
++c;
|
|
}
|
|
else
|
|
{
|
|
c = save;
|
|
++c;
|
|
}
|
|
// DEBUGP("Quotes: %s\n", std::string(save, c).c_str());
|
|
}
|
|
}
|
|
}
|
|
|
|
if (c != end) ++c;
|
|
|
|
return c;
|
|
}
|