#include #include /*#if !defined(WIN32) || defined(__MINGW32__) #include #endif*/ //#define DEBUG //#include "debug.h" static struct literal_tag { int len; const char* str; int is_cdata; } literal_mode_elem[] = { {6, "script", 1}, {5, "style", 1}, {3, "xmp", 1}, {9, "plaintext", 1}, {8, "textarea", 0}, {0, 0, 0} }; template void htmlcxx::HTML::ParserSax::parse(_Iterator begin, _Iterator end) { // std::cerr << "Parsing iterator" << std::endl; parse(begin, end, typename std::iterator_traits<_Iterator>::iterator_category()); } template void htmlcxx::HTML::ParserSax::parse(_Iterator &begin, _Iterator &end, std::forward_iterator_tag) { typedef _Iterator iterator; // std::cerr << "Parsing forward_iterator" << std::endl; mCdata = false; mpLiteral = 0; mCurrentOffset = 0; this->beginParsing(); // DEBUGP("Parsed text\n"); while (begin != end) { *begin; // This is for the multi_pass to release the buffer iterator c(begin); while (c != end) { // For some tags, the text inside it is considered literal and is // only closed for its counterpart while (mpLiteral) { // DEBUGP("Treating literal %s\n", mpLiteral); while (c != end && *c != '<') ++c; if (c == end) { if (c != begin) this->parseContent(begin, c); goto DONE; } iterator end_text(c); ++c; if (*c == '/') { ++c; const char *l = mpLiteral; while (*l && ::tolower(*c) == *l) { ++c; ++l; } // FIXME: Mozilla stops when it sees a /plaintext. Check // other browsers and decide what to do if (!*l && strcmp(mpLiteral, "plaintext")) { // matched all and is not tag plaintext while (isspace(*c)) ++c; if (*c == '>') { ++c; if (begin != end_text) this->parseContent(begin, end_text); mpLiteral = 0; c = end_text; begin = c; break; } } } else if (*c == '!') { // we may find a comment and we should support it iterator e(c); ++e; if (e != end && *e == '-' && ++e != end && *e == '-') { // DEBUGP("Parsing comment\n"); ++e; c = this->skipHtmlComment(e, end); } //if (begin != end_text) //this->parseContent(begin, end_text, end); //this->parseComment(end_text, c, end); // continue from the end of the comment //begin = c; } } if (*c == '<') { iterator d(c); ++d; if (d != end) { if (isalpha(*d)) { // beginning of tag if (begin != c) this->parseContent(begin, c); // DEBUGP("Parsing beginning of tag\n"); d = this->skipHtmlTag(d, end); this->parseHtmlTag(c, d); // continue from the end of the tag c = d; begin = c; break; } if (*d == '/') { if (begin != c) this->parseContent(begin, c); iterator e(d); ++e; if (e != end && isalpha(*e)) { // end of tag // DEBUGP("Parsing end of tag\n"); d = this->skipHtmlTag(d, end); this->parseHtmlTag(c, d); } else { // not a conforming end of tag, treat as comment // as Mozilla does // DEBUGP("Non conforming end of tag\n"); d = this->skipHtmlTag(d, end); this->parseComment(c, d); } // continue from the end of the tag c = d; begin = c; break; } if (*d == '!') { // comment if (begin != c) this->parseContent(begin, c); iterator e(d); ++e; if (e != end && *e == '-' && ++e != end && *e == '-') { // DEBUGP("Parsing comment\n"); ++e; d = this->skipHtmlComment(e, end); } else { d = this->skipHtmlTag(d, end); } this->parseComment(c, d); // continue from the end of the comment c = d; begin = c; break; } if (*d == '?' || *d == '%') { // something like parseContent(begin, c); d = this->skipHtmlTag(d, end); this->parseComment(c, d); // continue from the end of the comment c = d; begin = c; break; } } } c++; } // There may be some text in the end of the document if (begin != c) { this->parseContent(begin, c); begin = c; } } DONE: this->endParsing(); return; } template void htmlcxx::HTML::ParserSax::parseComment(_Iterator b, _Iterator c) { // DEBUGP("Creating comment node %s\n", std::string(b, c).c_str()); htmlcxx::HTML::Node com_node; //FIXME: set_tagname shouldn't be needed, but first I must check //legacy code std::string comment(b, c); com_node.tagName(comment); com_node.text(comment); com_node.offset(mCurrentOffset); com_node.length((unsigned int)comment.length()); com_node.isTag(false); com_node.isComment(true); mCurrentOffset += com_node.length(); // Call callback method this->foundComment(com_node); } template void htmlcxx::HTML::ParserSax::parseContent(_Iterator b, _Iterator c) { // DEBUGP("Creating text node %s\n", (std::string(b, c)).c_str()); htmlcxx::HTML::Node txt_node; //FIXME: set_tagname shouldn't be needed, but first I must check //legacy code std::string text(b, c); txt_node.tagName(text); txt_node.text(text); txt_node.offset(mCurrentOffset); txt_node.length((unsigned int)text.length()); txt_node.isTag(false); txt_node.isComment(false); mCurrentOffset += txt_node.length(); // Call callback method this->foundText(txt_node); } template void htmlcxx::HTML::ParserSax::parseHtmlTag(_Iterator b, _Iterator c) { _Iterator name_begin(b); ++name_begin; bool is_end_tag = (*name_begin == '/'); if (is_end_tag) ++name_begin; _Iterator name_end(name_begin); while (name_end != c && isalnum(*name_end)) { ++name_end; } std::string name(name_begin, name_end); // DEBUGP("Found %s tag %s\n", is_end_tag ? "closing" : "opening", name.c_str()); if (!is_end_tag) { std::string::size_type tag_len = name.length(); for (int i = 0; literal_mode_elem[i].len; ++i) { if (tag_len == literal_mode_elem[i].len) { #if defined(WIN32) && !defined(__MINGW32__) if (!_stricmp(name.c_str(), literal_mode_elem[i].str)) #else if (!strcasecmp(name.c_str(), literal_mode_elem[i].str)) #endif { mpLiteral = literal_mode_elem[i].str; break; } } } } htmlcxx::HTML::Node tag_node; //by now, length is just the size of the tag std::string text(b, c); tag_node.length(static_cast(text.length())); tag_node.tagName(name); tag_node.text(text); tag_node.offset(mCurrentOffset); tag_node.isTag(true); tag_node.isComment(false); mCurrentOffset += tag_node.length(); this->foundTag(tag_node, is_end_tag); } template _Iterator htmlcxx::HTML::ParserSax::skipHtmlComment(_Iterator c, _Iterator end) { while ( c != end ) { if (*c++ == '-' && c != end && *c == '-') { _Iterator d(c); while (++c != end && isspace(*c)); if (c == end || *c++ == '>') break; c = d; } } return c; } namespace htmlcxx { namespace HTML { template static inline _Iterator find_next_quote(_Iterator c, _Iterator end, char quote) { // std::cerr << "generic find" << std::endl; while (c != end && *c != quote) ++c; return c; } template <> inline const char *find_next_quote(const char *c, const char *end, char quote) { // std::cerr << "fast find" << std::endl; const char *d = reinterpret_cast(memchr(c, quote, end - c)); if (d) return d; else return end; } }} template _Iterator htmlcxx::HTML::ParserSax::skipHtmlTag(_Iterator c, _Iterator end) { while (c != end && *c != '>') { if (*c != '=') { ++c; } else { // found an attribute ++c; while (c != end && isspace(*c)) ++c; if (c == end) break; if (*c == '\"' || *c == '\'') { _Iterator save(c); char quote = *c++; c = find_next_quote(c, end, quote); // while (c != end && *c != quote) ++c; // c = static_cast(memchr(c, quote, end - c)); if (c != end) { ++c; } else { c = save; ++c; } // DEBUGP("Quotes: %s\n", std::string(save, c).c_str()); } } } if (c != end) ++c; return c; }