reimu/htmlcxx/html/ParserSax.tcc

#include <cctype>
#include <cstring>
/*#if !defined(WIN32) || defined(__MINGW32__)
#include <strings.h>
#endif*/

//#define DEBUG
//#include "debug.h"

static
struct literal_tag {
	int len;
	const char* str;
	int is_cdata;
}
literal_mode_elem[] =
{
	{6, "script", 1},
	{5, "style", 1},
	{3, "xmp", 1},
	{9, "plaintext", 1},
	{8, "textarea", 0},
	{0, 0, 0}
};

template <typename _Iterator>
void htmlcxx::HTML::ParserSax::parse(_Iterator begin, _Iterator end)
{
//	std::cerr << "Parsing iterator" << std::endl;
	parse(begin, end, typename std::iterator_traits<_Iterator>::iterator_category());
}

template <typename _Iterator>
void htmlcxx::HTML::ParserSax::parse(_Iterator &begin, _Iterator &end, std::forward_iterator_tag)
{
	typedef _Iterator iterator;
//	std::cerr << "Parsing forward_iterator" << std::endl;
	mCdata = false;
	mpLiteral = 0;
	mCurrentOffset = 0;
	this->beginParsing();

//	DEBUGP("Parsed text\n");

	while (begin != end)
	{
		*begin; // This is for the multi_pass to release the buffer
		iterator c(begin);

		while (c != end)
		{
			// For some tags, the text inside it is considered literal and is
			// only closed for its </TAG> counterpart
			while (mpLiteral)
			{
//				DEBUGP("Treating literal %s\n", mpLiteral);
				while (c != end && *c != '<') ++c;

				if (c == end) {
					if (c != begin) this->parseContent(begin, c);
					goto DONE;
				}

				iterator end_text(c);
				++c;

				if (*c == '/')
				{
					++c;
					const char *l = mpLiteral;
					while (*l && ::tolower(*c) == *l)
					{
						++c;
						++l;
					}

					// FIXME: Mozilla stops when it sees a /plaintext. Check
					// other browsers and decide what to do
					if (!*l && strcmp(mpLiteral, "plaintext"))
					{
						// matched all and is not tag plaintext
						while (isspace(*c)) ++c;

						if (*c == '>')
						{
							++c;
							if (begin != end_text)
								this->parseContent(begin, end_text);
							mpLiteral = 0;
							c = end_text;
							begin = c;
							break;
						}
					}
				}
				else if (*c == '!')
				{
					// we may find a comment and we should support it
					iterator e(c);
					++e;

					if (e != end && *e == '-' && ++e != end && *e == '-')
					{
//						DEBUGP("Parsing comment\n");
						++e;
						c = this->skipHtmlComment(e, end);
					}

					//if (begin != end_text)
					//this->parseContent(begin, end_text, end);

					//this->parseComment(end_text, c, end);

					// continue from the end of the comment
					//begin = c;
				}
			}

			if (*c == '<')
			{
				iterator d(c);
				++d;
				if (d != end)
				{
					if (isalpha(*d))
					{
						// beginning of tag
						if (begin != c)
							this->parseContent(begin, c);

//						DEBUGP("Parsing beginning of tag\n");
						d = this->skipHtmlTag(d, end);
						this->parseHtmlTag(c, d);

						// continue from the end of the tag
						c = d;
						begin = c;
						break;
					}

					if (*d == '/')
					{
						if (begin != c)
							this->parseContent(begin, c);

						iterator e(d);
						++e;
						if (e != end && isalpha(*e))
						{
							// end of tag
//							DEBUGP("Parsing end of tag\n");
							d = this->skipHtmlTag(d, end);
							this->parseHtmlTag(c, d);
						}
						else
						{
							// not a conforming end of tag, treat as comment
							// as Mozilla does
//							DEBUGP("Non conforming end of tag\n");
							d = this->skipHtmlTag(d, end);
							this->parseComment(c, d);
						}

						// continue from the end of the tag
						c = d;
						begin = c;
						break;
					}

					if (*d == '!')
					{
						// comment
						if (begin != c)
							this->parseContent(begin, c);

						iterator e(d);
						++e;

						if (e != end && *e == '-' && ++e != end && *e == '-')
						{
//							DEBUGP("Parsing comment\n");
							++e;
							d = this->skipHtmlComment(e, end);
						}
						else
						{
							d = this->skipHtmlTag(d, end);
						}

						this->parseComment(c, d);

						// continue from the end of the comment
						c = d;
						begin = c;
						break;
					}

					if (*d == '?' || *d == '%')
					{
						// something like <?xml or <%VBSCRIPT
						if (begin != c)
							this->parseContent(begin, c);

						d = this->skipHtmlTag(d, end);

						this->parseComment(c, d);

						// continue from the end of the comment
						c = d;
						begin = c;
						break;
					}
				}
			}
			c++;
		}

		// There may be some text in the end of the document
		if (begin != c)
		{
			this->parseContent(begin, c);
			begin = c;
		}
	}

DONE:
	this->endParsing();
	return;
}

template <typename _Iterator>
void htmlcxx::HTML::ParserSax::parseComment(_Iterator b, _Iterator c)
{
//	DEBUGP("Creating comment node %s\n", std::string(b, c).c_str());
	htmlcxx::HTML::Node com_node;
	//FIXME: set_tagname shouldn't be needed, but first I must check
	//legacy code
	std::string comment(b, c);
	com_node.tagName(comment);
	com_node.text(comment);
	com_node.offset(mCurrentOffset);
	com_node.length((unsigned int)comment.length());
	com_node.isTag(false);
	com_node.isComment(true);

	mCurrentOffset += com_node.length();

	// Call callback method
	this->foundComment(com_node);
}

template <typename _Iterator>
void htmlcxx::HTML::ParserSax::parseContent(_Iterator b, _Iterator c)
{
//	DEBUGP("Creating text node %s\n", (std::string(b, c)).c_str());
	htmlcxx::HTML::Node txt_node;
	//FIXME: set_tagname shouldn't be needed, but first I must check
	//legacy code
	std::string text(b, c);
	txt_node.tagName(text);
	txt_node.text(text);
	txt_node.offset(mCurrentOffset);
	txt_node.length((unsigned int)text.length());
	txt_node.isTag(false);
	txt_node.isComment(false);

	mCurrentOffset += txt_node.length();

	// Call callback method
	this->foundText(txt_node);
}

template <typename _Iterator>
void htmlcxx::HTML::ParserSax::parseHtmlTag(_Iterator b, _Iterator c)
{
	_Iterator name_begin(b);
	++name_begin;
	bool is_end_tag = (*name_begin == '/');
	if (is_end_tag) ++name_begin;

	_Iterator name_end(name_begin);
	while (name_end != c && isalnum(*name_end))
	{
		++name_end;
	}

	std::string name(name_begin, name_end);
//	DEBUGP("Found %s tag %s\n", is_end_tag ? "closing" : "opening", name.c_str());

	if (!is_end_tag)
	{
		std::string::size_type tag_len = name.length();
		for (int i = 0; literal_mode_elem[i].len; ++i)
		{
			if (tag_len == literal_mode_elem[i].len)
			{
                                #if defined(WIN32) && !defined(__MINGW32__)
				if (!_stricmp(name.c_str(), literal_mode_elem[i].str))
				#else
				if (!strcasecmp(name.c_str(), literal_mode_elem[i].str))
				#endif
				{
					mpLiteral = literal_mode_elem[i].str;
					break;
				}
			}
		}
	}

	htmlcxx::HTML::Node tag_node;
	//by now, length is just the size of the tag
	std::string text(b, c);
	tag_node.length(static_cast<unsigned int>(text.length()));
	tag_node.tagName(name);
	tag_node.text(text);
	tag_node.offset(mCurrentOffset);
	tag_node.isTag(true);
	tag_node.isComment(false);

	mCurrentOffset += tag_node.length();

	this->foundTag(tag_node, is_end_tag);
}

template <typename _Iterator>
_Iterator
htmlcxx::HTML::ParserSax::skipHtmlComment(_Iterator c, _Iterator end)
{
	while ( c != end ) {
		if (*c++ == '-' && c != end && *c == '-')
		{
			_Iterator d(c);
			while (++c != end && isspace(*c));
			if (c == end || *c++ == '>') break;
			c = d;
		}
	}

	return c;
}

namespace htmlcxx { namespace HTML {

template <typename _Iterator>
static inline
_Iterator find_next_quote(_Iterator c, _Iterator end, char quote)
{
//	std::cerr << "generic find" << std::endl;
	while (c != end && *c != quote) ++c;
	return c;
}

template <>
inline
const char *find_next_quote(const char *c, const char *end, char quote)
{
//	std::cerr << "fast find" << std::endl;
	const char *d = reinterpret_cast<const char*>(memchr(c, quote, end - c));

	if (d) return d;
	else return end;
}

}}

template <typename _Iterator>
_Iterator htmlcxx::HTML::ParserSax::skipHtmlTag(_Iterator c, _Iterator end)
{
	while (c != end && *c != '>')
	{
		if (*c != '=')
		{
			++c;
		}
		else
		{ // found an attribute
			++c;
			while (c != end && isspace(*c)) ++c;

			if (c == end) break;

			if (*c == '\"' || *c == '\'')
			{
				_Iterator save(c);
				char quote = *c++;
				c = find_next_quote(c, end, quote);
//				while (c != end && *c != quote) ++c;
//				c = static_cast<char*>(memchr(c, quote, end - c));
				if (c != end)
				{
					++c;
				}
				else
				{
					c = save;
					++c;
				}
//				DEBUGP("Quotes: %s\n", std::string(save, c).c_str());
			}
		}
	}

	if (c != end) ++c;

	return c;
}