diff options
Diffstat (limited to 'reader/src/formats/html/HtmlReader.cpp')
-rw-r--r-- | reader/src/formats/html/HtmlReader.cpp | 373 |
1 files changed, 373 insertions, 0 deletions
diff --git a/reader/src/formats/html/HtmlReader.cpp b/reader/src/formats/html/HtmlReader.cpp new file mode 100644 index 0000000..a5ce7fa --- /dev/null +++ b/reader/src/formats/html/HtmlReader.cpp @@ -0,0 +1,373 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#include <algorithm> +#include <cctype> + +#include <ZLInputStream.h> +#include <ZLXMLReader.h> +#include <ZLFile.h> +#include <ZLStringUtil.h> +#include <ZLUnicodeUtil.h> + +#include "HtmlReader.h" +#include "HtmlEntityCollection.h" + +HtmlReader::HtmlReader(const std::string &encoding) : EncodedTextReader(encoding) { +} + +HtmlReader::~HtmlReader() { +} + +void HtmlReader::setTag(HtmlTag &tag, const std::string &name) { + tag.Attributes.clear(); + + if (name.length() == 0) { + tag.Name = name; + return; + } + + tag.Start = name[0] != '/'; + if (tag.Start) { + tag.Name = name; + } else { + tag.Name = name.substr(1); + } + + const std::size_t len = tag.Name.length(); + for (std::size_t i = 0; i < len; ++i) { + tag.Name[i] = std::toupper(tag.Name[i]); + } +} + +enum ParseState { + PS_TEXT, + PS_TAGSTART, + PS_TAGNAME, + PS_WAIT_END_OF_TAG, + PS_ATTRIBUTENAME, + PS_ATTRIBUTEVALUE, + PS_SKIPTAG, + PS_COMMENT, + PS_SPECIAL, + PS_SPECIAL_IN_ATTRIBUTEVALUE, +}; + +enum SpecialType { + ST_UNKNOWN, + ST_NUM, + ST_NAME, + ST_DEC, + ST_HEX +}; + +static bool allowSymbol(SpecialType type, char ch) { + return + (type == ST_NAME && std::isalpha(ch)) || + (type == ST_DEC && std::isdigit(ch)) || + (type == ST_HEX && std::isxdigit(ch)); +} + +static int specialSymbolNumber(SpecialType type, const std::string &txt) { + char *end = 0; + switch (type) { + case ST_NAME: + return HtmlEntityCollection::symbolNumber(txt); + case ST_DEC: + return std::strtol(txt.c_str() + 1, &end, 10); + case ST_HEX: + return std::strtol(txt.c_str() + 2, &end, 16); + default: + return 0; + } +} + +void HtmlReader::appendString(std::string &to, std::string &from) { + if (myConverter.isNull()) { + to += from; + } else { + myConverter->convert(to, from); + myConverter->reset(); + } + from.erase(); +} + +void HtmlReader::readDocument(ZLInputStream &stream) { + if (!stream.open()) { + return; + } + + startDocumentHandler(); + + ParseState state = PS_TEXT; + SpecialType state_special = ST_UNKNOWN; + std::string currentString; + std::string attributeValueString; + std::string specialString; + int quotationCounter = 0; + HtmlTag currentTag; + char endOfComment[2] = "\0"; + + const std::size_t BUFSIZE = 2048; + char *buffer = new char[BUFSIZE]; + std::size_t length; + std::size_t offset = 0; + do { + length = stream.read(buffer, BUFSIZE); + char *start = buffer; + char *endOfBuffer = buffer + length; + for (char *ptr = buffer; ptr < endOfBuffer; ++ptr) { + switch (state) { + case PS_TEXT: + if (*ptr == '<') { + if (!characterDataHandler(start, ptr - start, true)) { + goto endOfProcessing; + } + start = ptr + 1; + state = PS_TAGSTART; + currentTag.Offset = offset + (ptr - buffer); + } + if (*ptr == '&') { + if (!characterDataHandler(start, ptr - start, true)) { + goto endOfProcessing; + } + start = ptr + 1; + state = PS_SPECIAL; + state_special = ST_UNKNOWN; + } + break; + case PS_SPECIAL: + case PS_SPECIAL_IN_ATTRIBUTEVALUE: + if (state_special == ST_UNKNOWN) { + if (*ptr == '#') { + state_special = ST_NUM; + } else if (std::isalpha(*ptr)) { + state_special = ST_NAME; + } else { + start = ptr; + state = (state == PS_SPECIAL) ? PS_TEXT : PS_ATTRIBUTEVALUE; + } + } else if (state_special == ST_NUM) { + if (*ptr == 'x') { + state_special = ST_HEX; + } else if (std::isdigit(*ptr)) { + state_special = ST_DEC; + } else { + start = ptr; + state = (state == PS_SPECIAL) ? PS_TEXT : PS_ATTRIBUTEVALUE; + } + } else { + if (*ptr == ';') { + specialString.append(start, ptr - start); + int number = specialSymbolNumber(state_special, specialString); + if ((128 <= number) && (number <= 159)) { + char ch = number; + if (state == PS_SPECIAL) { + characterDataHandler(&ch, 1, true); + } else { + myConverter->convert(attributeValueString, &ch, &ch + 1); + } + } else if (number != 0) { + char buffer[4]; + int len = ZLUnicodeUtil::ucs4ToUtf8(buffer, number); + if (state == PS_SPECIAL) { + characterDataHandler(buffer, len, false); + } else { + attributeValueString.append(buffer, len); + } + } else { + specialString = "&" + specialString + ";"; + if (state == PS_SPECIAL) { + characterDataHandler(specialString.c_str(), specialString.length(), false); + } else { + attributeValueString += specialString; + } + } + specialString.erase(); + start = ptr + 1; + state = (state == PS_SPECIAL) ? PS_TEXT : PS_ATTRIBUTEVALUE; + } else if (!allowSymbol(state_special, *ptr)) { + start = ptr; + state = (state == PS_SPECIAL) ? PS_TEXT : PS_ATTRIBUTEVALUE; + } + } + break; + case PS_TAGSTART: + state = (*ptr == '!') ? PS_COMMENT : PS_TAGNAME; + break; + case PS_COMMENT: + if ((endOfComment[0] == '\0') && (*ptr != '-')) { + state = PS_TAGNAME; + } else if ((endOfComment[0] == '-') && (endOfComment[1] == '-') && (*ptr == '>')) { + start = ptr + 1; + state = PS_TEXT; + endOfComment[0] = '\0'; + endOfComment[1] = '\0'; + } else { + endOfComment[0] = endOfComment[1]; + endOfComment[1] = *ptr; + } + break; + case PS_WAIT_END_OF_TAG: + if (*ptr == '>') { + start = ptr + 1; + state = PS_TEXT; + } + break; + case PS_TAGNAME: + if (*ptr == '>' || *ptr == '/' || std::isspace((unsigned char)*ptr)) { + currentString.append(start, ptr - start); + start = ptr + 1; + setTag(currentTag, currentString); + currentString.erase(); + if (currentTag.Name == "") { + state = *ptr == '>' ? PS_TEXT : PS_SKIPTAG; + } else { + if (*ptr == '>') { + if (!tagHandler(currentTag)) { + goto endOfProcessing; + } + state = PS_TEXT; + } else if (*ptr == '/') { + if (!tagHandler(currentTag)) { + goto endOfProcessing; + } + currentTag.Start = false; + if (!tagHandler(currentTag)) { + goto endOfProcessing; + } + state = PS_WAIT_END_OF_TAG; + } else { + state = PS_ATTRIBUTENAME; + } + } + } + break; + case PS_ATTRIBUTENAME: + if (*ptr == '>' || *ptr == '/' || *ptr == '=' || std::isspace((unsigned char)*ptr)) { + if (ptr != start || !currentString.empty()) { + currentString.append(start, ptr - start); + for (unsigned int i = 0; i < currentString.length(); ++i) { + currentString[i] = std::toupper(currentString[i]); + } + currentTag.addAttribute(currentString); + currentString.erase(); + } + start = ptr + 1; + if (*ptr == '>') { + if (!tagHandler(currentTag)) { + goto endOfProcessing; + } + state = PS_TEXT; + } else if (*ptr == '/') { + if (!tagHandler(currentTag)) { + goto endOfProcessing; + } + currentTag.Start = false; + if (!tagHandler(currentTag)) { + goto endOfProcessing; + } + state = PS_WAIT_END_OF_TAG; + } else { + state = (*ptr == '=') ? PS_ATTRIBUTEVALUE : PS_ATTRIBUTENAME; + } + } + break; + case PS_ATTRIBUTEVALUE: + if (*ptr == '"') { + if (((ptr == start) && currentString.empty()) || (quotationCounter > 0)) { + ++quotationCounter; + } + } else if (*ptr == '&') { + currentString.append(start, ptr - start); + start = ptr + 1; + appendString(attributeValueString, currentString); + state = PS_SPECIAL_IN_ATTRIBUTEVALUE; + state_special = ST_UNKNOWN; + } else if (quotationCounter != 1 && (*ptr == '>' || *ptr == '/' || std::isspace((unsigned char)*ptr))) { + if (ptr != start || !currentString.empty()) { + currentString.append(start, ptr - start); + appendString(attributeValueString, currentString); + if (attributeValueString[0] == '"') { + attributeValueString = attributeValueString.substr(1, attributeValueString.length() - 2); + } + currentTag.setLastAttributeValue(attributeValueString); + attributeValueString.erase(); + quotationCounter = 0; + } + start = ptr + 1; + if (*ptr == '>') { + if (!tagHandler(currentTag)) { + goto endOfProcessing; + } + state = PS_TEXT; + } else if (*ptr == '/') { + if (!tagHandler(currentTag)) { + goto endOfProcessing; + } + currentTag.Start = false; + if (!tagHandler(currentTag)) { + goto endOfProcessing; + } + state = PS_WAIT_END_OF_TAG; + } else { + state = PS_ATTRIBUTENAME; + } + } + break; + case PS_SKIPTAG: + if (*ptr == '>') { + start = ptr + 1; + state = PS_TEXT; + } + break; + } + } + if (start != endOfBuffer) { + switch (state) { + case PS_TEXT: + if (!characterDataHandler(start, endOfBuffer - start, true)) { + goto endOfProcessing; + } + break; + case PS_TAGNAME: + case PS_ATTRIBUTENAME: + case PS_ATTRIBUTEVALUE: + currentString.append(start, endOfBuffer - start); + break; + case PS_SPECIAL: + case PS_SPECIAL_IN_ATTRIBUTEVALUE: + specialString.append(start, endOfBuffer - start); + break; + case PS_TAGSTART: + case PS_SKIPTAG: + case PS_COMMENT: + case PS_WAIT_END_OF_TAG: + break; + } + } + offset += length; + } while (length == BUFSIZE); +endOfProcessing: + delete[] buffer; + + endDocumentHandler(); + + stream.close(); +} |