summaryrefslogtreecommitdiffstats
path: root/reader/src/formats/html/HtmlReader.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'reader/src/formats/html/HtmlReader.cpp')
-rw-r--r--reader/src/formats/html/HtmlReader.cpp373
1 files changed, 373 insertions, 0 deletions
diff --git a/reader/src/formats/html/HtmlReader.cpp b/reader/src/formats/html/HtmlReader.cpp
new file mode 100644
index 0000000..a5ce7fa
--- /dev/null
+++ b/reader/src/formats/html/HtmlReader.cpp
@@ -0,0 +1,373 @@
+/*
+ * Copyright (C) 2004-2012 Geometer Plus <contact@geometerplus.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+
+#include <algorithm>
+#include <cctype>
+
+#include <ZLInputStream.h>
+#include <ZLXMLReader.h>
+#include <ZLFile.h>
+#include <ZLStringUtil.h>
+#include <ZLUnicodeUtil.h>
+
+#include "HtmlReader.h"
+#include "HtmlEntityCollection.h"
+
+HtmlReader::HtmlReader(const std::string &encoding) : EncodedTextReader(encoding) { // Forwards the encoding name to the EncodedTextReader base; no other setup is done here.
+}
+
+HtmlReader::~HtmlReader() { // Empty body: this class releases nothing explicitly on destruction.
+}
+
+// Re-initialises `tag` from the raw tag text `name` (the characters found
+// between '<' and the first '>', '/' or whitespace).
+//
+//  - tag.Attributes is always cleared.
+//  - An empty `name` yields an empty tag.Name; tag.Start is not modified
+//    in that case.
+//  - A leading '/' marks a closing tag: tag.Start becomes false and the
+//    slash is removed from the stored name. Otherwise tag.Start is true.
+//  - The stored name is upper-cased byte by byte.
+void HtmlReader::setTag(HtmlTag &tag, const std::string &name) {
+    tag.Attributes.clear();
+
+    if (name.empty()) {
+        tag.Name = name;
+        return;
+    }
+
+    tag.Start = name[0] != '/';
+    tag.Name = tag.Start ? name : name.substr(1);
+
+    for (std::size_t i = 0; i < tag.Name.length(); ++i) {
+        // <cctype> functions require a value representable as unsigned char;
+        // passing a plain (possibly negative) char is undefined behaviour
+        // for bytes >= 0x80.
+        const unsigned char byte = static_cast<unsigned char>(tag.Name[i]);
+        tag.Name[i] = static_cast<char>(std::toupper(byte));
+    }
+}
+
+enum ParseState { // States of the byte-level scanner in HtmlReader::readDocument.
+ PS_TEXT, // character data outside any tag; the initial state
+ PS_TAGSTART, // just consumed '<'; the next byte selects PS_COMMENT ('!') or PS_TAGNAME
+ PS_TAGNAME, // collecting the tag name until '>', '/' or whitespace
+ PS_WAIT_END_OF_TAG, // a '/' was seen inside a tag; skipping ahead to the closing '>'
+ PS_ATTRIBUTENAME, // collecting an attribute name until '>', '/', '=' or whitespace
+ PS_ATTRIBUTEVALUE, // collecting an attribute value (entered after '=')
+ PS_SKIPTAG, // tag with an empty name; skipping ahead to the closing '>'
+ PS_COMMENT, // after "<!"; falls back to PS_TAGNAME if no '-' follows, else runs until "-->"
+ PS_SPECIAL, // after '&' in text: scanning an entity / character reference
+ PS_SPECIAL_IN_ATTRIBUTEVALUE, // same as PS_SPECIAL, but the '&' was inside an attribute value
+};
+
+enum SpecialType { // Kind of '&' reference being scanned while in PS_SPECIAL / PS_SPECIAL_IN_ATTRIBUTEVALUE.
+ ST_UNKNOWN, // only '&' seen so far
+ ST_NUM, // "&#" seen; decimal vs. hexadecimal not decided yet
+ ST_NAME, // named entity: '&' followed by a letter
+ ST_DEC, // decimal character reference: "&#" followed by a digit
+ ST_HEX // hexadecimal character reference: "&#x"
+};
+
+// Tells whether `ch` may continue a '&' reference of kind `type`:
+//  - ST_NAME: letters and digits (HTML entity names such as "sup2",
+//    "frac12" or "there4" contain digits after the leading letter);
+//  - ST_DEC:  decimal digits;
+//  - ST_HEX:  hexadecimal digits;
+//  - any other type: nothing is allowed.
+static bool allowSymbol(SpecialType type, char ch) {
+    // <cctype> classifiers are only defined for values representable as
+    // unsigned char, so bytes >= 0x80 must not be passed as a plain char.
+    const unsigned char byte = static_cast<unsigned char>(ch);
+    switch (type) {
+        case ST_NAME:
+            return std::isalnum(byte) != 0;
+        case ST_DEC:
+            return std::isdigit(byte) != 0;
+        case ST_HEX:
+            return std::isxdigit(byte) != 0;
+        default:
+            return false;
+    }
+}
+
+// Converts the text of a '&' reference (without the leading '&' and the
+// trailing ';') into a character number.
+//
+//  - ST_NAME: `txt` is the entity name; the lookup is delegated to
+//    HtmlEntityCollection::symbolNumber.
+//  - ST_DEC:  `txt` is "#<decimal digits>".
+//  - ST_HEX:  `txt` is "#x<hex digits>" (two prefix characters are skipped).
+//
+// Returns 0 for any other type, for text shorter than its prefix, and for
+// numeric values outside 0..0x10FFFF (including strtol overflow), so that
+// callers treat such references as unknown instead of receiving a value
+// truncated from long to int.
+static int specialSymbolNumber(SpecialType type, const std::string &txt) {
+    std::size_t prefixLength = 0;
+    int base = 0;
+    switch (type) {
+        case ST_NAME:
+            return HtmlEntityCollection::symbolNumber(txt);
+        case ST_DEC:
+            prefixLength = 1;
+            base = 10;
+            break;
+        case ST_HEX:
+            prefixLength = 2;
+            base = 16;
+            break;
+        default:
+            return 0;
+    }
+
+    if (txt.length() < prefixLength) {
+        return 0;
+    }
+    const long value = std::strtol(txt.c_str() + prefixLength, 0, base);
+    if (value < 0 || value > 0x10FFFFL) {
+        return 0;
+    }
+    return static_cast<int>(value);
+}
+
+// Moves the contents of `from` onto the end of `to`.
+// When a converter is set, the bytes go through myConverter->convert() and
+// the converter is reset afterwards; otherwise they are appended verbatim.
+// `from` is always left empty.
+void HtmlReader::appendString(std::string &to, std::string &from) {
+    if (!myConverter.isNull()) {
+        myConverter->convert(to, from);
+        myConverter->reset();
+    } else {
+        to.append(from);
+    }
+    from.clear();
+}
+
+// Parses the HTML document supplied by `stream` and reports it through the
+// handler callbacks of this reader.
+//
+// Flow:
+//  - If stream.open() fails the function returns without calling anything.
+//  - Otherwise startDocumentHandler() is called, the stream is read in
+//    BUFSIZE-byte chunks and scanned one byte at a time by a state machine
+//    (see ParseState), and finally endDocumentHandler() and stream.close()
+//    are called.
+//  - Text is reported with characterDataHandler(), tags with tagHandler().
+//    If either returns false where its result is checked, scanning stops
+//    immediately (endDocumentHandler() and close() still run).
+//  - A tag containing '/' before '>' (e.g. <br/>) is reported twice: once
+//    with Start as set by setTag(), then again with Start == false.
+//  - "&name;", "&#NNN;" and "&#xHHH;" are resolved via specialSymbolNumber().
+//    Numbers 128..159 are treated as a single byte in the source encoding,
+//    other non-zero numbers are emitted as UTF-8, and unresolved references
+//    are emitted literally.
+//  - A '&' that does not turn out to start a well-formed reference is
+//    emitted literally together with the characters scanned after it, and
+//    the character that ended it is then processed normally.
+//
+// State that must survive a chunk boundary lives in currentString (partial
+// tag/attribute text) and specialString (partial reference text).
+// NOTE(review): a reference or tag still pending when the input ends is
+// discarded; nothing flushes those strings at end of input.
+// NOTE(review): the third argument of characterDataHandler() presumably
+// selects encoding conversion -- confirm against its declaration.
+void HtmlReader::readDocument(ZLInputStream &stream) {
+    if (!stream.open()) {
+        return;
+    }
+
+    startDocumentHandler();
+
+    ParseState state = PS_TEXT;
+    SpecialType state_special = ST_UNKNOWN;
+    std::string currentString;
+    std::string attributeValueString;
+    std::string specialString;
+    int quotationCounter = 0;
+    HtmlTag currentTag;
+    char endOfComment[2] = "\0";
+
+    const std::size_t BUFSIZE = 2048;
+    // Automatic storage: nothing to free on any exit path, including an
+    // exception thrown by a handler.
+    char buffer[BUFSIZE];
+    std::size_t length;
+    std::size_t offset = 0;
+    do {
+        length = stream.read(buffer, BUFSIZE);
+        char *start = buffer;
+        char *endOfBuffer = buffer + length;
+        for (char *ptr = buffer; ptr < endOfBuffer; ++ptr) {
+            // Set when the current byte ended an abandoned '&' reference and
+            // has to be examined again in the state that was restored.
+            bool reprocess;
+            do {
+                reprocess = false;
+                switch (state) {
+                    case PS_TEXT:
+                        if (*ptr == '<') {
+                            if (!characterDataHandler(start, ptr - start, true)) {
+                                goto endOfProcessing;
+                            }
+                            start = ptr + 1;
+                            state = PS_TAGSTART;
+                            currentTag.Offset = offset + (ptr - buffer);
+                        } else if (*ptr == '&') {
+                            if (!characterDataHandler(start, ptr - start, true)) {
+                                goto endOfProcessing;
+                            }
+                            start = ptr + 1;
+                            state = PS_SPECIAL;
+                            state_special = ST_UNKNOWN;
+                        }
+                        break;
+                    case PS_SPECIAL:
+                    case PS_SPECIAL_IN_ATTRIBUTEVALUE:
+                    {
+                        const bool inText = (state == PS_SPECIAL);
+                        const unsigned char byte = static_cast<unsigned char>(*ptr);
+                        bool abandon = false;
+                        if (state_special == ST_UNKNOWN) {
+                            if (*ptr == '#') {
+                                state_special = ST_NUM;
+                            } else if (std::isalpha(byte)) {
+                                state_special = ST_NAME;
+                            } else {
+                                abandon = true;
+                            }
+                        } else if (state_special == ST_NUM) {
+                            if (*ptr == 'x' || *ptr == 'X') {
+                                state_special = ST_HEX;
+                            } else if (std::isdigit(byte)) {
+                                state_special = ST_DEC;
+                            } else {
+                                abandon = true;
+                            }
+                        } else if (*ptr == ';') {
+                            specialString.append(start, ptr - start);
+                            const int number = specialSymbolNumber(state_special, specialString);
+                            if ((128 <= number) && (number <= 159)) {
+                                // Handled as one byte of the source encoding.
+                                char ch = number;
+                                if (inText) {
+                                    characterDataHandler(&ch, 1, true);
+                                } else if (myConverter.isNull()) {
+                                    // Same fallback as appendString(): no converter, append as is.
+                                    attributeValueString += ch;
+                                } else {
+                                    myConverter->convert(attributeValueString, &ch, &ch + 1);
+                                }
+                            } else if (number != 0) {
+                                char utf8[4];
+                                const int len = ZLUnicodeUtil::ucs4ToUtf8(utf8, number);
+                                if (inText) {
+                                    characterDataHandler(utf8, len, false);
+                                } else {
+                                    attributeValueString.append(utf8, len);
+                                }
+                            } else {
+                                // Unknown reference: keep it verbatim.
+                                specialString = "&" + specialString + ";";
+                                if (inText) {
+                                    characterDataHandler(specialString.c_str(), specialString.length(), false);
+                                } else {
+                                    attributeValueString += specialString;
+                                }
+                            }
+                            specialString.erase();
+                            start = ptr + 1;
+                            state = inText ? PS_TEXT : PS_ATTRIBUTEVALUE;
+                        } else if (!allowSymbol(state_special, *ptr)) {
+                            abandon = true;
+                        }
+
+                        if (abandon) {
+                            // Not a reference after all ("AT&T", "a & b", "?x=1&y=2"):
+                            // give back the '&' and everything scanned after it, drop the
+                            // partial text so it cannot leak into a later reference, and
+                            // let the previous state look at the current byte itself.
+                            specialString.append(start, ptr - start);
+                            specialString = "&" + specialString;
+                            if (inText) {
+                                characterDataHandler(specialString.c_str(), specialString.length(), false);
+                            } else {
+                                attributeValueString += specialString;
+                            }
+                            specialString.erase();
+                            start = ptr;
+                            state = inText ? PS_TEXT : PS_ATTRIBUTEVALUE;
+                            reprocess = true;
+                        }
+                        break;
+                    }
+                    case PS_TAGSTART:
+                        state = (*ptr == '!') ? PS_COMMENT : PS_TAGNAME;
+                        break;
+                    case PS_COMMENT:
+                        if ((endOfComment[0] == '\0') && (*ptr != '-')) {
+                            // "<!" not followed by "--" (e.g. <!DOCTYPE ...>): parse as a tag.
+                            state = PS_TAGNAME;
+                            // Forget a single '-' that may have been recorded ("<!-x").
+                            endOfComment[1] = '\0';
+                        } else if ((endOfComment[0] == '-') && (endOfComment[1] == '-') && (*ptr == '>')) {
+                            start = ptr + 1;
+                            state = PS_TEXT;
+                            endOfComment[0] = '\0';
+                            endOfComment[1] = '\0';
+                        } else {
+                            // Keep a sliding window of the last two bytes to spot "-->".
+                            endOfComment[0] = endOfComment[1];
+                            endOfComment[1] = *ptr;
+                        }
+                        break;
+                    case PS_WAIT_END_OF_TAG:
+                        if (*ptr == '>') {
+                            start = ptr + 1;
+                            state = PS_TEXT;
+                        }
+                        break;
+                    case PS_TAGNAME:
+                        if (*ptr == '>' || *ptr == '/' || std::isspace(static_cast<unsigned char>(*ptr))) {
+                            currentString.append(start, ptr - start);
+                            start = ptr + 1;
+                            setTag(currentTag, currentString);
+                            currentString.erase();
+                            if (currentTag.Name == "") {
+                                state = *ptr == '>' ? PS_TEXT : PS_SKIPTAG;
+                            } else {
+                                if (*ptr == '>') {
+                                    if (!tagHandler(currentTag)) {
+                                        goto endOfProcessing;
+                                    }
+                                    state = PS_TEXT;
+                                } else if (*ptr == '/') {
+                                    if (!tagHandler(currentTag)) {
+                                        goto endOfProcessing;
+                                    }
+                                    currentTag.Start = false;
+                                    if (!tagHandler(currentTag)) {
+                                        goto endOfProcessing;
+                                    }
+                                    state = PS_WAIT_END_OF_TAG;
+                                } else {
+                                    state = PS_ATTRIBUTENAME;
+                                }
+                            }
+                        }
+                        break;
+                    case PS_ATTRIBUTENAME:
+                        if (*ptr == '>' || *ptr == '/' || *ptr == '=' || std::isspace(static_cast<unsigned char>(*ptr))) {
+                            if (ptr != start || !currentString.empty()) {
+                                currentString.append(start, ptr - start);
+                                for (std::size_t i = 0; i < currentString.length(); ++i) {
+                                    currentString[i] = static_cast<char>(std::toupper(static_cast<unsigned char>(currentString[i])));
+                                }
+                                currentTag.addAttribute(currentString);
+                                currentString.erase();
+                            }
+                            start = ptr + 1;
+                            if (*ptr == '>') {
+                                if (!tagHandler(currentTag)) {
+                                    goto endOfProcessing;
+                                }
+                                state = PS_TEXT;
+                            } else if (*ptr == '/') {
+                                if (!tagHandler(currentTag)) {
+                                    goto endOfProcessing;
+                                }
+                                currentTag.Start = false;
+                                if (!tagHandler(currentTag)) {
+                                    goto endOfProcessing;
+                                }
+                                state = PS_WAIT_END_OF_TAG;
+                            } else {
+                                state = (*ptr == '=') ? PS_ATTRIBUTEVALUE : PS_ATTRIBUTENAME;
+                            }
+                        }
+                        break;
+                    case PS_ATTRIBUTEVALUE:
+                        if (*ptr == '"') {
+                            // Counts an opening quote (first byte of the value) and every
+                            // quote after it; a count of exactly 1 means "inside quotes".
+                            if (((ptr == start) && currentString.empty()) || (quotationCounter > 0)) {
+                                ++quotationCounter;
+                            }
+                        } else if (*ptr == '&') {
+                            currentString.append(start, ptr - start);
+                            start = ptr + 1;
+                            appendString(attributeValueString, currentString);
+                            state = PS_SPECIAL_IN_ATTRIBUTEVALUE;
+                            state_special = ST_UNKNOWN;
+                        } else if (quotationCounter != 1 && (*ptr == '>' || *ptr == '/' || std::isspace(static_cast<unsigned char>(*ptr)))) {
+                            // attributeValueString is checked as well: an unquoted value that
+                            // ends right after a reference has no pending raw bytes but must
+                            // still be stored (and cleared) here.
+                            if (ptr != start || !currentString.empty() || !attributeValueString.empty()) {
+                                currentString.append(start, ptr - start);
+                                appendString(attributeValueString, currentString);
+                                if (attributeValueString[0] == '"') {
+                                    // Strip the surrounding quotes.
+                                    attributeValueString = attributeValueString.substr(1, attributeValueString.length() - 2);
+                                }
+                                currentTag.setLastAttributeValue(attributeValueString);
+                                attributeValueString.erase();
+                                quotationCounter = 0;
+                            }
+                            start = ptr + 1;
+                            if (*ptr == '>') {
+                                if (!tagHandler(currentTag)) {
+                                    goto endOfProcessing;
+                                }
+                                state = PS_TEXT;
+                            } else if (*ptr == '/') {
+                                if (!tagHandler(currentTag)) {
+                                    goto endOfProcessing;
+                                }
+                                currentTag.Start = false;
+                                if (!tagHandler(currentTag)) {
+                                    goto endOfProcessing;
+                                }
+                                state = PS_WAIT_END_OF_TAG;
+                            } else {
+                                state = PS_ATTRIBUTENAME;
+                            }
+                        }
+                        break;
+                    case PS_SKIPTAG:
+                        if (*ptr == '>') {
+                            start = ptr + 1;
+                            state = PS_TEXT;
+                        }
+                        break;
+                }
+            } while (reprocess);
+        }
+        // Carry whatever is still pending in this chunk over to the next one.
+        if (start != endOfBuffer) {
+            switch (state) {
+                case PS_TEXT:
+                    if (!characterDataHandler(start, endOfBuffer - start, true)) {
+                        goto endOfProcessing;
+                    }
+                    break;
+                case PS_TAGNAME:
+                case PS_ATTRIBUTENAME:
+                case PS_ATTRIBUTEVALUE:
+                    currentString.append(start, endOfBuffer - start);
+                    break;
+                case PS_SPECIAL:
+                case PS_SPECIAL_IN_ATTRIBUTEVALUE:
+                    specialString.append(start, endOfBuffer - start);
+                    break;
+                case PS_TAGSTART:
+                case PS_SKIPTAG:
+                case PS_COMMENT:
+                case PS_WAIT_END_OF_TAG:
+                    break;
+            }
+        }
+        offset += length;
+    } while (length == BUFSIZE); // a short read is taken as end of input
+endOfProcessing:
+    endDocumentHandler();
+
+    stream.close();
+}