summaryrefslogtreecommitdiffstats
path: root/fbreader/src/formats/html/HtmlReader.cpp
diff options
context:
space:
mode:
authorMichele Calgaro <[email protected]>2024-06-07 23:30:05 +0900
committerMichele Calgaro <[email protected]>2024-06-07 23:30:05 +0900
commit17b259df9cb6b28779d4881b2b6c805ee2e48eea (patch)
tree5ed61937459cb7081089111b0242c01ec178f1f3 /fbreader/src/formats/html/HtmlReader.cpp
parent1cba8bce178eb2d6719c6f7f21e2c9352c5513a6 (diff)
downloadtde-ebook-reader-17b259df9cb6b28779d4881b2b6c805ee2e48eea.tar.gz
tde-ebook-reader-17b259df9cb6b28779d4881b2b6c805ee2e48eea.zip
Rename to tde-ebook-reader
Signed-off-by: Michele Calgaro <[email protected]>
Diffstat (limited to 'fbreader/src/formats/html/HtmlReader.cpp')
-rw-r--r--fbreader/src/formats/html/HtmlReader.cpp373
1 files changed, 0 insertions, 373 deletions
diff --git a/fbreader/src/formats/html/HtmlReader.cpp b/fbreader/src/formats/html/HtmlReader.cpp
deleted file mode 100644
index a5ce7fa..0000000
--- a/fbreader/src/formats/html/HtmlReader.cpp
+++ /dev/null
@@ -1,373 +0,0 @@
-/*
- * Copyright (C) 2004-2012 Geometer Plus <[email protected]>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
- * 02110-1301, USA.
- */
-
-#include <algorithm>
-#include <cctype>
-
-#include <ZLInputStream.h>
-#include <ZLXMLReader.h>
-#include <ZLFile.h>
-#include <ZLStringUtil.h>
-#include <ZLUnicodeUtil.h>
-
-#include "HtmlReader.h"
-#include "HtmlEntityCollection.h"
-
-HtmlReader::HtmlReader(const std::string &encoding) : EncodedTextReader(encoding) {
-}
-
-HtmlReader::~HtmlReader() {
-}
-
-void HtmlReader::setTag(HtmlTag &tag, const std::string &name) {
- tag.Attributes.clear();
-
- if (name.length() == 0) {
- tag.Name = name;
- return;
- }
-
- tag.Start = name[0] != '/';
- if (tag.Start) {
- tag.Name = name;
- } else {
- tag.Name = name.substr(1);
- }
-
- const std::size_t len = tag.Name.length();
- for (std::size_t i = 0; i < len; ++i) {
- tag.Name[i] = std::toupper(tag.Name[i]);
- }
-}
-
-enum ParseState {
- PS_TEXT,
- PS_TAGSTART,
- PS_TAGNAME,
- PS_WAIT_END_OF_TAG,
- PS_ATTRIBUTENAME,
- PS_ATTRIBUTEVALUE,
- PS_SKIPTAG,
- PS_COMMENT,
- PS_SPECIAL,
- PS_SPECIAL_IN_ATTRIBUTEVALUE,
-};
-
-enum SpecialType {
- ST_UNKNOWN,
- ST_NUM,
- ST_NAME,
- ST_DEC,
- ST_HEX
-};
-
-static bool allowSymbol(SpecialType type, char ch) {
- return
- (type == ST_NAME && std::isalpha(ch)) ||
- (type == ST_DEC && std::isdigit(ch)) ||
- (type == ST_HEX && std::isxdigit(ch));
-}
-
-static int specialSymbolNumber(SpecialType type, const std::string &txt) {
- char *end = 0;
- switch (type) {
- case ST_NAME:
- return HtmlEntityCollection::symbolNumber(txt);
- case ST_DEC:
- return std::strtol(txt.c_str() + 1, &end, 10);
- case ST_HEX:
- return std::strtol(txt.c_str() + 2, &end, 16);
- default:
- return 0;
- }
-}
-
-void HtmlReader::appendString(std::string &to, std::string &from) {
- if (myConverter.isNull()) {
- to += from;
- } else {
- myConverter->convert(to, from);
- myConverter->reset();
- }
- from.erase();
-}
-
-void HtmlReader::readDocument(ZLInputStream &stream) {
- if (!stream.open()) {
- return;
- }
-
- startDocumentHandler();
-
- ParseState state = PS_TEXT;
- SpecialType state_special = ST_UNKNOWN;
- std::string currentString;
- std::string attributeValueString;
- std::string specialString;
- int quotationCounter = 0;
- HtmlTag currentTag;
- char endOfComment[2] = "\0";
-
- const std::size_t BUFSIZE = 2048;
- char *buffer = new char[BUFSIZE];
- std::size_t length;
- std::size_t offset = 0;
- do {
- length = stream.read(buffer, BUFSIZE);
- char *start = buffer;
- char *endOfBuffer = buffer + length;
- for (char *ptr = buffer; ptr < endOfBuffer; ++ptr) {
- switch (state) {
- case PS_TEXT:
- if (*ptr == '<') {
- if (!characterDataHandler(start, ptr - start, true)) {
- goto endOfProcessing;
- }
- start = ptr + 1;
- state = PS_TAGSTART;
- currentTag.Offset = offset + (ptr - buffer);
- }
- if (*ptr == '&') {
- if (!characterDataHandler(start, ptr - start, true)) {
- goto endOfProcessing;
- }
- start = ptr + 1;
- state = PS_SPECIAL;
- state_special = ST_UNKNOWN;
- }
- break;
- case PS_SPECIAL:
- case PS_SPECIAL_IN_ATTRIBUTEVALUE:
- if (state_special == ST_UNKNOWN) {
- if (*ptr == '#') {
- state_special = ST_NUM;
- } else if (std::isalpha(*ptr)) {
- state_special = ST_NAME;
- } else {
- start = ptr;
- state = (state == PS_SPECIAL) ? PS_TEXT : PS_ATTRIBUTEVALUE;
- }
- } else if (state_special == ST_NUM) {
- if (*ptr == 'x') {
- state_special = ST_HEX;
- } else if (std::isdigit(*ptr)) {
- state_special = ST_DEC;
- } else {
- start = ptr;
- state = (state == PS_SPECIAL) ? PS_TEXT : PS_ATTRIBUTEVALUE;
- }
- } else {
- if (*ptr == ';') {
- specialString.append(start, ptr - start);
- int number = specialSymbolNumber(state_special, specialString);
- if ((128 <= number) && (number <= 159)) {
- char ch = number;
- if (state == PS_SPECIAL) {
- characterDataHandler(&ch, 1, true);
- } else {
- myConverter->convert(attributeValueString, &ch, &ch + 1);
- }
- } else if (number != 0) {
- char buffer[4];
- int len = ZLUnicodeUtil::ucs4ToUtf8(buffer, number);
- if (state == PS_SPECIAL) {
- characterDataHandler(buffer, len, false);
- } else {
- attributeValueString.append(buffer, len);
- }
- } else {
- specialString = "&" + specialString + ";";
- if (state == PS_SPECIAL) {
- characterDataHandler(specialString.c_str(), specialString.length(), false);
- } else {
- attributeValueString += specialString;
- }
- }
- specialString.erase();
- start = ptr + 1;
- state = (state == PS_SPECIAL) ? PS_TEXT : PS_ATTRIBUTEVALUE;
- } else if (!allowSymbol(state_special, *ptr)) {
- start = ptr;
- state = (state == PS_SPECIAL) ? PS_TEXT : PS_ATTRIBUTEVALUE;
- }
- }
- break;
- case PS_TAGSTART:
- state = (*ptr == '!') ? PS_COMMENT : PS_TAGNAME;
- break;
- case PS_COMMENT:
- if ((endOfComment[0] == '\0') && (*ptr != '-')) {
- state = PS_TAGNAME;
- } else if ((endOfComment[0] == '-') && (endOfComment[1] == '-') && (*ptr == '>')) {
- start = ptr + 1;
- state = PS_TEXT;
- endOfComment[0] = '\0';
- endOfComment[1] = '\0';
- } else {
- endOfComment[0] = endOfComment[1];
- endOfComment[1] = *ptr;
- }
- break;
- case PS_WAIT_END_OF_TAG:
- if (*ptr == '>') {
- start = ptr + 1;
- state = PS_TEXT;
- }
- break;
- case PS_TAGNAME:
- if (*ptr == '>' || *ptr == '/' || std::isspace((unsigned char)*ptr)) {
- currentString.append(start, ptr - start);
- start = ptr + 1;
- setTag(currentTag, currentString);
- currentString.erase();
- if (currentTag.Name == "") {
- state = *ptr == '>' ? PS_TEXT : PS_SKIPTAG;
- } else {
- if (*ptr == '>') {
- if (!tagHandler(currentTag)) {
- goto endOfProcessing;
- }
- state = PS_TEXT;
- } else if (*ptr == '/') {
- if (!tagHandler(currentTag)) {
- goto endOfProcessing;
- }
- currentTag.Start = false;
- if (!tagHandler(currentTag)) {
- goto endOfProcessing;
- }
- state = PS_WAIT_END_OF_TAG;
- } else {
- state = PS_ATTRIBUTENAME;
- }
- }
- }
- break;
- case PS_ATTRIBUTENAME:
- if (*ptr == '>' || *ptr == '/' || *ptr == '=' || std::isspace((unsigned char)*ptr)) {
- if (ptr != start || !currentString.empty()) {
- currentString.append(start, ptr - start);
- for (unsigned int i = 0; i < currentString.length(); ++i) {
- currentString[i] = std::toupper(currentString[i]);
- }
- currentTag.addAttribute(currentString);
- currentString.erase();
- }
- start = ptr + 1;
- if (*ptr == '>') {
- if (!tagHandler(currentTag)) {
- goto endOfProcessing;
- }
- state = PS_TEXT;
- } else if (*ptr == '/') {
- if (!tagHandler(currentTag)) {
- goto endOfProcessing;
- }
- currentTag.Start = false;
- if (!tagHandler(currentTag)) {
- goto endOfProcessing;
- }
- state = PS_WAIT_END_OF_TAG;
- } else {
- state = (*ptr == '=') ? PS_ATTRIBUTEVALUE : PS_ATTRIBUTENAME;
- }
- }
- break;
- case PS_ATTRIBUTEVALUE:
- if (*ptr == '"') {
- if (((ptr == start) && currentString.empty()) || (quotationCounter > 0)) {
- ++quotationCounter;
- }
- } else if (*ptr == '&') {
- currentString.append(start, ptr - start);
- start = ptr + 1;
- appendString(attributeValueString, currentString);
- state = PS_SPECIAL_IN_ATTRIBUTEVALUE;
- state_special = ST_UNKNOWN;
- } else if (quotationCounter != 1 && (*ptr == '>' || *ptr == '/' || std::isspace((unsigned char)*ptr))) {
- if (ptr != start || !currentString.empty()) {
- currentString.append(start, ptr - start);
- appendString(attributeValueString, currentString);
- if (attributeValueString[0] == '"') {
- attributeValueString = attributeValueString.substr(1, attributeValueString.length() - 2);
- }
- currentTag.setLastAttributeValue(attributeValueString);
- attributeValueString.erase();
- quotationCounter = 0;
- }
- start = ptr + 1;
- if (*ptr == '>') {
- if (!tagHandler(currentTag)) {
- goto endOfProcessing;
- }
- state = PS_TEXT;
- } else if (*ptr == '/') {
- if (!tagHandler(currentTag)) {
- goto endOfProcessing;
- }
- currentTag.Start = false;
- if (!tagHandler(currentTag)) {
- goto endOfProcessing;
- }
- state = PS_WAIT_END_OF_TAG;
- } else {
- state = PS_ATTRIBUTENAME;
- }
- }
- break;
- case PS_SKIPTAG:
- if (*ptr == '>') {
- start = ptr + 1;
- state = PS_TEXT;
- }
- break;
- }
- }
- if (start != endOfBuffer) {
- switch (state) {
- case PS_TEXT:
- if (!characterDataHandler(start, endOfBuffer - start, true)) {
- goto endOfProcessing;
- }
- break;
- case PS_TAGNAME:
- case PS_ATTRIBUTENAME:
- case PS_ATTRIBUTEVALUE:
- currentString.append(start, endOfBuffer - start);
- break;
- case PS_SPECIAL:
- case PS_SPECIAL_IN_ATTRIBUTEVALUE:
- specialString.append(start, endOfBuffer - start);
- break;
- case PS_TAGSTART:
- case PS_SKIPTAG:
- case PS_COMMENT:
- case PS_WAIT_END_OF_TAG:
- break;
- }
- }
- offset += length;
- } while (length == BUFSIZE);
-endOfProcessing:
- delete[] buffer;
-
- endDocumentHandler();
-
- stream.close();
-}