summaryrefslogtreecommitdiffstats
path: root/fbreader/src/formats/html
diff options
context:
space:
mode:
Diffstat (limited to 'fbreader/src/formats/html')
-rw-r--r--fbreader/src/formats/html/HtmlBookReader.cpp583
-rw-r--r--fbreader/src/formats/html/HtmlBookReader.h101
-rw-r--r--fbreader/src/formats/html/HtmlDescriptionReader.cpp82
-rw-r--r--fbreader/src/formats/html/HtmlDescriptionReader.h48
-rw-r--r--fbreader/src/formats/html/HtmlEntityCollection.cpp71
-rw-r--r--fbreader/src/formats/html/HtmlEntityCollection.h38
-rw-r--r--fbreader/src/formats/html/HtmlPlugin.cpp83
-rw-r--r--fbreader/src/formats/html/HtmlPlugin.h42
-rw-r--r--fbreader/src/formats/html/HtmlReader.cpp373
-rw-r--r--fbreader/src/formats/html/HtmlReader.h92
-rw-r--r--fbreader/src/formats/html/HtmlReaderStream.cpp128
-rw-r--r--fbreader/src/formats/html/HtmlReaderStream.h48
-rw-r--r--fbreader/src/formats/html/HtmlTagActions.h158
13 files changed, 0 insertions, 1847 deletions
diff --git a/fbreader/src/formats/html/HtmlBookReader.cpp b/fbreader/src/formats/html/HtmlBookReader.cpp
deleted file mode 100644
index 321913d..0000000
--- a/fbreader/src/formats/html/HtmlBookReader.cpp
+++ /dev/null
@@ -1,583 +0,0 @@
-/*
- * Copyright (C) 2004-2012 Geometer Plus <[email protected]>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
- * 02110-1301, USA.
- */
-
-#include <cctype>
-
-#include <ZLFile.h>
-#include <ZLFileImage.h>
-#include <ZLStringUtil.h>
-
-#include "HtmlBookReader.h"
-#include "HtmlTagActions.h"
-#include "../txt/PlainTextFormat.h"
-#include "../util/MiscUtil.h"
-#include "../../bookmodel/BookModel.h"
-#include "../css/StyleSheetParser.h"
-
-HtmlTagAction::HtmlTagAction(HtmlBookReader &reader) : myReader(reader) {
-}
-
-HtmlTagAction::~HtmlTagAction() {
-}
-
-void HtmlTagAction::reset() {
-}
-
-DummyHtmlTagAction::DummyHtmlTagAction(HtmlBookReader &reader) : HtmlTagAction(reader) {
-}
-
-void DummyHtmlTagAction::run(const HtmlReader::HtmlTag&) {
-}
-
-HtmlControlTagAction::HtmlControlTagAction(HtmlBookReader &reader, FBTextKind kind) : HtmlTagAction(reader), myKind(kind) {
-}
-
-void HtmlControlTagAction::run(const HtmlReader::HtmlTag &tag) {
- std::vector<FBTextKind> &list = myReader.myKindList;
- int index;
- for (index = list.size() - 1; index >= 0; --index) {
- if (list[index] == myKind) {
- break;
- }
- }
- if (tag.Start) {
- if (index == -1) {
- bookReader().pushKind(myKind);
- myReader.myKindList.push_back(myKind);
- bookReader().addControl(myKind, true);
- }
- } else {
- if (index >= 0) {
- for (int i = list.size() - 1; i >= index; --i) {
- bookReader().addControl(list[i], false);
- bookReader().popKind();
- }
- for (unsigned int j = index + 1; j < list.size(); ++j) {
- bookReader().addControl(list[j], true);
- bookReader().pushKind(list[j]);
- }
- list.erase(list.begin() + index);
- }
- }
-}
-
-HtmlHeaderTagAction::HtmlHeaderTagAction(HtmlBookReader &reader, FBTextKind kind) : HtmlTagAction(reader), myKind(kind) {
-}
-
-void HtmlHeaderTagAction::run(const HtmlReader::HtmlTag &tag) {
- myReader.myIsStarted = false;
- if (tag.Start) {
- if (myReader.myBuildTableOfContent && !myReader.myIgnoreTitles) {
- if (!bookReader().contentsParagraphIsOpen()) {
- bookReader().insertEndOfSectionParagraph();
- bookReader().enterTitle();
- bookReader().beginContentsParagraph();
- }
- }
- bookReader().pushKind(myKind);
- } else {
- bookReader().popKind();
- if (myReader.myBuildTableOfContent && !myReader.myIgnoreTitles) {
- bookReader().endContentsParagraph();
- bookReader().exitTitle();
- }
- }
- bookReader().beginParagraph();
-}
-
-HtmlIgnoreTagAction::HtmlIgnoreTagAction(HtmlBookReader &reader) : HtmlTagAction(reader) {
-}
-
-void HtmlIgnoreTagAction::run(const HtmlReader::HtmlTag &tag) {
- if (tag.Start) {
- if (myTagNames.find(tag.Name) == myTagNames.end()) {
- ++myReader.myIgnoreDataCounter;
- myTagNames.insert(tag.Name);
- }
- } else {
- if (myTagNames.find(tag.Name) != myTagNames.end()) {
- --myReader.myIgnoreDataCounter;
- myTagNames.erase(tag.Name);
- }
- }
-}
-
-HtmlHrefTagAction::HtmlHrefTagAction(HtmlBookReader &reader) : HtmlTagAction(reader) {
-}
-
-void HtmlHrefTagAction::run(const HtmlReader::HtmlTag &tag) {
- if (tag.Start) {
- for (unsigned int i = 0; i < tag.Attributes.size(); ++i) {
- if (tag.Attributes[i].Name == "NAME") {
- bookReader().addHyperlinkLabel(tag.Attributes[i].Value);
- } else if ((hyperlinkType() == REGULAR) && (tag.Attributes[i].Name == "HREF")) {
- std::string value = tag.Attributes[i].Value;
- if (!myReader.myFileName.empty() &&
- (value.length() > myReader.myFileName.length()) &&
- (value.substr(0, myReader.myFileName.length()) == myReader.myFileName)) {
- value = value.substr(myReader.myFileName.length());
- }
- if (!value.empty()) {
- if (value[0] == '#') {
- setHyperlinkType(INTERNAL_HYPERLINK);
- bookReader().addHyperlinkControl(INTERNAL_HYPERLINK, value.substr(1));
- } else {
- FBTextKind hyperlinkType = MiscUtil::referenceType(value);
- if (hyperlinkType != INTERNAL_HYPERLINK) {
- setHyperlinkType(hyperlinkType);
- bookReader().addHyperlinkControl(hyperlinkType, value);
- }
- }
- }
- }
- }
- } else if (hyperlinkType() != REGULAR) {
- bookReader().addControl(hyperlinkType(), false);
- setHyperlinkType(REGULAR);
- }
-}
-
-void HtmlHrefTagAction::reset() {
- setHyperlinkType(REGULAR);
-}
-
-FBTextKind HtmlHrefTagAction::hyperlinkType() const {
- return myHyperlinkType;
-}
-
-void HtmlHrefTagAction::setHyperlinkType(FBTextKind hyperlinkType) {
- myHyperlinkType = hyperlinkType;
-}
-
-HtmlImageTagAction::HtmlImageTagAction(HtmlBookReader &reader) : HtmlTagAction(reader) {
-}
-
-void HtmlImageTagAction::run(const HtmlReader::HtmlTag &tag) {
- if (tag.Start) {
- bookReader().endParagraph();
- for (unsigned int i = 0; i < tag.Attributes.size(); ++i) {
- if (tag.Attributes[i].Name == "SRC") {
- const std::string fileName = MiscUtil::decodeHtmlURL(tag.Attributes[i].Value);
- const ZLFile file(myReader.myBaseDirPath + fileName);
- if (file.exists()) {
- bookReader().addImageReference(fileName);
- bookReader().addImage(fileName, new ZLFileImage(file, 0));
- }
- break;
- }
- }
- bookReader().beginParagraph();
- }
-}
-
-HtmlBreakTagAction::HtmlBreakTagAction(HtmlBookReader &reader, BreakType breakType) : HtmlTagAction(reader), myBreakType(breakType) {
-}
-
-void HtmlBreakTagAction::run(const HtmlReader::HtmlTag &tag) {
- if (myReader.myDontBreakParagraph) {
- myReader.myDontBreakParagraph = false;
- return;
- }
-
- if ((tag.Start && (myBreakType & BREAK_AT_START)) ||
- (!tag.Start && (myBreakType & BREAK_AT_END))) {
- bookReader().endParagraph();
- if (bookReader().isKindStackEmpty()) {
- bookReader().pushKind(REGULAR);
- }
- bookReader().beginParagraph();
- }
-}
-
-HtmlPreTagAction::HtmlPreTagAction(HtmlBookReader &reader) : HtmlTagAction(reader) {
-}
-
-void HtmlPreTagAction::run(const HtmlReader::HtmlTag &tag) {
- bookReader().endParagraph();
- myReader.myIsPreformatted = tag.Start;
- myReader.mySpaceCounter = -1;
- myReader.myBreakCounter = 0;
- if (myReader.myFormat.breakType() == PlainTextFormat::BREAK_PARAGRAPH_AT_NEW_LINE) {
- if (tag.Start) {
- bookReader().pushKind(PREFORMATTED);
- } else {
- bookReader().popKind();
- }
- }
- bookReader().beginParagraph();
-}
-
-HtmlListTagAction::HtmlListTagAction(HtmlBookReader &reader, int startIndex) : HtmlTagAction(reader), myStartIndex(startIndex) {
-}
-
-void HtmlListTagAction::run(const HtmlReader::HtmlTag &tag) {
- if (tag.Start) {
- myReader.myListNumStack.push(myStartIndex);
- } else if (!myReader.myListNumStack.empty()) {
- myReader.myListNumStack.pop();
- }
-}
-
-HtmlListItemTagAction::HtmlListItemTagAction(HtmlBookReader &reader) : HtmlTagAction(reader) {
-}
-
-void HtmlListItemTagAction::run(const HtmlReader::HtmlTag &tag) {
- if (tag.Start) {
- bookReader().endParagraph();
- bookReader().beginParagraph();
- if (!myReader.myListNumStack.empty()) {
- bookReader().addFixedHSpace(3 * myReader.myListNumStack.size());
- int &index = myReader.myListNumStack.top();
- if (index == 0) {
- myReader.addConvertedDataToBuffer("\342\200\242 ", 4, false);
- } else {
- std::string number;
- ZLStringUtil::appendNumber(number, index++);
- number += ". ";
- myReader.addConvertedDataToBuffer(number.data(), number.length(), false);
- }
- myReader.myDontBreakParagraph = true;
- }
- } else {
- myReader.myDontBreakParagraph = false;
- }
-}
-
-HtmlTableTagAction::HtmlTableTagAction(HtmlBookReader &reader) : HtmlTagAction(reader) {
-}
-
-void HtmlTableTagAction::run(const HtmlReader::HtmlTag &tag) {
- if (tag.Start) {
- myReader.myIgnoreTitles = true;
- } else {
- myReader.myIgnoreTitles = false;
- }
-}
-
-HtmlStyleTagAction::HtmlStyleTagAction(HtmlBookReader &reader) : HtmlTagAction(reader) {
-}
-
-void HtmlStyleTagAction::run(const HtmlReader::HtmlTag &tag) {
- myReader.myStyleSheetParser = tag.Start ? new StyleSheetTableParser(myReader.myStyleSheetTable) : 0;
- /*
- if (!tag.Start) {
- myReader.myStyleSheetTable.dump();
- }
- */
-}
-
-shared_ptr<HtmlTagAction> HtmlBookReader::createAction(const std::string &tag) {
- if (tag == "EM") {
- return new HtmlControlTagAction(*this, EMPHASIS);
- } else if (tag == "STRONG") {
- return new HtmlControlTagAction(*this, STRONG);
- } else if (tag == "B") {
- return new HtmlControlTagAction(*this, BOLD);
- } else if (tag == "I") {
- return new HtmlControlTagAction(*this, ITALIC);
- } else if (tag == "TT") {
- return new HtmlControlTagAction(*this, CODE);
- } else if (tag == "CODE") {
- return new HtmlControlTagAction(*this, CODE);
- } else if (tag == "CITE") {
- return new HtmlControlTagAction(*this, CITE);
- } else if (tag == "SUB") {
- return new HtmlControlTagAction(*this, SUB);
- } else if (tag == "SUP") {
- return new HtmlControlTagAction(*this, SUP);
- } else if (tag == "H1") {
- return new HtmlHeaderTagAction(*this, H1);
- } else if (tag == "H2") {
- return new HtmlHeaderTagAction(*this, H2);
- } else if (tag == "H3") {
- return new HtmlHeaderTagAction(*this, H3);
- } else if (tag == "H4") {
- return new HtmlHeaderTagAction(*this, H4);
- } else if (tag == "H5") {
- return new HtmlHeaderTagAction(*this, H5);
- } else if (tag == "H6") {
- return new HtmlHeaderTagAction(*this, H6);
- } else if (tag == "HEAD") {
- return new HtmlIgnoreTagAction(*this);
- } else if (tag == "TITLE") {
- return new HtmlIgnoreTagAction(*this);
- } else if (tag == "STYLE") {
- return new HtmlStyleTagAction(*this);
- } else if (tag == "SELECT") {
- return new HtmlIgnoreTagAction(*this);
- } else if (tag == "SCRIPT") {
- return new HtmlIgnoreTagAction(*this);
- } else if (tag == "A") {
- return new HtmlHrefTagAction(*this);
- } else if (tag == "TD") {
- //return new HtmlBreakTagAction(*this, HtmlBreakTagAction::BREAK_AT_END);
- } else if (tag == "TR") {
- return new HtmlBreakTagAction(*this, HtmlBreakTagAction::BREAK_AT_END);
- } else if (tag == "DIV") {
- return new HtmlBreakTagAction(*this, HtmlBreakTagAction::BREAK_AT_END);
- } else if (tag == "DT") {
- return new HtmlBreakTagAction(*this, HtmlBreakTagAction::BREAK_AT_START);
- } else if (tag == "P") {
- return new HtmlBreakTagAction(*this, HtmlBreakTagAction::BREAK_AT_START_AND_AT_END);
- } else if (tag == "BR") {
- return new HtmlBreakTagAction(*this, HtmlBreakTagAction::BREAK_AT_START_AND_AT_END);
- } else if (tag == "IMG") {
- return new HtmlImageTagAction(*this);
- } else if (tag == "UL") {
- return new HtmlListTagAction(*this, 0);
- } else if (tag == "MENU") {
- return new HtmlListTagAction(*this, 0);
- } else if (tag == "DIR") {
- return new HtmlListTagAction(*this, 0);
- } else if (tag == "OL") {
- return new HtmlListTagAction(*this, 1);
- } else if (tag == "LI") {
- return new HtmlListItemTagAction(*this);
- } else if (tag == "PRE") {
- if (myProcessPreTag) {
- return new HtmlPreTagAction(*this);
- }
- } else if (tag == "TABLE") {
- return new HtmlTableTagAction(*this);
- }
- /*
- } else if (tag == "DD") {
- return 0;
- } else if (tag == "DL") {
- return 0;
- } else if (tag == "DFN") {
- return 0;
- } else if (tag == "SAMP") {
- return 0;
- } else if (tag == "KBD") {
- return 0;
- } else if (tag == "VAR") {
- return 0;
- } else if (tag == "ABBR") {
- return 0;
- } else if (tag == "ACRONYM") {
- return 0;
- } else if (tag == "BLOCKQUOTE") {
- return 0;
- } else if (tag == "Q") {
- return 0;
- } else if (tag == "INS") {
- return 0;
- } else if (tag == "DEL") {
- return 0;
- } else if (tag == "BODY") {
- return 0;
- */
- return new DummyHtmlTagAction(*this);
-}
-
-void HtmlBookReader::setBuildTableOfContent(bool build) {
- myBuildTableOfContent = build;
-}
-
-void HtmlBookReader::setProcessPreTag(bool process) {
- myProcessPreTag = process;
-}
-
-HtmlBookReader::HtmlBookReader(const std::string &baseDirectoryPath, BookModel &model, const PlainTextFormat &format, const std::string &encoding) : HtmlReader(encoding), myBookReader(model), myBaseDirPath(baseDirectoryPath), myFormat(format), myBuildTableOfContent(true), myProcessPreTag(true) {
-}
-
-HtmlBookReader::~HtmlBookReader() {
-}
-
-void HtmlBookReader::addConvertedDataToBuffer(const char *text, std::size_t len, bool convert) {
- if (len > 0) {
- if (myDontBreakParagraph) {
- while (len > 0 && std::isspace(*text)) {
- --len;
- ++text;
- }
- if (len == 0) {
- return;
- }
- }
- if (convert) {
- myConverter->convert(myConverterBuffer, text, text + len);
- myBookReader.addData(myConverterBuffer);
- myBookReader.addContentsData(myConverterBuffer);
- myConverterBuffer.erase();
- } else {
- std::string strText(text, len);
- myBookReader.addData(strText);
- myBookReader.addContentsData(strText);
- }
- myDontBreakParagraph = false;
- }
-}
-
-bool HtmlBookReader::tagHandler(const HtmlTag &tag) {
- myConverter->reset();
-
- for (unsigned int i = 0; i < tag.Attributes.size(); ++i) {
- if (tag.Attributes[i].Name == "ID") {
- myBookReader.addHyperlinkLabel(tag.Attributes[i].Value);
- break;
- }
- }
- shared_ptr<HtmlTagAction> action = myActionMap[tag.Name];
- if (action.isNull()) {
- action = createAction(tag.Name);
- myActionMap[tag.Name] = action;
- }
- action->run(tag);
-
- return true;
-}
-
-void HtmlBookReader::preformattedCharacterDataHandler(const char *text, std::size_t len, bool convert) {
- const char *start = text;
- const char *end = text + len;
-
- int breakType = myFormat.breakType();
- if (breakType & PlainTextFormat::BREAK_PARAGRAPH_AT_NEW_LINE) {
- for (const char *ptr = text; ptr != end; ++ptr) {
- if (*ptr == '\n') {
- mySpaceCounter = 0;
- if (start < ptr) {
- addConvertedDataToBuffer(start, ptr - start, convert);
- } else {
- static const std::string SPACE = " ";
- myBookReader.addData(SPACE);
- }
- myBookReader.endParagraph();
- myBookReader.beginParagraph();
- start = ptr + 1;
- } else if (mySpaceCounter >= 0) {
- if (std::isspace((unsigned char)*ptr)) {
- ++mySpaceCounter;
- } else {
- myBookReader.addFixedHSpace(mySpaceCounter);
- mySpaceCounter = -1;
- }
- }
- }
- addConvertedDataToBuffer(start, end - start, convert);
- } else if (breakType & PlainTextFormat::BREAK_PARAGRAPH_AT_LINE_WITH_INDENT) {
- for (const char *ptr = text; ptr != end; ++ptr) {
- if (std::isspace((unsigned char)*ptr)) {
- if (*ptr == '\n') {
- mySpaceCounter = 0;
- } else if (mySpaceCounter >= 0) {
- ++mySpaceCounter;
- }
- } else {
- if (mySpaceCounter > myFormat.ignoredIndent()) {
- if (ptr - start > mySpaceCounter) {
- addConvertedDataToBuffer(start, ptr - start - mySpaceCounter, convert);
- myBookReader.endParagraph();
- myBookReader.beginParagraph();
- }
- start = ptr;
- }
- mySpaceCounter = -1;
- }
- }
- mySpaceCounter = std::max(mySpaceCounter, 0);
- if (end - start > mySpaceCounter) {
- addConvertedDataToBuffer(start, end - start - mySpaceCounter, convert);
- }
- } else if (breakType & PlainTextFormat::BREAK_PARAGRAPH_AT_EMPTY_LINE) {
- for (const char *ptr = start; ptr != end; ++ptr) {
- if (std::isspace((unsigned char)*ptr)) {
- if (*ptr == '\n') {
- ++myBreakCounter;
- }
- } else {
- if (myBreakCounter > 1) {
- addConvertedDataToBuffer(start, ptr - start, convert);
- myBookReader.endParagraph();
- myBookReader.beginParagraph();
- start = ptr;
- }
- myBreakCounter = 0;
- }
- }
- addConvertedDataToBuffer(start, end - start, convert);
- }
-}
-
-bool HtmlBookReader::characterDataHandler(const char *text, std::size_t len, bool convert) {
- if (!myStyleSheetParser.isNull()) {
- myStyleSheetParser->parse(text, len);
- return true;
- }
-
- if (myIgnoreDataCounter != 0) {
- return true;
- }
-
- if (myIsPreformatted) {
- preformattedCharacterDataHandler(text, len, convert);
- return true;
- }
-
- const char *ptr = text;
- const char *end = text + len;
- if (!myIsStarted) {
- for (; ptr != end; ++ptr) {
- if (!std::isspace((unsigned char)*ptr)) {
- myIsStarted = true;
- break;
- }
- }
- }
- if (myIsStarted) {
- addConvertedDataToBuffer(ptr, end - ptr, convert);
- }
- return true;
-}
-
-void HtmlBookReader::startDocumentHandler() {
- while (!myListNumStack.empty()) {
- myListNumStack.pop();
- }
- myConverterBuffer.erase();
- myKindList.clear();
-
- myBookReader.reset();
- myBookReader.setMainTextModel();
- myBookReader.pushKind(REGULAR);
- myBookReader.beginParagraph();
- myIgnoreDataCounter = 0;
- myIsPreformatted = false;
- myDontBreakParagraph = false;
- for (std::map<std::string,shared_ptr<HtmlTagAction> >::const_iterator it = myActionMap.begin(); it != myActionMap.end(); ++it) {
- it->second->reset();
- }
- myIsStarted = false;
- myIgnoreTitles = false;
-
- myStyleSheetParser = 0;
-
- mySpaceCounter = -1;
- myBreakCounter = 0;
-}
-
-void HtmlBookReader::endDocumentHandler() {
- myBookReader.endParagraph();
-}
-
-void HtmlBookReader::setFileName(const std::string fileName) {
- myFileName = fileName;
-}
diff --git a/fbreader/src/formats/html/HtmlBookReader.h b/fbreader/src/formats/html/HtmlBookReader.h
deleted file mode 100644
index c8d4e32..0000000
--- a/fbreader/src/formats/html/HtmlBookReader.h
+++ /dev/null
@@ -1,101 +0,0 @@
-/*
- * Copyright (C) 2004-2012 Geometer Plus <[email protected]>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
- * 02110-1301, USA.
- */
-
-#ifndef __HTMLBOOKREADER_H__
-#define __HTMLBOOKREADER_H__
-
-#include <stack>
-
-#include <shared_ptr.h>
-
-#include "HtmlReader.h"
-#include "../../bookmodel/BookReader.h"
-#include "../css/StyleSheetTable.h"
-
-class BookModel;
-class PlainTextFormat;
-class StyleSheetParser;
-
-class HtmlTagAction;
-
-class HtmlBookReader : public HtmlReader {
-
-public:
- HtmlBookReader(const std::string &baseDirectoryPath, BookModel &model, const PlainTextFormat &format, const std::string &encoding);
- ~HtmlBookReader();
- void setFileName(const std::string fileName);
-
-protected:
- virtual shared_ptr<HtmlTagAction> createAction(const std::string &tag);
- void setBuildTableOfContent(bool build);
- void setProcessPreTag(bool process);
-
-protected:
- void startDocumentHandler();
- void endDocumentHandler();
- bool tagHandler(const HtmlTag &tag);
- bool characterDataHandler(const char *text, std::size_t len, bool convert);
-
-private:
- void preformattedCharacterDataHandler(const char *text, std::size_t len, bool convert);
- void addConvertedDataToBuffer(const char *text, std::size_t len, bool convert);
-
-protected:
- BookReader myBookReader;
- std::string myBaseDirPath;
-
-private:
- const PlainTextFormat &myFormat;
- int myIgnoreDataCounter;
- bool myIsPreformatted;
- bool myDontBreakParagraph;
-
- bool myIsStarted;
- bool myBuildTableOfContent;
- bool myProcessPreTag;
- bool myIgnoreTitles;
- std::stack<int> myListNumStack;
-
- StyleSheetTable myStyleSheetTable;
- shared_ptr<StyleSheetParser> myStyleSheetParser;
-
- int mySpaceCounter;
- int myBreakCounter;
- std::string myConverterBuffer;
-
- std::map<std::string,shared_ptr<HtmlTagAction> > myActionMap;
- std::vector<FBTextKind> myKindList;
-
- std::string myFileName;
-
- friend class HtmlTagAction;
- friend class HtmlControlTagAction;
- friend class HtmlHeaderTagAction;
- friend class HtmlIgnoreTagAction;
- friend class HtmlHrefTagAction;
- friend class HtmlImageTagAction;
- friend class HtmlBreakTagAction;
- friend class HtmlPreTagAction;
- friend class HtmlListTagAction;
- friend class HtmlListItemTagAction;
- friend class HtmlTableTagAction;
- friend class HtmlStyleTagAction;
-};
-
-#endif /* __HTMLBOOKREADER_H__ */
diff --git a/fbreader/src/formats/html/HtmlDescriptionReader.cpp b/fbreader/src/formats/html/HtmlDescriptionReader.cpp
deleted file mode 100644
index 6ebcb8b..0000000
--- a/fbreader/src/formats/html/HtmlDescriptionReader.cpp
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * Copyright (C) 2004-2012 Geometer Plus <[email protected]>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
- * 02110-1301, USA.
- */
-
-#include "HtmlDescriptionReader.h"
-
-#include "../../library/Book.h"
-
-HtmlDescriptionReader::HtmlDescriptionReader(Book &book) : HtmlReader(book.encoding()), myBook(book) {
- myBook.setTitle("");
-}
-
-void HtmlDescriptionReader::startDocumentHandler() {
- myReadTitle = false;
-}
-
-void HtmlDescriptionReader::endDocumentHandler() {
- if (!myBook.title().empty()) {
- const char *titleStart = myBook.title().data();
- const char *titleEnd = titleStart + myBook.title().length();
- std::string newTitle;
- myConverter->convert(newTitle, titleStart, titleEnd);
- myBook.setTitle(newTitle);
- }
-}
-
-bool HtmlDescriptionReader::tagHandler(const HtmlTag &tag) {
- if (tag.Name == "TITLE") {
- if (myReadTitle && !tag.Start) {
- myBook.setTitle(myBuffer);
- myBuffer.erase();
- }
- myReadTitle = tag.Start && myBook.title().empty();
- return true;
- } else if (tag.Start && tag.Name == "META") {
- std::vector<HtmlAttribute>::const_iterator it = tag.Attributes.begin();
- for (; it != tag.Attributes.end(); ++it) {
- if (it->Name == "CONTENT") {
- break;
- }
- }
- if (it != tag.Attributes.end()) {
- const std::string prefix = "charset=";
- std::size_t index = it->Value.find(prefix);
- if (index != std::string::npos) {
- std::string charset = it->Value.substr(index + prefix.length());
- index = charset.find(';');
- if (index != std::string::npos) {
- charset = charset.substr(0, index);
- }
- index = charset.find(' ');
- if (index != std::string::npos) {
- charset = charset.substr(0, index);
- }
- myBook.setEncoding(charset);
- }
- }
- }
- return tag.Name != "BODY";
-}
-
-bool HtmlDescriptionReader::characterDataHandler(const char *text, std::size_t len, bool) {
- if (myReadTitle) {
- myBuffer.append(text, len);
- }
- return true;
-}
diff --git a/fbreader/src/formats/html/HtmlDescriptionReader.h b/fbreader/src/formats/html/HtmlDescriptionReader.h
deleted file mode 100644
index 159d4b0..0000000
--- a/fbreader/src/formats/html/HtmlDescriptionReader.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Copyright (C) 2004-2012 Geometer Plus <[email protected]>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
- * 02110-1301, USA.
- */
-
-#ifndef __HTMLDESCRIPTIONREADER_H__
-#define __HTMLDESCRIPTIONREADER_H__
-
-#include "HtmlReader.h"
-
-class Book;
-
-class HtmlDescriptionReader : public HtmlReader {
-
-public:
- HtmlDescriptionReader(Book &book);
- ~HtmlDescriptionReader();
-
-protected:
- void startDocumentHandler();
- void endDocumentHandler();
-
- bool tagHandler(const HtmlTag &tag);
- bool characterDataHandler(const char *text, std::size_t len, bool convert);
-
-private:
- bool myReadTitle;
- std::string myBuffer;
- Book &myBook;
-};
-
-inline HtmlDescriptionReader::~HtmlDescriptionReader() {}
-
-#endif /* __HTMLDESCRIPTIONREADER_H__ */
diff --git a/fbreader/src/formats/html/HtmlEntityCollection.cpp b/fbreader/src/formats/html/HtmlEntityCollection.cpp
deleted file mode 100644
index bd1bb4e..0000000
--- a/fbreader/src/formats/html/HtmlEntityCollection.cpp
+++ /dev/null
@@ -1,71 +0,0 @@
-/*
- * Copyright (C) 2004-2012 Geometer Plus <[email protected]>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
- * 02110-1301, USA.
- */
-
-#include <cstdlib>
-#include <cctype>
-
-#include <ZLibrary.h>
-#include <ZLFile.h>
-#include <ZLXMLReader.h>
-
-#include "HtmlEntityCollection.h"
-
-class CollectionReader : public ZLXMLReader {
-
-public:
- CollectionReader(std::map<std::string,int> &collection);
- void startElementHandler(const char *tag, const char **attributes);
-
-private:
- std::map<std::string,int> &myCollection;
-};
-
-std::map<std::string,int> HtmlEntityCollection::ourCollection;
-
-int HtmlEntityCollection::symbolNumber(const std::string &name) {
- if (ourCollection.empty()) {
- CollectionReader(ourCollection).readDocument(ZLFile(
- ZLibrary::ApplicationDirectory() + ZLibrary::FileNameDelimiter +
- "formats" + ZLibrary::FileNameDelimiter +
- "html" + ZLibrary::FileNameDelimiter + "html.ent"
- ));
- }
- std::map<std::string,int>::const_iterator it = ourCollection.find(name);
- return it == ourCollection.end() ? 0 : it->second;
-}
-
-CollectionReader::CollectionReader(std::map<std::string,int> &collection) : myCollection(collection) {
-}
-
-void CollectionReader::startElementHandler(const char *tag, const char **attributes) {
- static const std::string ENTITY = "entity";
-
- if (ENTITY == tag) {
- for (int i = 0; i < 4; ++i) {
- if (attributes[i] == 0) {
- return;
- }
- }
- static const std::string _name = "name";
- static const std::string _number = "number";
- if (_name == attributes[0] && _number == attributes[2]) {
- myCollection[attributes[1]] = std::atoi(attributes[3]);
- }
- }
-}
diff --git a/fbreader/src/formats/html/HtmlEntityCollection.h b/fbreader/src/formats/html/HtmlEntityCollection.h
deleted file mode 100644
index 6f70491..0000000
--- a/fbreader/src/formats/html/HtmlEntityCollection.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * Copyright (C) 2004-2012 Geometer Plus <[email protected]>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
- * 02110-1301, USA.
- */
-
-#ifndef __HTMLENTITYCOLLECTION_H__
-#define __HTMLENTITYCOLLECTION_H__
-
-#include <string>
-#include <map>
-
-class HtmlEntityCollection {
-
-public:
- static int symbolNumber(const std::string &name);
-
-private:
- static std::map<std::string,int> ourCollection;
-
-private:
- HtmlEntityCollection();
-};
-
-#endif /* __HTMLENTITYCOLLECTION_H__ */
diff --git a/fbreader/src/formats/html/HtmlPlugin.cpp b/fbreader/src/formats/html/HtmlPlugin.cpp
deleted file mode 100644
index 279e096..0000000
--- a/fbreader/src/formats/html/HtmlPlugin.cpp
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * Copyright (C) 2004-2012 Geometer Plus <[email protected]>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
- * 02110-1301, USA.
- */
-
-#include <ZLStringUtil.h>
-#include <ZLFile.h>
-#include <ZLInputStream.h>
-
-#include "HtmlPlugin.h"
-#include "HtmlDescriptionReader.h"
-#include "HtmlBookReader.h"
-#include "HtmlReaderStream.h"
-#include "../txt/PlainTextFormat.h"
-#include "../util/MiscUtil.h"
-#include "../../library/Book.h"
-#include "../../bookmodel/BookModel.h"
-
-bool HtmlPlugin::acceptsFile(const ZLFile &file) const {
- const std::string &extension = file.extension();
- return ZLStringUtil::stringEndsWith(extension, "html") || (extension == "htm");
-}
-
-bool HtmlPlugin::readMetaInfo(Book &book) const {
- shared_ptr<ZLInputStream> stream = book.file().inputStream();
- if (stream.isNull()) {
- return false;
- }
-
- shared_ptr<ZLInputStream> htmlStream = new HtmlReaderStream(stream, 50000);
- detectEncodingAndLanguage(book, *htmlStream);
- if (book.encoding().empty()) {
- return false;
- }
- HtmlDescriptionReader(book).readDocument(*stream);
-
- return true;
-}
-
-bool HtmlPlugin::readModel(BookModel &model) const {
- const Book& book = *model.book();
- const ZLFile &file = book.file();
- shared_ptr<ZLInputStream> stream = file.inputStream();
- if (stream.isNull()) {
- return false;
- }
-
- PlainTextFormat format(file);
- if (!format.initialized()) {
- PlainTextFormatDetector detector;
- detector.detect(*stream, format);
- }
-
- std::string directoryPrefix = MiscUtil::htmlDirectoryPrefix(file.path());
- HtmlBookReader reader(directoryPrefix, model, format, book.encoding());
- reader.setFileName(MiscUtil::htmlFileName(file.path()));
- reader.readDocument(*stream);
-
- return true;
-}
-
-FormatInfoPage *HtmlPlugin::createInfoPage(ZLOptionsDialog &dialog, const ZLFile &file) {
- return new PlainTextInfoPage(dialog, file, ZLResourceKey("<PRE>"), false);
-}
-
-bool HtmlPlugin::readLanguageAndEncoding(Book &book) const {
- (void)book;
- return true;
-}
diff --git a/fbreader/src/formats/html/HtmlPlugin.h b/fbreader/src/formats/html/HtmlPlugin.h
deleted file mode 100644
index c66a108..0000000
--- a/fbreader/src/formats/html/HtmlPlugin.h
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * Copyright (C) 2004-2012 Geometer Plus <[email protected]>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
- * 02110-1301, USA.
- */
-
-#ifndef __HTMLPLUGIN_H__
-#define __HTMLPLUGIN_H__
-
-#include "../FormatPlugin.h"
-
-// FormatPlugin implementation for HTML files.
-class HtmlPlugin : public FormatPlugin {
-
-public:
-	HtmlPlugin();
-	~HtmlPlugin();
-
-	// Always false for this plugin (see the inline definition in this header).
-	bool providesMetaInfo() const;
-	// Tells whether the plugin handles the given file.
-	bool acceptsFile(const ZLFile &file) const;
-
-	bool readMetaInfo(Book &book) const;
-	bool readLanguageAndEncoding(Book &book) const;
-	bool readModel(BookModel &model) const;
-
-	// Returns a newly allocated options page for the file.
-	FormatInfoPage *createInfoPage(ZLOptionsDialog &dialog, const ZLFile &file);
-};
-
-// Trivial constructor and destructor.
-inline HtmlPlugin::HtmlPlugin() {
-}
-inline HtmlPlugin::~HtmlPlugin() {
-}
-// This plugin reports that it does not provide meta information.
-inline bool HtmlPlugin::providesMetaInfo() const {
-	return false;
-}
-
-#endif /* __HTMLPLUGIN_H__ */
diff --git a/fbreader/src/formats/html/HtmlReader.cpp b/fbreader/src/formats/html/HtmlReader.cpp
deleted file mode 100644
index a5ce7fa..0000000
--- a/fbreader/src/formats/html/HtmlReader.cpp
+++ /dev/null
@@ -1,373 +0,0 @@
-/*
- * Copyright (C) 2004-2012 Geometer Plus <[email protected]>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
- * 02110-1301, USA.
- */
-
-#include <algorithm>
-#include <cctype>
-
-#include <ZLInputStream.h>
-#include <ZLXMLReader.h>
-#include <ZLFile.h>
-#include <ZLStringUtil.h>
-#include <ZLUnicodeUtil.h>
-
-#include "HtmlReader.h"
-#include "HtmlEntityCollection.h"
-
-// Forwards the encoding name to the EncodedTextReader base.
-HtmlReader::HtmlReader(const std::string &encoding)
-	: EncodedTextReader(encoding) {}
-
-// Nothing to release at this level.
-HtmlReader::~HtmlReader() {}
-
-// Re-initializes 'tag' from the raw tag name found between '<' and the first
-// delimiter. Attributes are cleared; a leading '/' marks a closing tag
-// (Start == false) and is stripped; the name is stored upper-cased.
-// For an empty name only Name is reset and Start keeps its previous value.
-void HtmlReader::setTag(HtmlTag &tag, const std::string &name) {
-	tag.Attributes.clear();
-
-	if (name.empty()) {
-		tag.Name = name;
-		return;
-	}
-
-	tag.Start = name[0] != '/';
-	if (tag.Start) {
-		tag.Name = name;
-	} else {
-		tag.Name = name.substr(1);
-	}
-
-	const std::size_t len = tag.Name.length();
-	for (std::size_t i = 0; i < len; ++i) {
-		// std::toupper requires a value representable as unsigned char;
-		// passing a negative plain char (non-ASCII byte) is undefined.
-		tag.Name[i] = (char)std::toupper((unsigned char)tag.Name[i]);
-	}
-}
-
-// States of the scanner in HtmlReader::readDocument().
-enum ParseState {
-	PS_TEXT,                       // plain character data between tags
-	PS_TAGSTART,                   // just consumed '<'
-	PS_TAGNAME,                    // collecting the tag name
-	PS_WAIT_END_OF_TAG,            // after '/', skipping up to '>'
-	PS_ATTRIBUTENAME,              // collecting an attribute name
-	PS_ATTRIBUTEVALUE,             // collecting an attribute value
-	PS_SKIPTAG,                    // tag with empty name, skipping up to '>'
-	PS_COMMENT,                    // after "<!", looking for "-->"
-	PS_SPECIAL,                    // '&' entity inside text
-	PS_SPECIAL_IN_ATTRIBUTEVALUE   // '&' entity inside an attribute value
-};
-
-// Kind of '&' entity currently being scanned.
-enum SpecialType {
-	ST_UNKNOWN,   // nothing seen after '&' yet
-	ST_NUM,       // "&#" seen, radix not decided yet
-	ST_NAME,      // named entity (letters)
-	ST_DEC,       // "&#" followed by decimal digits
-	ST_HEX        // "&#x" followed by hexadecimal digits
-};
-
-// Tells whether 'ch' may continue an entity of the given type:
-// letters for ST_NAME, decimal digits for ST_DEC, hex digits for ST_HEX.
-// Any other type accepts nothing.
-static bool allowSymbol(SpecialType type, char ch) {
-	// <cctype> classifiers are undefined for negative arguments other than
-	// EOF, so bytes >= 0x80 must be widened through unsigned char first.
-	const unsigned char uch = (unsigned char)ch;
-	switch (type) {
-		case ST_NAME:
-			return std::isalpha(uch) != 0;
-		case ST_DEC:
-			return std::isdigit(uch) != 0;
-		case ST_HEX:
-			return std::isxdigit(uch) != 0;
-		default:
-			return false;
-	}
-}
-
-// Converts the entity text collected between '&' and ';' into a character
-// number. 'txt' is the name for ST_NAME, "#<digits>" for ST_DEC and
-// "#x<hex digits>" for ST_HEX.
-// Returns 0 when the entity is unknown or malformed, or when the number lies
-// outside the Unicode code point range (1..0x10FFFF); the caller then emits
-// the entity text literally.
-static int specialSymbolNumber(SpecialType type, const std::string &txt) {
-	std::size_t prefixLength = 0;
-	int radix = 10;
-	switch (type) {
-		case ST_NAME:
-			return HtmlEntityCollection::symbolNumber(txt);
-		case ST_DEC:
-			prefixLength = 1; // skip '#'
-			radix = 10;
-			break;
-		case ST_HEX:
-			prefixLength = 2; // skip "#x"
-			radix = 16;
-			break;
-		default:
-			return 0;
-	}
-
-	// Never step past the string terminator.
-	if (txt.length() <= prefixLength) {
-		return 0;
-	}
-
-	const long value = std::strtol(txt.c_str() + prefixLength, 0, radix);
-	// Reject overflowed or otherwise impossible values before narrowing to int.
-	if (value <= 0 || value > 0x10FFFFL) {
-		return 0;
-	}
-	return (int)value;
-}
-
-// Moves 'from' onto the end of 'to': the bytes are run through myConverter
-// when one is set (and the converter is reset afterwards), otherwise they are
-// appended unchanged. 'from' is emptied in both cases.
-void HtmlReader::appendString(std::string &to, std::string &from) {
-	if (!myConverter.isNull()) {
-		myConverter->convert(to, from);
-		myConverter->reset();
-	} else {
-		to += from;
-	}
-	from.erase();
-}
-
-// Parses the whole stream with a hand-written state machine and reports the
-// result through the virtual handlers:
-//   - startDocumentHandler() / endDocumentHandler() bracket the parse,
-//   - characterDataHandler() receives text runs and decoded entities,
-//   - tagHandler() receives each tag; a tag containing '/' after its name or
-//     attributes is reported twice, first with Start == true, then false.
-// A handler returning false (where checked) stops the parse early.
-// Does nothing if the stream cannot be opened; closes the stream on exit.
-// Input is consumed in BUFSIZE chunks; partially scanned names, values and
-// entities are carried across chunk boundaries in the std::string locals.
-void HtmlReader::readDocument(ZLInputStream &stream) {
-	if (!stream.open()) {
-		return;
-	}
-
-	startDocumentHandler();
-
-	ParseState state = PS_TEXT;
-	SpecialType state_special = ST_UNKNOWN;
-	std::string currentString;
-	std::string attributeValueString;
-	std::string specialString;
-	int quotationCounter = 0;
-	HtmlTag currentTag;
-	// Last two characters seen inside "<!...", used to detect "-->".
-	char endOfComment[2] = { '\0', '\0' };
-
-	const std::size_t BUFSIZE = 2048;
-	char *buffer = new char[BUFSIZE];
-	std::size_t length;
-	std::size_t offset = 0;
-	do {
-		length = stream.read(buffer, BUFSIZE);
-		// 'start' marks the beginning of the not yet consumed run in this chunk.
-		char *start = buffer;
-		char *endOfBuffer = buffer + length;
-		for (char *ptr = buffer; ptr < endOfBuffer; ++ptr) {
-			switch (state) {
-				case PS_TEXT:
-					if (*ptr == '<') {
-						if (!characterDataHandler(start, ptr - start, true)) {
-							goto endOfProcessing;
-						}
-						start = ptr + 1;
-						state = PS_TAGSTART;
-						currentTag.Offset = offset + (ptr - buffer);
-					} else if (*ptr == '&') {
-						if (!characterDataHandler(start, ptr - start, true)) {
-							goto endOfProcessing;
-						}
-						start = ptr + 1;
-						state = PS_SPECIAL;
-						state_special = ST_UNKNOWN;
-					}
-					break;
-				case PS_SPECIAL:
-				case PS_SPECIAL_IN_ATTRIBUTEVALUE:
-					if (state_special == ST_UNKNOWN) {
-						if (*ptr == '#') {
-							state_special = ST_NUM;
-						} else if (std::isalpha((unsigned char)*ptr)) {
-							state_special = ST_NAME;
-						} else {
-							// Not an entity: drop anything carried over from a
-							// previous chunk so it cannot leak into the next entity.
-							specialString.erase();
-							start = ptr;
-							state = (state == PS_SPECIAL) ? PS_TEXT : PS_ATTRIBUTEVALUE;
-						}
-					} else if (state_special == ST_NUM) {
-						if (*ptr == 'x') {
-							state_special = ST_HEX;
-						} else if (std::isdigit((unsigned char)*ptr)) {
-							state_special = ST_DEC;
-						} else {
-							specialString.erase();
-							start = ptr;
-							state = (state == PS_SPECIAL) ? PS_TEXT : PS_ATTRIBUTEVALUE;
-						}
-					} else {
-						if (*ptr == ';') {
-							specialString.append(start, ptr - start);
-							int number = specialSymbolNumber(state_special, specialString);
-							if ((128 <= number) && (number <= 159)) {
-								// Numbers 128..159 are passed on as a single byte
-								// in the document encoding rather than as a code point.
-								char ch = number;
-								if (state == PS_SPECIAL) {
-									characterDataHandler(&ch, 1, true);
-								} else if (myConverter.isNull()) {
-									// Same fallback as appendString(): no converter, raw byte.
-									attributeValueString.append(1, ch);
-								} else {
-									myConverter->convert(attributeValueString, &ch, &ch + 1);
-								}
-							} else if (number != 0) {
-								char utf8Buffer[4];
-								int len = ZLUnicodeUtil::ucs4ToUtf8(utf8Buffer, number);
-								if (state == PS_SPECIAL) {
-									characterDataHandler(utf8Buffer, len, false);
-								} else {
-									attributeValueString.append(utf8Buffer, len);
-								}
-							} else {
-								// Unknown entity: emit it literally.
-								specialString = "&" + specialString + ";";
-								if (state == PS_SPECIAL) {
-									characterDataHandler(specialString.c_str(), specialString.length(), false);
-								} else {
-									attributeValueString += specialString;
-								}
-							}
-							specialString.erase();
-							start = ptr + 1;
-							state = (state == PS_SPECIAL) ? PS_TEXT : PS_ATTRIBUTEVALUE;
-						} else if (!allowSymbol(state_special, *ptr)) {
-							// NOTE(review): the text of an unterminated entity
-							// ("&abc ") is discarded here, as in the original code.
-							specialString.erase();
-							start = ptr;
-							state = (state == PS_SPECIAL) ? PS_TEXT : PS_ATTRIBUTEVALUE;
-						}
-					}
-					break;
-				case PS_TAGSTART:
-					state = (*ptr == '!') ? PS_COMMENT : PS_TAGNAME;
-					break;
-				case PS_COMMENT:
-					if ((endOfComment[0] == '\0') && (*ptr != '-')) {
-						// "<!" not followed by "--": treat as an ordinary tag.
-						endOfComment[1] = '\0';
-						state = PS_TAGNAME;
-					} else if ((endOfComment[0] == '-') && (endOfComment[1] == '-') && (*ptr == '>')) {
-						start = ptr + 1;
-						state = PS_TEXT;
-						endOfComment[0] = '\0';
-						endOfComment[1] = '\0';
-					} else {
-						endOfComment[0] = endOfComment[1];
-						endOfComment[1] = *ptr;
-					}
-					break;
-				case PS_WAIT_END_OF_TAG:
-					if (*ptr == '>') {
-						start = ptr + 1;
-						state = PS_TEXT;
-					}
-					break;
-				case PS_TAGNAME:
-					if (*ptr == '>' || *ptr == '/' || std::isspace((unsigned char)*ptr)) {
-						currentString.append(start, ptr - start);
-						start = ptr + 1;
-						setTag(currentTag, currentString);
-						currentString.erase();
-						if (currentTag.Name == "") {
-							state = *ptr == '>' ? PS_TEXT : PS_SKIPTAG;
-						} else {
-							if (*ptr == '>') {
-								if (!tagHandler(currentTag)) {
-									goto endOfProcessing;
-								}
-								state = PS_TEXT;
-							} else if (*ptr == '/') {
-								if (!tagHandler(currentTag)) {
-									goto endOfProcessing;
-								}
-								currentTag.Start = false;
-								if (!tagHandler(currentTag)) {
-									goto endOfProcessing;
-								}
-								state = PS_WAIT_END_OF_TAG;
-							} else {
-								state = PS_ATTRIBUTENAME;
-							}
-						}
-					}
-					break;
-				case PS_ATTRIBUTENAME:
-					if (*ptr == '>' || *ptr == '/' || *ptr == '=' || std::isspace((unsigned char)*ptr)) {
-						if (ptr != start || !currentString.empty()) {
-							currentString.append(start, ptr - start);
-							for (unsigned int i = 0; i < currentString.length(); ++i) {
-								currentString[i] = (char)std::toupper((unsigned char)currentString[i]);
-							}
-							currentTag.addAttribute(currentString);
-							currentString.erase();
-						}
-						start = ptr + 1;
-						if (*ptr == '>') {
-							if (!tagHandler(currentTag)) {
-								goto endOfProcessing;
-							}
-							state = PS_TEXT;
-						} else if (*ptr == '/') {
-							if (!tagHandler(currentTag)) {
-								goto endOfProcessing;
-							}
-							currentTag.Start = false;
-							if (!tagHandler(currentTag)) {
-								goto endOfProcessing;
-							}
-							state = PS_WAIT_END_OF_TAG;
-						} else {
-							state = (*ptr == '=') ? PS_ATTRIBUTEVALUE : PS_ATTRIBUTENAME;
-						}
-					}
-					break;
-				case PS_ATTRIBUTEVALUE:
-					if (*ptr == '"') {
-						// Count quotes only for values that begin with one.
-						if (((ptr == start) && currentString.empty()) || (quotationCounter > 0)) {
-							++quotationCounter;
-						}
-					} else if (*ptr == '&') {
-						currentString.append(start, ptr - start);
-						start = ptr + 1;
-						appendString(attributeValueString, currentString);
-						state = PS_SPECIAL_IN_ATTRIBUTEVALUE;
-						state_special = ST_UNKNOWN;
-					} else if (quotationCounter != 1 && (*ptr == '>' || *ptr == '/' || std::isspace((unsigned char)*ptr))) {
-						// Delimiters end the value unless we are inside an open quote.
-						if (ptr != start || !currentString.empty()) {
-							currentString.append(start, ptr - start);
-							appendString(attributeValueString, currentString);
-							if (attributeValueString[0] == '"') {
-								attributeValueString = attributeValueString.substr(1, attributeValueString.length() - 2);
-							}
-							currentTag.setLastAttributeValue(attributeValueString);
-							attributeValueString.erase();
-							quotationCounter = 0;
-						}
-						start = ptr + 1;
-						if (*ptr == '>') {
-							if (!tagHandler(currentTag)) {
-								goto endOfProcessing;
-							}
-							state = PS_TEXT;
-						} else if (*ptr == '/') {
-							if (!tagHandler(currentTag)) {
-								goto endOfProcessing;
-							}
-							currentTag.Start = false;
-							if (!tagHandler(currentTag)) {
-								goto endOfProcessing;
-							}
-							state = PS_WAIT_END_OF_TAG;
-						} else {
-							state = PS_ATTRIBUTENAME;
-						}
-					}
-					break;
-				case PS_SKIPTAG:
-					if (*ptr == '>') {
-						start = ptr + 1;
-						state = PS_TEXT;
-					}
-					break;
-			}
-		}
-		// Flush or stash whatever is left unconsumed in this chunk.
-		if (start != endOfBuffer) {
-			switch (state) {
-				case PS_TEXT:
-					if (!characterDataHandler(start, endOfBuffer - start, true)) {
-						goto endOfProcessing;
-					}
-					break;
-				case PS_TAGNAME:
-				case PS_ATTRIBUTENAME:
-				case PS_ATTRIBUTEVALUE:
-					currentString.append(start, endOfBuffer - start);
-					break;
-				case PS_SPECIAL:
-				case PS_SPECIAL_IN_ATTRIBUTEVALUE:
-					specialString.append(start, endOfBuffer - start);
-					break;
-				case PS_TAGSTART:
-				case PS_SKIPTAG:
-				case PS_COMMENT:
-				case PS_WAIT_END_OF_TAG:
-					break;
-			}
-		}
-		offset += length;
-	} while (length == BUFSIZE); // a short read is taken as end of input
-endOfProcessing:
-	delete[] buffer;
-
-	endDocumentHandler();
-
-	stream.close();
-}
diff --git a/fbreader/src/formats/html/HtmlReader.h b/fbreader/src/formats/html/HtmlReader.h
deleted file mode 100644
index 876fad8..0000000
--- a/fbreader/src/formats/html/HtmlReader.h
+++ /dev/null
@@ -1,92 +0,0 @@
-/*
- * Copyright (C) 2004-2012 Geometer Plus <[email protected]>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
- * 02110-1301, USA.
- */
-
-#ifndef __HTMLREADER_H__
-#define __HTMLREADER_H__
-
-#include <string>
-#include <vector>
-
-#include <ZLEncodingConverter.h>
-#include "../EncodedTextReader.h"
-
-class ZLInputStream;
-
-// Callback-driven HTML scanner. Subclasses implement the handler methods;
-// readDocument() drives them.
-class HtmlReader : public EncodedTextReader {
-
-public:
-	// One attribute of a tag. HasValue stays false until setValue() is called.
-	struct HtmlAttribute {
-		std::string Name;
-		std::string Value;
-		bool HasValue;
-
-		HtmlAttribute(const std::string &name);
-		~HtmlAttribute();
-		void setValue(const std::string &value);
-	};
-
-	// A tag as reported to tagHandler(). Copying is disabled.
-	struct HtmlTag {
-		std::string Name;
-		std::size_t Offset;
-		bool Start;
-		std::vector<HtmlAttribute> Attributes;
-
-		HtmlTag();
-		~HtmlTag();
-		// Appends an attribute with the given name and no value.
-		void addAttribute(const std::string &name);
-		// Sets the value of the most recently added attribute, if any.
-		void setLastAttributeValue(const std::string &value);
-
-	private:
-		HtmlTag(const HtmlTag&);
-		const HtmlTag &operator = (const HtmlTag&);
-	};
-
-private:
-	static void setTag(HtmlTag &tag, const std::string &fullName);
-
-public:
-	virtual void readDocument(ZLInputStream &stream);
-
-protected:
-	HtmlReader(const std::string &encoding);
-	virtual ~HtmlReader();
-
-protected:
-	virtual void startDocumentHandler() = 0;
-	virtual void endDocumentHandler() = 0;
-
-	// A false return value asks the reader to stop processing.
-	virtual bool tagHandler(const HtmlTag &tag) = 0;
-	// A false return value asks the reader to stop processing.
-	virtual bool characterDataHandler(const char *text, std::size_t len, bool convert) = 0;
-
-private:
-	void appendString(std::string &to, std::string &from);
-};
-
-// A new attribute has a name and no value yet.
-inline HtmlReader::HtmlAttribute::HtmlAttribute(const std::string &name)
-	: Name(name), HasValue(false) {
-}
-inline HtmlReader::HtmlAttribute::~HtmlAttribute() {
-}
-// Stores the value and records that the attribute has one.
-inline void HtmlReader::HtmlAttribute::setValue(const std::string &value) {
-	Value = value;
-	HasValue = true;
-}
-
-// A fresh tag is an opening tag.
-inline HtmlReader::HtmlTag::HtmlTag() : Start(true) {
-}
-inline HtmlReader::HtmlTag::~HtmlTag() {
-}
-inline void HtmlReader::HtmlTag::addAttribute(const std::string &name) {
-	Attributes.push_back(HtmlAttribute(name));
-}
-// No-op when the tag has no attributes.
-inline void HtmlReader::HtmlTag::setLastAttributeValue(const std::string &value) {
-	if (Attributes.empty()) {
-		return;
-	}
-	Attributes.back().setValue(value);
-}
-
-#endif /* __HTMLREADER_H__ */
diff --git a/fbreader/src/formats/html/HtmlReaderStream.cpp b/fbreader/src/formats/html/HtmlReaderStream.cpp
deleted file mode 100644
index 08c43ae..0000000
--- a/fbreader/src/formats/html/HtmlReaderStream.cpp
+++ /dev/null
@@ -1,128 +0,0 @@
-/*
- * Copyright (C) 2008-2012 Geometer Plus <[email protected]>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
- * 02110-1301, USA.
- */
-
-#include <cstdlib>
-#include <cstring>
-#include <algorithm>
-
-#include "HtmlReaderStream.h"
-#include "HtmlReader.h"
-
-// HtmlReader that copies character data into a caller-supplied buffer of at
-// most maxSize bytes; text between <SCRIPT> tags is skipped.
-class HtmlTextOnlyReader : public HtmlReader {
-
-public:
-	HtmlTextOnlyReader(char *buffer, std::size_t maxSize);
-	// Number of bytes written to the buffer so far.
-	std::size_t size() const;
-
-private:
-	void startDocumentHandler();
-	void endDocumentHandler();
-
-	bool tagHandler(const HtmlTag &tag);
-	bool characterDataHandler(const char *text, std::size_t len, bool convert);
-
-private:
-	char *myBuffer;            // destination, not owned by this class
-	std::size_t myMaxSize;     // capacity of myBuffer
-	std::size_t myFilledSize;  // bytes already written
-	bool myIgnoreText;         // true while inside <SCRIPT>
-};
-
-// Starts with an empty buffer and an empty encoding name for the base reader.
-HtmlTextOnlyReader::HtmlTextOnlyReader(char *buffer, std::size_t maxSize)
-	: HtmlReader(std::string()),
-	  myBuffer(buffer),
-	  myMaxSize(maxSize),
-	  myFilledSize(0),
-	  myIgnoreText(false) {
-}
-
-// Returns how many bytes of the buffer have been filled.
-std::size_t HtmlTextOnlyReader::size() const { return myFilledSize; }
-
-// No work is needed at the start of a document.
-void HtmlTextOnlyReader::startDocumentHandler() {}
-
-// No work is needed at the end of a document.
-void HtmlTextOnlyReader::endDocumentHandler() {}
-
-// Every tag acts as a line break: a '\n' is appended unless the buffer is
-// empty, full, or already ends with one. SCRIPT tags additionally switch text
-// suppression on (opening tag) or off (closing tag).
-// Returns false once the buffer is full.
-bool HtmlTextOnlyReader::tagHandler(const HtmlTag &tag) {
-	if (tag.Name == "SCRIPT") {
-		myIgnoreText = tag.Start;
-	}
-
-	const bool hasRoom = myFilledSize < myMaxSize;
-	if (hasRoom && myFilledSize > 0 && myBuffer[myFilledSize - 1] != '\n') {
-		myBuffer[myFilledSize] = '\n';
-		++myFilledSize;
-	}
-	return myFilledSize < myMaxSize;
-}
-
-// Copies as much of the text as still fits into the buffer, unless text is
-// currently being ignored. The 'convert' flag is not used.
-// Returns false once the buffer is full.
-bool HtmlTextOnlyReader::characterDataHandler(const char *text, std::size_t len, bool) {
-	if (myIgnoreText) {
-		return myFilledSize < myMaxSize;
-	}
-
-	const std::size_t room = myMaxSize - myFilledSize;
-	const std::size_t count = (room < len) ? room : len;
-	std::memcpy(myBuffer + myFilledSize, text, count);
-	myFilledSize += count;
-	return myFilledSize < myMaxSize;
-}
-
-// Wraps 'base'; at most maxSize bytes of extracted text will be buffered by
-// open(). No buffer is allocated yet.
-// myOffset is initialized explicitly so that offset()/seek()/read() never
-// observe an indeterminate value before open() is called.
-HtmlReaderStream::HtmlReaderStream(shared_ptr<ZLInputStream> base, std::size_t maxSize)
-	: myBase(base), myBuffer(0), mySize(maxSize), myOffset(0) {
-}
-
-// Frees the text buffer, if any, by way of close().
-HtmlReaderStream::~HtmlReaderStream()
-{
-	close();
-}
-
-// Extracts the text of the base stream into a freshly allocated buffer using
-// HtmlTextOnlyReader, then rewinds to offset 0.
-// Returns false if there is no base stream or it cannot be opened.
-// After a successful call mySize holds the number of bytes extracted.
-bool HtmlReaderStream::open() {
-	if (myBase.isNull() || !myBase->open()) {
-		return false;
-	}
-
-	// Release a buffer left over from an earlier open() that was not followed
-	// by close(); otherwise it would be leaked when myBuffer is overwritten.
-	delete[] myBuffer;
-	myBuffer = 0;
-
-	// NOTE(review): mySize doubles as capacity and as extracted size, so a
-	// re-open is limited to the size of the previous extraction.
-	myBuffer = new char[mySize];
-	HtmlTextOnlyReader reader(myBuffer, mySize);
-	reader.readDocument(*myBase);
-	mySize = reader.size();
-	myOffset = 0;
-	myBase->close();
-	return true;
-}
-
-// Reads up to maxSize bytes of extracted text starting at the current offset
-// and advances the offset. A null 'buffer' skips the bytes without copying.
-// Returns the number of bytes consumed; 0 if the stream has not been opened.
-std::size_t HtmlReaderStream::read(char *buffer, std::size_t maxSize) {
-	if (myBuffer == 0 || myOffset >= mySize) {
-		return 0;
-	}
-	maxSize = std::min(maxSize, mySize - myOffset);
-	if (buffer != 0) {
-		// Copy from the current position, not from the start of the buffer;
-		// otherwise every call would return the first bytes again.
-		std::memcpy(buffer, myBuffer + myOffset, maxSize);
-	}
-	myOffset += maxSize;
-	return maxSize;
-}
-
-// Frees the text buffer. Safe to call repeatedly or without a prior open().
-void HtmlReaderStream::close() {
-	// delete[] on a null pointer does nothing, so no explicit check is needed.
-	delete[] myBuffer;
-	myBuffer = 0;
-}
-
-// Moves the read position. 'offset' is taken relative to the current position
-// unless absoluteOffset is true. The result is clamped to [0, mySize].
-void HtmlReaderStream::seek(int offset, bool absoluteOffset) {
-	if (!absoluteOffset) {
-		offset += myOffset;
-	}
-	const std::size_t target = (offset < 0) ? 0 : (std::size_t)offset;
-	myOffset = (target < mySize) ? target : mySize;
-}
-
-// Current read position within the extracted text.
-std::size_t HtmlReaderStream::offset() const { return myOffset; }
-
-// Value of mySize: the extracted text length once open() has succeeded.
-std::size_t HtmlReaderStream::sizeOfOpened() { return mySize; }
diff --git a/fbreader/src/formats/html/HtmlReaderStream.h b/fbreader/src/formats/html/HtmlReaderStream.h
deleted file mode 100644
index c5c15b8..0000000
--- a/fbreader/src/formats/html/HtmlReaderStream.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Copyright (C) 2008-2012 Geometer Plus <[email protected]>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
- * 02110-1301, USA.
- */
-
-#ifndef __HTMLREADERSTREAM_H__
-#define __HTMLREADERSTREAM_H__
-
-#include <shared_ptr.h>
-#include <ZLInputStream.h>
-
-// ZLInputStream that exposes the text extracted from an HTML base stream.
-// The stream operations are declared private in this class.
-class HtmlReaderStream : public ZLInputStream {
-
-public:
-	HtmlReaderStream(shared_ptr<ZLInputStream> base, std::size_t maxSize);
-	~HtmlReaderStream();
-
-private:
-	bool open();
-	std::size_t read(char *buffer, std::size_t maxSize);
-	void close();
-
-	void seek(int offset, bool absoluteOffset);
-	std::size_t offset() const;
-	std::size_t sizeOfOpened();
-
-private:
-	shared_ptr<ZLInputStream> myBase;  // underlying HTML stream
-	char *myBuffer;                    // extracted text; allocated by open(), freed by close()
-	std::size_t mySize;                // maximum size before open(), extracted size after
-	std::size_t myOffset;              // current read position
-};
-
-#endif /* __HTMLREADERSTREAM_H__ */
diff --git a/fbreader/src/formats/html/HtmlTagActions.h b/fbreader/src/formats/html/HtmlTagActions.h
deleted file mode 100644
index 7da3f20..0000000
--- a/fbreader/src/formats/html/HtmlTagActions.h
+++ /dev/null
@@ -1,158 +0,0 @@
-/*
- * Copyright (C) 2004-2012 Geometer Plus <[email protected]>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
- * 02110-1301, USA.
- */
-
-#ifndef __HTMLTAGACTIONS_H__
-#define __HTMLTAGACTIONS_H__
-
-#include <set>
-
-#include "HtmlBookReader.h"
-
-// Abstract base of the per-tag actions. Each action holds a reference to the
-// HtmlBookReader it was created for.
-class HtmlTagAction {
-
-protected:
-	HtmlTagAction(HtmlBookReader &reader);
-
-public:
-	virtual ~HtmlTagAction();
-	// Handles one occurrence of the tag.
-	virtual void run(const HtmlReader::HtmlTag &tag) = 0;
-	virtual void reset();
-
-protected:
-	// Shortcut to the BookReader owned by myReader.
-	BookReader &bookReader();
-
-protected:
-	HtmlBookReader &myReader;
-};
-
-// Tag action that declares no state of its own.
-class DummyHtmlTagAction : public HtmlTagAction {
-
-public:
-	DummyHtmlTagAction(HtmlBookReader &reader);
-
-	void run(const HtmlReader::HtmlTag &tag);
-};
-
-// Tag action parameterized by a single FBTextKind.
-class HtmlControlTagAction : public HtmlTagAction {
-
-public:
-	HtmlControlTagAction(HtmlBookReader &reader, FBTextKind kind);
-
-	void run(const HtmlReader::HtmlTag &tag);
-
-private:
-	FBTextKind myKind;  // kind passed to the constructor
-};
-
-// Tag action for header tags, parameterized by a single FBTextKind.
-class HtmlHeaderTagAction : public HtmlTagAction {
-
-public:
-	HtmlHeaderTagAction(HtmlBookReader &reader, FBTextKind kind);
-
-	void run(const HtmlReader::HtmlTag &tag);
-
-private:
-	FBTextKind myKind;  // kind passed to the constructor
-};
-
-// Tag action that keeps a set of tag names.
-class HtmlIgnoreTagAction : public HtmlTagAction {
-
-public:
-	HtmlIgnoreTagAction(HtmlBookReader &reader);
-
-	void run(const HtmlReader::HtmlTag &tag);
-
-private:
-	std::set<std::string> myTagNames;
-};
-
-// Tag action that tracks a hyperlink type (an FBTextKind) between calls and
-// overrides reset().
-class HtmlHrefTagAction : public HtmlTagAction {
-
-public:
-	HtmlHrefTagAction(HtmlBookReader &reader);
-
-	void run(const HtmlReader::HtmlTag &tag);
-	void reset();
-
-protected:
-	// Accessors for the stored hyperlink type, available to subclasses.
-	FBTextKind hyperlinkType() const;
-	void setHyperlinkType(FBTextKind hyperlinkType);
-
-private:
-	FBTextKind myHyperlinkType;
-};
-
-// Tag action for image tags; declares no state of its own.
-class HtmlImageTagAction : public HtmlTagAction {
-
-public:
-	HtmlImageTagAction(HtmlBookReader &reader);
-
-	void run(const HtmlReader::HtmlTag &tag);
-};
-
-// Tag action configured with a BreakType.
-class HtmlBreakTagAction : public HtmlTagAction {
-
-public:
-	// Bit flags; BREAK_AT_START_AND_AT_END is the union of the other two.
-	enum BreakType {
-		BREAK_AT_START = 1,
-		BREAK_AT_END = 2,
-		BREAK_AT_START_AND_AT_END = BREAK_AT_START | BREAK_AT_END
-	};
-
-	HtmlBreakTagAction(HtmlBookReader &reader, BreakType breakType);
-
-	void run(const HtmlReader::HtmlTag &tag);
-
-private:
-	BreakType myBreakType;  // flags passed to the constructor
-};
-
-// Tag action for preformatted-text tags; declares no state of its own.
-class HtmlPreTagAction : public HtmlTagAction {
-
-public:
-	HtmlPreTagAction(HtmlBookReader &reader);
-
-	void run(const HtmlReader::HtmlTag &tag);
-};
-
-// Tag action for list tags, configured with a start index.
-class HtmlListTagAction : public HtmlTagAction {
-
-public:
-	HtmlListTagAction(HtmlBookReader &reader, int startIndex);
-
-	void run(const HtmlReader::HtmlTag &tag);
-
-private:
-	int myStartIndex;  // value passed to the constructor
-};
-
-// Tag action for list item tags; declares no state of its own.
-class HtmlListItemTagAction : public HtmlTagAction {
-
-public:
-	HtmlListItemTagAction(HtmlBookReader &reader);
-
-	void run(const HtmlReader::HtmlTag &tag);
-};
-
-// Tag action for table tags; declares no state of its own.
-class HtmlTableTagAction : public HtmlTagAction {
-
-public:
-	HtmlTableTagAction(HtmlBookReader &reader);
-
-	void run(const HtmlReader::HtmlTag &tag);
-};
-
-// Tag action for style tags; declares no state of its own.
-class HtmlStyleTagAction : public HtmlTagAction {
-
-public:
-	HtmlStyleTagAction(HtmlBookReader &reader);
-
-	void run(const HtmlReader::HtmlTag &tag);
-};
-
-// Gives actions access to the BookReader held by the owning HtmlBookReader.
-inline BookReader &HtmlTagAction::bookReader() {
-	return myReader.myBookReader;
-}
-
-#endif /* __HTMLTAGACTIONS_H__ */