summaryrefslogtreecommitdiffstats
path: root/reader/src/formats/html
diff options
context:
space:
mode:
Diffstat (limited to 'reader/src/formats/html')
-rw-r--r--reader/src/formats/html/HtmlBookReader.cpp583
-rw-r--r--reader/src/formats/html/HtmlBookReader.h101
-rw-r--r--reader/src/formats/html/HtmlDescriptionReader.cpp82
-rw-r--r--reader/src/formats/html/HtmlDescriptionReader.h48
-rw-r--r--reader/src/formats/html/HtmlEntityCollection.cpp71
-rw-r--r--reader/src/formats/html/HtmlEntityCollection.h38
-rw-r--r--reader/src/formats/html/HtmlPlugin.cpp83
-rw-r--r--reader/src/formats/html/HtmlPlugin.h42
-rw-r--r--reader/src/formats/html/HtmlReader.cpp373
-rw-r--r--reader/src/formats/html/HtmlReader.h92
-rw-r--r--reader/src/formats/html/HtmlReaderStream.cpp128
-rw-r--r--reader/src/formats/html/HtmlReaderStream.h48
-rw-r--r--reader/src/formats/html/HtmlTagActions.h158
13 files changed, 1847 insertions, 0 deletions
diff --git a/reader/src/formats/html/HtmlBookReader.cpp b/reader/src/formats/html/HtmlBookReader.cpp
new file mode 100644
index 0000000..321913d
--- /dev/null
+++ b/reader/src/formats/html/HtmlBookReader.cpp
@@ -0,0 +1,583 @@
+/*
+ * Copyright (C) 2004-2012 Geometer Plus <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+
+#include <cctype>
+
+#include <ZLFile.h>
+#include <ZLFileImage.h>
+#include <ZLStringUtil.h>
+
+#include "HtmlBookReader.h"
+#include "HtmlTagActions.h"
+#include "../txt/PlainTextFormat.h"
+#include "../util/MiscUtil.h"
+#include "../../bookmodel/BookModel.h"
+#include "../css/StyleSheetParser.h"
+
+HtmlTagAction::HtmlTagAction(HtmlBookReader &reader) : myReader(reader) {
+}
+
+HtmlTagAction::~HtmlTagAction() {
+}
+
+void HtmlTagAction::reset() {
+}
+
+DummyHtmlTagAction::DummyHtmlTagAction(HtmlBookReader &reader) : HtmlTagAction(reader) {
+}
+
+void DummyHtmlTagAction::run(const HtmlReader::HtmlTag&) {
+}
+
+HtmlControlTagAction::HtmlControlTagAction(HtmlBookReader &reader, FBTextKind kind) : HtmlTagAction(reader), myKind(kind) {
+}
+
+void HtmlControlTagAction::run(const HtmlReader::HtmlTag &tag) {
+ std::vector<FBTextKind> &list = myReader.myKindList;
+ int index;
+ for (index = list.size() - 1; index >= 0; --index) {
+ if (list[index] == myKind) {
+ break;
+ }
+ }
+ if (tag.Start) {
+ if (index == -1) {
+ bookReader().pushKind(myKind);
+ myReader.myKindList.push_back(myKind);
+ bookReader().addControl(myKind, true);
+ }
+ } else {
+ if (index >= 0) {
+ for (int i = list.size() - 1; i >= index; --i) {
+ bookReader().addControl(list[i], false);
+ bookReader().popKind();
+ }
+ for (unsigned int j = index + 1; j < list.size(); ++j) {
+ bookReader().addControl(list[j], true);
+ bookReader().pushKind(list[j]);
+ }
+ list.erase(list.begin() + index);
+ }
+ }
+}
+
+HtmlHeaderTagAction::HtmlHeaderTagAction(HtmlBookReader &reader, FBTextKind kind) : HtmlTagAction(reader), myKind(kind) {
+}
+
+void HtmlHeaderTagAction::run(const HtmlReader::HtmlTag &tag) {
+ myReader.myIsStarted = false;
+ if (tag.Start) {
+ if (myReader.myBuildTableOfContent && !myReader.myIgnoreTitles) {
+ if (!bookReader().contentsParagraphIsOpen()) {
+ bookReader().insertEndOfSectionParagraph();
+ bookReader().enterTitle();
+ bookReader().beginContentsParagraph();
+ }
+ }
+ bookReader().pushKind(myKind);
+ } else {
+ bookReader().popKind();
+ if (myReader.myBuildTableOfContent && !myReader.myIgnoreTitles) {
+ bookReader().endContentsParagraph();
+ bookReader().exitTitle();
+ }
+ }
+ bookReader().beginParagraph();
+}
+
+HtmlIgnoreTagAction::HtmlIgnoreTagAction(HtmlBookReader &reader) : HtmlTagAction(reader) {
+}
+
+void HtmlIgnoreTagAction::run(const HtmlReader::HtmlTag &tag) {
+ if (tag.Start) {
+ if (myTagNames.find(tag.Name) == myTagNames.end()) {
+ ++myReader.myIgnoreDataCounter;
+ myTagNames.insert(tag.Name);
+ }
+ } else {
+ if (myTagNames.find(tag.Name) != myTagNames.end()) {
+ --myReader.myIgnoreDataCounter;
+ myTagNames.erase(tag.Name);
+ }
+ }
+}
+
+HtmlHrefTagAction::HtmlHrefTagAction(HtmlBookReader &reader) : HtmlTagAction(reader) {
+}
+
+void HtmlHrefTagAction::run(const HtmlReader::HtmlTag &tag) {
+ if (tag.Start) {
+ for (unsigned int i = 0; i < tag.Attributes.size(); ++i) {
+ if (tag.Attributes[i].Name == "NAME") {
+ bookReader().addHyperlinkLabel(tag.Attributes[i].Value);
+ } else if ((hyperlinkType() == REGULAR) && (tag.Attributes[i].Name == "HREF")) {
+ std::string value = tag.Attributes[i].Value;
+ if (!myReader.myFileName.empty() &&
+ (value.length() > myReader.myFileName.length()) &&
+ (value.substr(0, myReader.myFileName.length()) == myReader.myFileName)) {
+ value = value.substr(myReader.myFileName.length());
+ }
+ if (!value.empty()) {
+ if (value[0] == '#') {
+ setHyperlinkType(INTERNAL_HYPERLINK);
+ bookReader().addHyperlinkControl(INTERNAL_HYPERLINK, value.substr(1));
+ } else {
+ FBTextKind hyperlinkType = MiscUtil::referenceType(value);
+ if (hyperlinkType != INTERNAL_HYPERLINK) {
+ setHyperlinkType(hyperlinkType);
+ bookReader().addHyperlinkControl(hyperlinkType, value);
+ }
+ }
+ }
+ }
+ }
+ } else if (hyperlinkType() != REGULAR) {
+ bookReader().addControl(hyperlinkType(), false);
+ setHyperlinkType(REGULAR);
+ }
+}
+
+void HtmlHrefTagAction::reset() {
+ setHyperlinkType(REGULAR);
+}
+
+FBTextKind HtmlHrefTagAction::hyperlinkType() const {
+ return myHyperlinkType;
+}
+
+void HtmlHrefTagAction::setHyperlinkType(FBTextKind hyperlinkType) {
+ myHyperlinkType = hyperlinkType;
+}
+
+HtmlImageTagAction::HtmlImageTagAction(HtmlBookReader &reader) : HtmlTagAction(reader) {
+}
+
+void HtmlImageTagAction::run(const HtmlReader::HtmlTag &tag) {
+ if (tag.Start) {
+ bookReader().endParagraph();
+ for (unsigned int i = 0; i < tag.Attributes.size(); ++i) {
+ if (tag.Attributes[i].Name == "SRC") {
+ const std::string fileName = MiscUtil::decodeHtmlURL(tag.Attributes[i].Value);
+ const ZLFile file(myReader.myBaseDirPath + fileName);
+ if (file.exists()) {
+ bookReader().addImageReference(fileName);
+ bookReader().addImage(fileName, new ZLFileImage(file, 0));
+ }
+ break;
+ }
+ }
+ bookReader().beginParagraph();
+ }
+}
+
+HtmlBreakTagAction::HtmlBreakTagAction(HtmlBookReader &reader, BreakType breakType) : HtmlTagAction(reader), myBreakType(breakType) {
+}
+
+void HtmlBreakTagAction::run(const HtmlReader::HtmlTag &tag) {
+ if (myReader.myDontBreakParagraph) {
+ myReader.myDontBreakParagraph = false;
+ return;
+ }
+
+ if ((tag.Start && (myBreakType & BREAK_AT_START)) ||
+ (!tag.Start && (myBreakType & BREAK_AT_END))) {
+ bookReader().endParagraph();
+ if (bookReader().isKindStackEmpty()) {
+ bookReader().pushKind(REGULAR);
+ }
+ bookReader().beginParagraph();
+ }
+}
+
+HtmlPreTagAction::HtmlPreTagAction(HtmlBookReader &reader) : HtmlTagAction(reader) {
+}
+
+void HtmlPreTagAction::run(const HtmlReader::HtmlTag &tag) {
+ bookReader().endParagraph();
+ myReader.myIsPreformatted = tag.Start;
+ myReader.mySpaceCounter = -1;
+ myReader.myBreakCounter = 0;
+ if (myReader.myFormat.breakType() == PlainTextFormat::BREAK_PARAGRAPH_AT_NEW_LINE) {
+ if (tag.Start) {
+ bookReader().pushKind(PREFORMATTED);
+ } else {
+ bookReader().popKind();
+ }
+ }
+ bookReader().beginParagraph();
+}
+
+HtmlListTagAction::HtmlListTagAction(HtmlBookReader &reader, int startIndex) : HtmlTagAction(reader), myStartIndex(startIndex) {
+}
+
+void HtmlListTagAction::run(const HtmlReader::HtmlTag &tag) {
+ if (tag.Start) {
+ myReader.myListNumStack.push(myStartIndex);
+ } else if (!myReader.myListNumStack.empty()) {
+ myReader.myListNumStack.pop();
+ }
+}
+
+HtmlListItemTagAction::HtmlListItemTagAction(HtmlBookReader &reader) : HtmlTagAction(reader) {
+}
+
+void HtmlListItemTagAction::run(const HtmlReader::HtmlTag &tag) {
+ if (tag.Start) {
+ bookReader().endParagraph();
+ bookReader().beginParagraph();
+ if (!myReader.myListNumStack.empty()) {
+ bookReader().addFixedHSpace(3 * myReader.myListNumStack.size());
+ int &index = myReader.myListNumStack.top();
+ if (index == 0) {
+ myReader.addConvertedDataToBuffer("\342\200\242 ", 4, false);
+ } else {
+ std::string number;
+ ZLStringUtil::appendNumber(number, index++);
+ number += ". ";
+ myReader.addConvertedDataToBuffer(number.data(), number.length(), false);
+ }
+ myReader.myDontBreakParagraph = true;
+ }
+ } else {
+ myReader.myDontBreakParagraph = false;
+ }
+}
+
+HtmlTableTagAction::HtmlTableTagAction(HtmlBookReader &reader) : HtmlTagAction(reader) {
+}
+
+void HtmlTableTagAction::run(const HtmlReader::HtmlTag &tag) {
+ if (tag.Start) {
+ myReader.myIgnoreTitles = true;
+ } else {
+ myReader.myIgnoreTitles = false;
+ }
+}
+
+HtmlStyleTagAction::HtmlStyleTagAction(HtmlBookReader &reader) : HtmlTagAction(reader) {
+}
+
+void HtmlStyleTagAction::run(const HtmlReader::HtmlTag &tag) {
+ myReader.myStyleSheetParser = tag.Start ? new StyleSheetTableParser(myReader.myStyleSheetTable) : 0;
+ /*
+ if (!tag.Start) {
+ myReader.myStyleSheetTable.dump();
+ }
+ */
+}
+
+shared_ptr<HtmlTagAction> HtmlBookReader::createAction(const std::string &tag) {
+ if (tag == "EM") {
+ return new HtmlControlTagAction(*this, EMPHASIS);
+ } else if (tag == "STRONG") {
+ return new HtmlControlTagAction(*this, STRONG);
+ } else if (tag == "B") {
+ return new HtmlControlTagAction(*this, BOLD);
+ } else if (tag == "I") {
+ return new HtmlControlTagAction(*this, ITALIC);
+ } else if (tag == "TT") {
+ return new HtmlControlTagAction(*this, CODE);
+ } else if (tag == "CODE") {
+ return new HtmlControlTagAction(*this, CODE);
+ } else if (tag == "CITE") {
+ return new HtmlControlTagAction(*this, CITE);
+ } else if (tag == "SUB") {
+ return new HtmlControlTagAction(*this, SUB);
+ } else if (tag == "SUP") {
+ return new HtmlControlTagAction(*this, SUP);
+ } else if (tag == "H1") {
+ return new HtmlHeaderTagAction(*this, H1);
+ } else if (tag == "H2") {
+ return new HtmlHeaderTagAction(*this, H2);
+ } else if (tag == "H3") {
+ return new HtmlHeaderTagAction(*this, H3);
+ } else if (tag == "H4") {
+ return new HtmlHeaderTagAction(*this, H4);
+ } else if (tag == "H5") {
+ return new HtmlHeaderTagAction(*this, H5);
+ } else if (tag == "H6") {
+ return new HtmlHeaderTagAction(*this, H6);
+ } else if (tag == "HEAD") {
+ return new HtmlIgnoreTagAction(*this);
+ } else if (tag == "TITLE") {
+ return new HtmlIgnoreTagAction(*this);
+ } else if (tag == "STYLE") {
+ return new HtmlStyleTagAction(*this);
+ } else if (tag == "SELECT") {
+ return new HtmlIgnoreTagAction(*this);
+ } else if (tag == "SCRIPT") {
+ return new HtmlIgnoreTagAction(*this);
+ } else if (tag == "A") {
+ return new HtmlHrefTagAction(*this);
+ } else if (tag == "TD") {
+ //return new HtmlBreakTagAction(*this, HtmlBreakTagAction::BREAK_AT_END);
+ } else if (tag == "TR") {
+ return new HtmlBreakTagAction(*this, HtmlBreakTagAction::BREAK_AT_END);
+ } else if (tag == "DIV") {
+ return new HtmlBreakTagAction(*this, HtmlBreakTagAction::BREAK_AT_END);
+ } else if (tag == "DT") {
+ return new HtmlBreakTagAction(*this, HtmlBreakTagAction::BREAK_AT_START);
+ } else if (tag == "P") {
+ return new HtmlBreakTagAction(*this, HtmlBreakTagAction::BREAK_AT_START_AND_AT_END);
+ } else if (tag == "BR") {
+ return new HtmlBreakTagAction(*this, HtmlBreakTagAction::BREAK_AT_START_AND_AT_END);
+ } else if (tag == "IMG") {
+ return new HtmlImageTagAction(*this);
+ } else if (tag == "UL") {
+ return new HtmlListTagAction(*this, 0);
+ } else if (tag == "MENU") {
+ return new HtmlListTagAction(*this, 0);
+ } else if (tag == "DIR") {
+ return new HtmlListTagAction(*this, 0);
+ } else if (tag == "OL") {
+ return new HtmlListTagAction(*this, 1);
+ } else if (tag == "LI") {
+ return new HtmlListItemTagAction(*this);
+ } else if (tag == "PRE") {
+ if (myProcessPreTag) {
+ return new HtmlPreTagAction(*this);
+ }
+ } else if (tag == "TABLE") {
+ return new HtmlTableTagAction(*this);
+ }
+ /*
+ } else if (tag == "DD") {
+ return 0;
+ } else if (tag == "DL") {
+ return 0;
+ } else if (tag == "DFN") {
+ return 0;
+ } else if (tag == "SAMP") {
+ return 0;
+ } else if (tag == "KBD") {
+ return 0;
+ } else if (tag == "VAR") {
+ return 0;
+ } else if (tag == "ABBR") {
+ return 0;
+ } else if (tag == "ACRONYM") {
+ return 0;
+ } else if (tag == "BLOCKQUOTE") {
+ return 0;
+ } else if (tag == "Q") {
+ return 0;
+ } else if (tag == "INS") {
+ return 0;
+ } else if (tag == "DEL") {
+ return 0;
+ } else if (tag == "BODY") {
+ return 0;
+ */
+ return new DummyHtmlTagAction(*this);
+}
+
+void HtmlBookReader::setBuildTableOfContent(bool build) {
+ myBuildTableOfContent = build;
+}
+
+void HtmlBookReader::setProcessPreTag(bool process) {
+ myProcessPreTag = process;
+}
+
+HtmlBookReader::HtmlBookReader(const std::string &baseDirectoryPath, BookModel &model, const PlainTextFormat &format, const std::string &encoding) : HtmlReader(encoding), myBookReader(model), myBaseDirPath(baseDirectoryPath), myFormat(format), myBuildTableOfContent(true), myProcessPreTag(true) {
+}
+
+HtmlBookReader::~HtmlBookReader() {
+}
+
+void HtmlBookReader::addConvertedDataToBuffer(const char *text, std::size_t len, bool convert) {
+ if (len > 0) {
+ if (myDontBreakParagraph) {
+ while (len > 0 && std::isspace(*text)) {
+ --len;
+ ++text;
+ }
+ if (len == 0) {
+ return;
+ }
+ }
+ if (convert) {
+ myConverter->convert(myConverterBuffer, text, text + len);
+ myBookReader.addData(myConverterBuffer);
+ myBookReader.addContentsData(myConverterBuffer);
+ myConverterBuffer.erase();
+ } else {
+ std::string strText(text, len);
+ myBookReader.addData(strText);
+ myBookReader.addContentsData(strText);
+ }
+ myDontBreakParagraph = false;
+ }
+}
+
+bool HtmlBookReader::tagHandler(const HtmlTag &tag) {
+ myConverter->reset();
+
+ for (unsigned int i = 0; i < tag.Attributes.size(); ++i) {
+ if (tag.Attributes[i].Name == "ID") {
+ myBookReader.addHyperlinkLabel(tag.Attributes[i].Value);
+ break;
+ }
+ }
+ shared_ptr<HtmlTagAction> action = myActionMap[tag.Name];
+ if (action.isNull()) {
+ action = createAction(tag.Name);
+ myActionMap[tag.Name] = action;
+ }
+ action->run(tag);
+
+ return true;
+}
+
+void HtmlBookReader::preformattedCharacterDataHandler(const char *text, std::size_t len, bool convert) {
+ const char *start = text;
+ const char *end = text + len;
+
+ int breakType = myFormat.breakType();
+ if (breakType & PlainTextFormat::BREAK_PARAGRAPH_AT_NEW_LINE) {
+ for (const char *ptr = text; ptr != end; ++ptr) {
+ if (*ptr == '\n') {
+ mySpaceCounter = 0;
+ if (start < ptr) {
+ addConvertedDataToBuffer(start, ptr - start, convert);
+ } else {
+ static const std::string SPACE = " ";
+ myBookReader.addData(SPACE);
+ }
+ myBookReader.endParagraph();
+ myBookReader.beginParagraph();
+ start = ptr + 1;
+ } else if (mySpaceCounter >= 0) {
+ if (std::isspace((unsigned char)*ptr)) {
+ ++mySpaceCounter;
+ } else {
+ myBookReader.addFixedHSpace(mySpaceCounter);
+ mySpaceCounter = -1;
+ }
+ }
+ }
+ addConvertedDataToBuffer(start, end - start, convert);
+ } else if (breakType & PlainTextFormat::BREAK_PARAGRAPH_AT_LINE_WITH_INDENT) {
+ for (const char *ptr = text; ptr != end; ++ptr) {
+ if (std::isspace((unsigned char)*ptr)) {
+ if (*ptr == '\n') {
+ mySpaceCounter = 0;
+ } else if (mySpaceCounter >= 0) {
+ ++mySpaceCounter;
+ }
+ } else {
+ if (mySpaceCounter > myFormat.ignoredIndent()) {
+ if (ptr - start > mySpaceCounter) {
+ addConvertedDataToBuffer(start, ptr - start - mySpaceCounter, convert);
+ myBookReader.endParagraph();
+ myBookReader.beginParagraph();
+ }
+ start = ptr;
+ }
+ mySpaceCounter = -1;
+ }
+ }
+ mySpaceCounter = std::max(mySpaceCounter, 0);
+ if (end - start > mySpaceCounter) {
+ addConvertedDataToBuffer(start, end - start - mySpaceCounter, convert);
+ }
+ } else if (breakType & PlainTextFormat::BREAK_PARAGRAPH_AT_EMPTY_LINE) {
+ for (const char *ptr = start; ptr != end; ++ptr) {
+ if (std::isspace((unsigned char)*ptr)) {
+ if (*ptr == '\n') {
+ ++myBreakCounter;
+ }
+ } else {
+ if (myBreakCounter > 1) {
+ addConvertedDataToBuffer(start, ptr - start, convert);
+ myBookReader.endParagraph();
+ myBookReader.beginParagraph();
+ start = ptr;
+ }
+ myBreakCounter = 0;
+ }
+ }
+ addConvertedDataToBuffer(start, end - start, convert);
+ }
+}
+
+bool HtmlBookReader::characterDataHandler(const char *text, std::size_t len, bool convert) {
+ if (!myStyleSheetParser.isNull()) {
+ myStyleSheetParser->parse(text, len);
+ return true;
+ }
+
+ if (myIgnoreDataCounter != 0) {
+ return true;
+ }
+
+ if (myIsPreformatted) {
+ preformattedCharacterDataHandler(text, len, convert);
+ return true;
+ }
+
+ const char *ptr = text;
+ const char *end = text + len;
+ if (!myIsStarted) {
+ for (; ptr != end; ++ptr) {
+ if (!std::isspace((unsigned char)*ptr)) {
+ myIsStarted = true;
+ break;
+ }
+ }
+ }
+ if (myIsStarted) {
+ addConvertedDataToBuffer(ptr, end - ptr, convert);
+ }
+ return true;
+}
+
+void HtmlBookReader::startDocumentHandler() {
+ while (!myListNumStack.empty()) {
+ myListNumStack.pop();
+ }
+ myConverterBuffer.erase();
+ myKindList.clear();
+
+ myBookReader.reset();
+ myBookReader.setMainTextModel();
+ myBookReader.pushKind(REGULAR);
+ myBookReader.beginParagraph();
+ myIgnoreDataCounter = 0;
+ myIsPreformatted = false;
+ myDontBreakParagraph = false;
+ for (std::map<std::string,shared_ptr<HtmlTagAction> >::const_iterator it = myActionMap.begin(); it != myActionMap.end(); ++it) {
+ it->second->reset();
+ }
+ myIsStarted = false;
+ myIgnoreTitles = false;
+
+ myStyleSheetParser = 0;
+
+ mySpaceCounter = -1;
+ myBreakCounter = 0;
+}
+
+void HtmlBookReader::endDocumentHandler() {
+ myBookReader.endParagraph();
+}
+
+void HtmlBookReader::setFileName(const std::string fileName) {
+ myFileName = fileName;
+}
diff --git a/reader/src/formats/html/HtmlBookReader.h b/reader/src/formats/html/HtmlBookReader.h
new file mode 100644
index 0000000..c8d4e32
--- /dev/null
+++ b/reader/src/formats/html/HtmlBookReader.h
@@ -0,0 +1,101 @@
+/*
+ * Copyright (C) 2004-2012 Geometer Plus <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+
+#ifndef __HTMLBOOKREADER_H__
+#define __HTMLBOOKREADER_H__
+
+#include <stack>
+
+#include <shared_ptr.h>
+
+#include "HtmlReader.h"
+#include "../../bookmodel/BookReader.h"
+#include "../css/StyleSheetTable.h"
+
+class BookModel;
+class PlainTextFormat;
+class StyleSheetParser;
+
+class HtmlTagAction;
+
+class HtmlBookReader : public HtmlReader {
+
+public:
+ HtmlBookReader(const std::string &baseDirectoryPath, BookModel &model, const PlainTextFormat &format, const std::string &encoding);
+ ~HtmlBookReader();
+ void setFileName(const std::string fileName);
+
+protected:
+ virtual shared_ptr<HtmlTagAction> createAction(const std::string &tag);
+ void setBuildTableOfContent(bool build);
+ void setProcessPreTag(bool process);
+
+protected:
+ void startDocumentHandler();
+ void endDocumentHandler();
+ bool tagHandler(const HtmlTag &tag);
+ bool characterDataHandler(const char *text, std::size_t len, bool convert);
+
+private:
+ void preformattedCharacterDataHandler(const char *text, std::size_t len, bool convert);
+ void addConvertedDataToBuffer(const char *text, std::size_t len, bool convert);
+
+protected:
+ BookReader myBookReader;
+ std::string myBaseDirPath;
+
+private:
+ const PlainTextFormat &myFormat;
+ int myIgnoreDataCounter;
+ bool myIsPreformatted;
+ bool myDontBreakParagraph;
+
+ bool myIsStarted;
+ bool myBuildTableOfContent;
+ bool myProcessPreTag;
+ bool myIgnoreTitles;
+ std::stack<int> myListNumStack;
+
+ StyleSheetTable myStyleSheetTable;
+ shared_ptr<StyleSheetParser> myStyleSheetParser;
+
+ int mySpaceCounter;
+ int myBreakCounter;
+ std::string myConverterBuffer;
+
+ std::map<std::string,shared_ptr<HtmlTagAction> > myActionMap;
+ std::vector<FBTextKind> myKindList;
+
+ std::string myFileName;
+
+ friend class HtmlTagAction;
+ friend class HtmlControlTagAction;
+ friend class HtmlHeaderTagAction;
+ friend class HtmlIgnoreTagAction;
+ friend class HtmlHrefTagAction;
+ friend class HtmlImageTagAction;
+ friend class HtmlBreakTagAction;
+ friend class HtmlPreTagAction;
+ friend class HtmlListTagAction;
+ friend class HtmlListItemTagAction;
+ friend class HtmlTableTagAction;
+ friend class HtmlStyleTagAction;
+};
+
+#endif /* __HTMLBOOKREADER_H__ */
diff --git a/reader/src/formats/html/HtmlDescriptionReader.cpp b/reader/src/formats/html/HtmlDescriptionReader.cpp
new file mode 100644
index 0000000..6ebcb8b
--- /dev/null
+++ b/reader/src/formats/html/HtmlDescriptionReader.cpp
@@ -0,0 +1,82 @@
+/*
+ * Copyright (C) 2004-2012 Geometer Plus <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+
+#include "HtmlDescriptionReader.h"
+
+#include "../../library/Book.h"
+
+HtmlDescriptionReader::HtmlDescriptionReader(Book &book) : HtmlReader(book.encoding()), myBook(book) {
+ myBook.setTitle("");
+}
+
+void HtmlDescriptionReader::startDocumentHandler() {
+ myReadTitle = false;
+}
+
+void HtmlDescriptionReader::endDocumentHandler() {
+ if (!myBook.title().empty()) {
+ const char *titleStart = myBook.title().data();
+ const char *titleEnd = titleStart + myBook.title().length();
+ std::string newTitle;
+ myConverter->convert(newTitle, titleStart, titleEnd);
+ myBook.setTitle(newTitle);
+ }
+}
+
+bool HtmlDescriptionReader::tagHandler(const HtmlTag &tag) {
+ if (tag.Name == "TITLE") {
+ if (myReadTitle && !tag.Start) {
+ myBook.setTitle(myBuffer);
+ myBuffer.erase();
+ }
+ myReadTitle = tag.Start && myBook.title().empty();
+ return true;
+ } else if (tag.Start && tag.Name == "META") {
+ std::vector<HtmlAttribute>::const_iterator it = tag.Attributes.begin();
+ for (; it != tag.Attributes.end(); ++it) {
+ if (it->Name == "CONTENT") {
+ break;
+ }
+ }
+ if (it != tag.Attributes.end()) {
+ const std::string prefix = "charset=";
+ std::size_t index = it->Value.find(prefix);
+ if (index != std::string::npos) {
+ std::string charset = it->Value.substr(index + prefix.length());
+ index = charset.find(';');
+ if (index != std::string::npos) {
+ charset = charset.substr(0, index);
+ }
+ index = charset.find(' ');
+ if (index != std::string::npos) {
+ charset = charset.substr(0, index);
+ }
+ myBook.setEncoding(charset);
+ }
+ }
+ }
+ return tag.Name != "BODY";
+}
+
+bool HtmlDescriptionReader::characterDataHandler(const char *text, std::size_t len, bool) {
+ if (myReadTitle) {
+ myBuffer.append(text, len);
+ }
+ return true;
+}
diff --git a/reader/src/formats/html/HtmlDescriptionReader.h b/reader/src/formats/html/HtmlDescriptionReader.h
new file mode 100644
index 0000000..159d4b0
--- /dev/null
+++ b/reader/src/formats/html/HtmlDescriptionReader.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (C) 2004-2012 Geometer Plus <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+
+#ifndef __HTMLDESCRIPTIONREADER_H__
+#define __HTMLDESCRIPTIONREADER_H__
+
+#include "HtmlReader.h"
+
+class Book;
+
+class HtmlDescriptionReader : public HtmlReader {
+
+public:
+ HtmlDescriptionReader(Book &book);
+ ~HtmlDescriptionReader();
+
+protected:
+ void startDocumentHandler();
+ void endDocumentHandler();
+
+ bool tagHandler(const HtmlTag &tag);
+ bool characterDataHandler(const char *text, std::size_t len, bool convert);
+
+private:
+ bool myReadTitle;
+ std::string myBuffer;
+ Book &myBook;
+};
+
+inline HtmlDescriptionReader::~HtmlDescriptionReader() {}
+
+#endif /* __HTMLDESCRIPTIONREADER_H__ */
diff --git a/reader/src/formats/html/HtmlEntityCollection.cpp b/reader/src/formats/html/HtmlEntityCollection.cpp
new file mode 100644
index 0000000..bd1bb4e
--- /dev/null
+++ b/reader/src/formats/html/HtmlEntityCollection.cpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (C) 2004-2012 Geometer Plus <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+
+#include <cstdlib>
+#include <cctype>
+
+#include <ZLibrary.h>
+#include <ZLFile.h>
+#include <ZLXMLReader.h>
+
+#include "HtmlEntityCollection.h"
+
+class CollectionReader : public ZLXMLReader {
+
+public:
+ CollectionReader(std::map<std::string,int> &collection);
+ void startElementHandler(const char *tag, const char **attributes);
+
+private:
+ std::map<std::string,int> &myCollection;
+};
+
+std::map<std::string,int> HtmlEntityCollection::ourCollection;
+
+int HtmlEntityCollection::symbolNumber(const std::string &name) {
+ if (ourCollection.empty()) {
+ CollectionReader(ourCollection).readDocument(ZLFile(
+ ZLibrary::ApplicationDirectory() + ZLibrary::FileNameDelimiter +
+ "formats" + ZLibrary::FileNameDelimiter +
+ "html" + ZLibrary::FileNameDelimiter + "html.ent"
+ ));
+ }
+ std::map<std::string,int>::const_iterator it = ourCollection.find(name);
+ return it == ourCollection.end() ? 0 : it->second;
+}
+
+CollectionReader::CollectionReader(std::map<std::string,int> &collection) : myCollection(collection) {
+}
+
+void CollectionReader::startElementHandler(const char *tag, const char **attributes) {
+ static const std::string ENTITY = "entity";
+
+ if (ENTITY == tag) {
+ for (int i = 0; i < 4; ++i) {
+ if (attributes[i] == 0) {
+ return;
+ }
+ }
+ static const std::string _name = "name";
+ static const std::string _number = "number";
+ if (_name == attributes[0] && _number == attributes[2]) {
+ myCollection[attributes[1]] = std::atoi(attributes[3]);
+ }
+ }
+}
diff --git a/reader/src/formats/html/HtmlEntityCollection.h b/reader/src/formats/html/HtmlEntityCollection.h
new file mode 100644
index 0000000..6f70491
--- /dev/null
+++ b/reader/src/formats/html/HtmlEntityCollection.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (C) 2004-2012 Geometer Plus <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+
+#ifndef __HTMLENTITYCOLLECTION_H__
+#define __HTMLENTITYCOLLECTION_H__
+
+#include <string>
+#include <map>
+
+class HtmlEntityCollection {
+
+public:
+ static int symbolNumber(const std::string &name);
+
+private:
+ static std::map<std::string,int> ourCollection;
+
+private:
+ HtmlEntityCollection();
+};
+
+#endif /* __HTMLENTITYCOLLECTION_H__ */
diff --git a/reader/src/formats/html/HtmlPlugin.cpp b/reader/src/formats/html/HtmlPlugin.cpp
new file mode 100644
index 0000000..279e096
--- /dev/null
+++ b/reader/src/formats/html/HtmlPlugin.cpp
@@ -0,0 +1,83 @@
+/*
+ * Copyright (C) 2004-2012 Geometer Plus <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+
+#include <ZLStringUtil.h>
+#include <ZLFile.h>
+#include <ZLInputStream.h>
+
+#include "HtmlPlugin.h"
+#include "HtmlDescriptionReader.h"
+#include "HtmlBookReader.h"
+#include "HtmlReaderStream.h"
+#include "../txt/PlainTextFormat.h"
+#include "../util/MiscUtil.h"
+#include "../../library/Book.h"
+#include "../../bookmodel/BookModel.h"
+
+bool HtmlPlugin::acceptsFile(const ZLFile &file) const {
+ const std::string &extension = file.extension();
+ return ZLStringUtil::stringEndsWith(extension, "html") || (extension == "htm");
+}
+
+bool HtmlPlugin::readMetaInfo(Book &book) const {
+ shared_ptr<ZLInputStream> stream = book.file().inputStream();
+ if (stream.isNull()) {
+ return false;
+ }
+
+ shared_ptr<ZLInputStream> htmlStream = new HtmlReaderStream(stream, 50000);
+ detectEncodingAndLanguage(book, *htmlStream);
+ if (book.encoding().empty()) {
+ return false;
+ }
+ HtmlDescriptionReader(book).readDocument(*stream);
+
+ return true;
+}
+
+bool HtmlPlugin::readModel(BookModel &model) const {
+ const Book& book = *model.book();
+ const ZLFile &file = book.file();
+ shared_ptr<ZLInputStream> stream = file.inputStream();
+ if (stream.isNull()) {
+ return false;
+ }
+
+ PlainTextFormat format(file);
+ if (!format.initialized()) {
+ PlainTextFormatDetector detector;
+ detector.detect(*stream, format);
+ }
+
+ std::string directoryPrefix = MiscUtil::htmlDirectoryPrefix(file.path());
+ HtmlBookReader reader(directoryPrefix, model, format, book.encoding());
+ reader.setFileName(MiscUtil::htmlFileName(file.path()));
+ reader.readDocument(*stream);
+
+ return true;
+}
+
+FormatInfoPage *HtmlPlugin::createInfoPage(ZLOptionsDialog &dialog, const ZLFile &file) {
+ return new PlainTextInfoPage(dialog, file, ZLResourceKey("<PRE>"), false);
+}
+
+bool HtmlPlugin::readLanguageAndEncoding(Book &book) const {
+ (void)book;
+ return true;
+}
diff --git a/reader/src/formats/html/HtmlPlugin.h b/reader/src/formats/html/HtmlPlugin.h
new file mode 100644
index 0000000..c66a108
--- /dev/null
+++ b/reader/src/formats/html/HtmlPlugin.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (C) 2004-2012 Geometer Plus <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+
+#ifndef __HTMLPLUGIN_H__
+#define __HTMLPLUGIN_H__
+
+#include "../FormatPlugin.h"
+
+class HtmlPlugin : public FormatPlugin {
+
+public:
+ HtmlPlugin();
+ ~HtmlPlugin();
+ bool providesMetaInfo() const;
+ bool acceptsFile(const ZLFile &file) const;
+ bool readMetaInfo(Book &book) const;
+ bool readLanguageAndEncoding(Book &book) const;
+ bool readModel(BookModel &model) const;
+ FormatInfoPage *createInfoPage(ZLOptionsDialog &dialog, const ZLFile &file);
+};
+
+inline HtmlPlugin::HtmlPlugin() {}
+inline HtmlPlugin::~HtmlPlugin() {}
+inline bool HtmlPlugin::providesMetaInfo() const { return false; }
+
+#endif /* __HTMLPLUGIN_H__ */
diff --git a/reader/src/formats/html/HtmlReader.cpp b/reader/src/formats/html/HtmlReader.cpp
new file mode 100644
index 0000000..a5ce7fa
--- /dev/null
+++ b/reader/src/formats/html/HtmlReader.cpp
@@ -0,0 +1,373 @@
+/*
+ * Copyright (C) 2004-2012 Geometer Plus <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+
+#include <algorithm>
+#include <cctype>
+
+#include <ZLInputStream.h>
+#include <ZLXMLReader.h>
+#include <ZLFile.h>
+#include <ZLStringUtil.h>
+#include <ZLUnicodeUtil.h>
+
+#include "HtmlReader.h"
+#include "HtmlEntityCollection.h"
+
+HtmlReader::HtmlReader(const std::string &encoding) : EncodedTextReader(encoding) {
+}
+
+HtmlReader::~HtmlReader() {
+}
+
+void HtmlReader::setTag(HtmlTag &tag, const std::string &name) {
+ tag.Attributes.clear();
+
+ if (name.length() == 0) {
+ tag.Name = name;
+ return;
+ }
+
+ tag.Start = name[0] != '/';
+ if (tag.Start) {
+ tag.Name = name;
+ } else {
+ tag.Name = name.substr(1);
+ }
+
+ const std::size_t len = tag.Name.length();
+ for (std::size_t i = 0; i < len; ++i) {
+ tag.Name[i] = std::toupper(tag.Name[i]);
+ }
+}
+
+enum ParseState {
+ PS_TEXT,
+ PS_TAGSTART,
+ PS_TAGNAME,
+ PS_WAIT_END_OF_TAG,
+ PS_ATTRIBUTENAME,
+ PS_ATTRIBUTEVALUE,
+ PS_SKIPTAG,
+ PS_COMMENT,
+ PS_SPECIAL,
+ PS_SPECIAL_IN_ATTRIBUTEVALUE,
+};
+
+enum SpecialType {
+ ST_UNKNOWN,
+ ST_NUM,
+ ST_NAME,
+ ST_DEC,
+ ST_HEX
+};
+
+static bool allowSymbol(SpecialType type, char ch) {
+ return
+ (type == ST_NAME && std::isalpha(ch)) ||
+ (type == ST_DEC && std::isdigit(ch)) ||
+ (type == ST_HEX && std::isxdigit(ch));
+}
+
+static int specialSymbolNumber(SpecialType type, const std::string &txt) {
+ char *end = 0;
+ switch (type) {
+ case ST_NAME:
+ return HtmlEntityCollection::symbolNumber(txt);
+ case ST_DEC:
+ return std::strtol(txt.c_str() + 1, &end, 10);
+ case ST_HEX:
+ return std::strtol(txt.c_str() + 2, &end, 16);
+ default:
+ return 0;
+ }
+}
+
+void HtmlReader::appendString(std::string &to, std::string &from) {
+ if (myConverter.isNull()) {
+ to += from;
+ } else {
+ myConverter->convert(to, from);
+ myConverter->reset();
+ }
+ from.erase();
+}
+
+void HtmlReader::readDocument(ZLInputStream &stream) {
+ if (!stream.open()) {
+ return;
+ }
+
+ startDocumentHandler();
+
+ ParseState state = PS_TEXT;
+ SpecialType state_special = ST_UNKNOWN;
+ std::string currentString;
+ std::string attributeValueString;
+ std::string specialString;
+ int quotationCounter = 0;
+ HtmlTag currentTag;
+ char endOfComment[2] = "\0";
+
+ const std::size_t BUFSIZE = 2048;
+ char *buffer = new char[BUFSIZE];
+ std::size_t length;
+ std::size_t offset = 0;
+ do {
+ length = stream.read(buffer, BUFSIZE);
+ char *start = buffer;
+ char *endOfBuffer = buffer + length;
+ for (char *ptr = buffer; ptr < endOfBuffer; ++ptr) {
+ switch (state) {
+ case PS_TEXT:
+ if (*ptr == '<') {
+ if (!characterDataHandler(start, ptr - start, true)) {
+ goto endOfProcessing;
+ }
+ start = ptr + 1;
+ state = PS_TAGSTART;
+ currentTag.Offset = offset + (ptr - buffer);
+ }
+ if (*ptr == '&') {
+ if (!characterDataHandler(start, ptr - start, true)) {
+ goto endOfProcessing;
+ }
+ start = ptr + 1;
+ state = PS_SPECIAL;
+ state_special = ST_UNKNOWN;
+ }
+ break;
+ case PS_SPECIAL:
+ case PS_SPECIAL_IN_ATTRIBUTEVALUE:
+ if (state_special == ST_UNKNOWN) {
+ if (*ptr == '#') {
+ state_special = ST_NUM;
+ } else if (std::isalpha(*ptr)) {
+ state_special = ST_NAME;
+ } else {
+ start = ptr;
+ state = (state == PS_SPECIAL) ? PS_TEXT : PS_ATTRIBUTEVALUE;
+ }
+ } else if (state_special == ST_NUM) {
+ if (*ptr == 'x') {
+ state_special = ST_HEX;
+ } else if (std::isdigit(*ptr)) {
+ state_special = ST_DEC;
+ } else {
+ start = ptr;
+ state = (state == PS_SPECIAL) ? PS_TEXT : PS_ATTRIBUTEVALUE;
+ }
+ } else {
+ if (*ptr == ';') {
+ specialString.append(start, ptr - start);
+ int number = specialSymbolNumber(state_special, specialString);
+ if ((128 <= number) && (number <= 159)) {
+ char ch = number;
+ if (state == PS_SPECIAL) {
+ characterDataHandler(&ch, 1, true);
+ } else {
+ myConverter->convert(attributeValueString, &ch, &ch + 1);
+ }
+ } else if (number != 0) {
+ char buffer[4];
+ int len = ZLUnicodeUtil::ucs4ToUtf8(buffer, number);
+ if (state == PS_SPECIAL) {
+ characterDataHandler(buffer, len, false);
+ } else {
+ attributeValueString.append(buffer, len);
+ }
+ } else {
+ specialString = "&" + specialString + ";";
+ if (state == PS_SPECIAL) {
+ characterDataHandler(specialString.c_str(), specialString.length(), false);
+ } else {
+ attributeValueString += specialString;
+ }
+ }
+ specialString.erase();
+ start = ptr + 1;
+ state = (state == PS_SPECIAL) ? PS_TEXT : PS_ATTRIBUTEVALUE;
+ } else if (!allowSymbol(state_special, *ptr)) {
+ start = ptr;
+ state = (state == PS_SPECIAL) ? PS_TEXT : PS_ATTRIBUTEVALUE;
+ }
+ }
+ break;
+ case PS_TAGSTART:
+ state = (*ptr == '!') ? PS_COMMENT : PS_TAGNAME;
+ break;
+ case PS_COMMENT:
+ if ((endOfComment[0] == '\0') && (*ptr != '-')) {
+ state = PS_TAGNAME;
+ } else if ((endOfComment[0] == '-') && (endOfComment[1] == '-') && (*ptr == '>')) {
+ start = ptr + 1;
+ state = PS_TEXT;
+ endOfComment[0] = '\0';
+ endOfComment[1] = '\0';
+ } else {
+ endOfComment[0] = endOfComment[1];
+ endOfComment[1] = *ptr;
+ }
+ break;
+ case PS_WAIT_END_OF_TAG:
+ if (*ptr == '>') {
+ start = ptr + 1;
+ state = PS_TEXT;
+ }
+ break;
+ case PS_TAGNAME:
+ if (*ptr == '>' || *ptr == '/' || std::isspace((unsigned char)*ptr)) {
+ currentString.append(start, ptr - start);
+ start = ptr + 1;
+ setTag(currentTag, currentString);
+ currentString.erase();
+ if (currentTag.Name == "") {
+ state = *ptr == '>' ? PS_TEXT : PS_SKIPTAG;
+ } else {
+ if (*ptr == '>') {
+ if (!tagHandler(currentTag)) {
+ goto endOfProcessing;
+ }
+ state = PS_TEXT;
+ } else if (*ptr == '/') {
+ if (!tagHandler(currentTag)) {
+ goto endOfProcessing;
+ }
+ currentTag.Start = false;
+ if (!tagHandler(currentTag)) {
+ goto endOfProcessing;
+ }
+ state = PS_WAIT_END_OF_TAG;
+ } else {
+ state = PS_ATTRIBUTENAME;
+ }
+ }
+ }
+ break;
+ case PS_ATTRIBUTENAME:
+ if (*ptr == '>' || *ptr == '/' || *ptr == '=' || std::isspace((unsigned char)*ptr)) {
+ if (ptr != start || !currentString.empty()) {
+ currentString.append(start, ptr - start);
+ for (unsigned int i = 0; i < currentString.length(); ++i) {
+ currentString[i] = std::toupper(currentString[i]);
+ }
+ currentTag.addAttribute(currentString);
+ currentString.erase();
+ }
+ start = ptr + 1;
+ if (*ptr == '>') {
+ if (!tagHandler(currentTag)) {
+ goto endOfProcessing;
+ }
+ state = PS_TEXT;
+ } else if (*ptr == '/') {
+ if (!tagHandler(currentTag)) {
+ goto endOfProcessing;
+ }
+ currentTag.Start = false;
+ if (!tagHandler(currentTag)) {
+ goto endOfProcessing;
+ }
+ state = PS_WAIT_END_OF_TAG;
+ } else {
+ state = (*ptr == '=') ? PS_ATTRIBUTEVALUE : PS_ATTRIBUTENAME;
+ }
+ }
+ break;
+ case PS_ATTRIBUTEVALUE:
+ if (*ptr == '"') {
+ if (((ptr == start) && currentString.empty()) || (quotationCounter > 0)) {
+ ++quotationCounter;
+ }
+ } else if (*ptr == '&') {
+ currentString.append(start, ptr - start);
+ start = ptr + 1;
+ appendString(attributeValueString, currentString);
+ state = PS_SPECIAL_IN_ATTRIBUTEVALUE;
+ state_special = ST_UNKNOWN;
+ } else if (quotationCounter != 1 && (*ptr == '>' || *ptr == '/' || std::isspace((unsigned char)*ptr))) {
+ if (ptr != start || !currentString.empty()) {
+ currentString.append(start, ptr - start);
+ appendString(attributeValueString, currentString);
+ if (attributeValueString[0] == '"') {
+ attributeValueString = attributeValueString.substr(1, attributeValueString.length() - 2);
+ }
+ currentTag.setLastAttributeValue(attributeValueString);
+ attributeValueString.erase();
+ quotationCounter = 0;
+ }
+ start = ptr + 1;
+ if (*ptr == '>') {
+ if (!tagHandler(currentTag)) {
+ goto endOfProcessing;
+ }
+ state = PS_TEXT;
+ } else if (*ptr == '/') {
+ if (!tagHandler(currentTag)) {
+ goto endOfProcessing;
+ }
+ currentTag.Start = false;
+ if (!tagHandler(currentTag)) {
+ goto endOfProcessing;
+ }
+ state = PS_WAIT_END_OF_TAG;
+ } else {
+ state = PS_ATTRIBUTENAME;
+ }
+ }
+ break;
+ case PS_SKIPTAG:
+ if (*ptr == '>') {
+ start = ptr + 1;
+ state = PS_TEXT;
+ }
+ break;
+ }
+ }
+ if (start != endOfBuffer) {
+ switch (state) {
+ case PS_TEXT:
+ if (!characterDataHandler(start, endOfBuffer - start, true)) {
+ goto endOfProcessing;
+ }
+ break;
+ case PS_TAGNAME:
+ case PS_ATTRIBUTENAME:
+ case PS_ATTRIBUTEVALUE:
+ currentString.append(start, endOfBuffer - start);
+ break;
+ case PS_SPECIAL:
+ case PS_SPECIAL_IN_ATTRIBUTEVALUE:
+ specialString.append(start, endOfBuffer - start);
+ break;
+ case PS_TAGSTART:
+ case PS_SKIPTAG:
+ case PS_COMMENT:
+ case PS_WAIT_END_OF_TAG:
+ break;
+ }
+ }
+ offset += length;
+ } while (length == BUFSIZE);
+endOfProcessing:
+ delete[] buffer;
+
+ endDocumentHandler();
+
+ stream.close();
+}
diff --git a/reader/src/formats/html/HtmlReader.h b/reader/src/formats/html/HtmlReader.h
new file mode 100644
index 0000000..876fad8
--- /dev/null
+++ b/reader/src/formats/html/HtmlReader.h
@@ -0,0 +1,92 @@
+/*
+ * Copyright (C) 2004-2012 Geometer Plus <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+
+#ifndef __HTMLREADER_H__
+#define __HTMLREADER_H__
+
+#include <string>
+#include <vector>
+
+#include <ZLEncodingConverter.h>
+#include "../EncodedTextReader.h"
+
+class ZLInputStream;
+
+class HtmlReader : public EncodedTextReader {
+
+public:
+ struct HtmlAttribute {
+ std::string Name;
+ std::string Value;
+ bool HasValue;
+
+ HtmlAttribute(const std::string &name);
+ ~HtmlAttribute();
+ void setValue(const std::string &value);
+ };
+
+ struct HtmlTag {
+ std::string Name;
+ std::size_t Offset;
+ bool Start;
+ std::vector<HtmlAttribute> Attributes;
+
+ HtmlTag();
+ ~HtmlTag();
+ void addAttribute(const std::string &name);
+ void setLastAttributeValue(const std::string &value);
+
+ private:
+ HtmlTag(const HtmlTag&);
+ const HtmlTag &operator = (const HtmlTag&);
+ };
+
+private:
+ static void setTag(HtmlTag &tag, const std::string &fullName);
+
+public:
+ virtual void readDocument(ZLInputStream &stream);
+
+protected:
+ HtmlReader(const std::string &encoding);
+ virtual ~HtmlReader();
+
+protected:
+ virtual void startDocumentHandler() = 0;
+ virtual void endDocumentHandler() = 0;
+
+ // returns false iff processing must be stopped
+ virtual bool tagHandler(const HtmlTag &tag) = 0;
+ // returns false iff processing must be stopped
+ virtual bool characterDataHandler(const char *text, std::size_t len, bool convert) = 0;
+
+private:
+ void appendString(std::string &to, std::string &from);
+};
+
+inline HtmlReader::HtmlAttribute::HtmlAttribute(const std::string &name) : Name(name), HasValue(false) {}
+inline HtmlReader::HtmlAttribute::~HtmlAttribute() {}
+inline void HtmlReader::HtmlAttribute::setValue(const std::string &value) { Value = value; HasValue = true; }
+
+inline HtmlReader::HtmlTag::HtmlTag() : Start(true) {}
+inline HtmlReader::HtmlTag::~HtmlTag() {}
+inline void HtmlReader::HtmlTag::addAttribute(const std::string &name) { Attributes.push_back(HtmlAttribute(name)); }
+inline void HtmlReader::HtmlTag::setLastAttributeValue(const std::string &value) { if (!Attributes.empty()) Attributes.back().setValue(value); }
+
+#endif /* __HTMLREADER_H__ */
diff --git a/reader/src/formats/html/HtmlReaderStream.cpp b/reader/src/formats/html/HtmlReaderStream.cpp
new file mode 100644
index 0000000..08c43ae
--- /dev/null
+++ b/reader/src/formats/html/HtmlReaderStream.cpp
@@ -0,0 +1,128 @@
+/*
+ * Copyright (C) 2008-2012 Geometer Plus <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+
+#include <cstdlib>
+#include <cstring>
+#include <algorithm>
+
+#include "HtmlReaderStream.h"
+#include "HtmlReader.h"
+
+class HtmlTextOnlyReader : public HtmlReader {
+
+public:
+ HtmlTextOnlyReader(char *buffer, std::size_t maxSize);
+ std::size_t size() const;
+
+private:
+ void startDocumentHandler();
+ void endDocumentHandler();
+
+ bool tagHandler(const HtmlTag &tag);
+ bool characterDataHandler(const char *text, std::size_t len, bool convert);
+
+private:
+ char *myBuffer;
+ std::size_t myMaxSize;
+ std::size_t myFilledSize;
+ bool myIgnoreText;
+};
+
+HtmlTextOnlyReader::HtmlTextOnlyReader(char *buffer, std::size_t maxSize) : HtmlReader(std::string()), myBuffer(buffer), myMaxSize(maxSize), myFilledSize(0), myIgnoreText(false) {
+}
+
+std::size_t HtmlTextOnlyReader::size() const {
+ return myFilledSize;
+}
+
+void HtmlTextOnlyReader::startDocumentHandler() {
+}
+
+void HtmlTextOnlyReader::endDocumentHandler() {
+}
+
+bool HtmlTextOnlyReader::tagHandler(const HtmlTag &tag) {
+ if (tag.Name == "SCRIPT") {
+ myIgnoreText = tag.Start;
+ }
+ if ((myFilledSize < myMaxSize) && (myFilledSize > 0) && (myBuffer[myFilledSize - 1] != '\n')) {
+ myBuffer[myFilledSize++] = '\n';
+ }
+ return myFilledSize < myMaxSize;
+}
+
+bool HtmlTextOnlyReader::characterDataHandler(const char *text, std::size_t len, bool) {
+ if (!myIgnoreText) {
+ len = std::min((std::size_t)len, myMaxSize - myFilledSize);
+ std::memcpy(myBuffer + myFilledSize, text, len);
+ myFilledSize += len;
+ }
+ return myFilledSize < myMaxSize;
+}
+
+HtmlReaderStream::HtmlReaderStream(shared_ptr<ZLInputStream> base, std::size_t maxSize) : myBase(base), myBuffer(0), mySize(maxSize) {
+}
+
+HtmlReaderStream::~HtmlReaderStream() {
+ close();
+}
+
+bool HtmlReaderStream::open() {
+ if (myBase.isNull() || !myBase->open()) {
+ return false;
+ }
+ myBuffer = new char[mySize];
+ HtmlTextOnlyReader reader(myBuffer, mySize);
+ reader.readDocument(*myBase);
+ mySize = reader.size();
+ myOffset = 0;
+ myBase->close();
+ return true;
+}
+
+std::size_t HtmlReaderStream::read(char *buffer, std::size_t maxSize) {
+ maxSize = std::min(maxSize, mySize - myOffset);
+ if (buffer != 0) {
+ std::memcpy(buffer, myBuffer, maxSize);
+ }
+ myOffset += maxSize;
+ return maxSize;
+}
+
+void HtmlReaderStream::close() {
+ if (myBuffer != 0) {
+ delete[] myBuffer;
+ myBuffer = 0;
+ }
+}
+
+void HtmlReaderStream::seek(int offset, bool absoluteOffset) {
+ if (!absoluteOffset) {
+ offset += myOffset;
+ }
+ myOffset = std::min(mySize, (std::size_t)std::max(0, offset));
+}
+
+std::size_t HtmlReaderStream::offset() const {
+ return myOffset;
+}
+
+std::size_t HtmlReaderStream::sizeOfOpened() {
+ return mySize;
+}
diff --git a/reader/src/formats/html/HtmlReaderStream.h b/reader/src/formats/html/HtmlReaderStream.h
new file mode 100644
index 0000000..c5c15b8
--- /dev/null
+++ b/reader/src/formats/html/HtmlReaderStream.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (C) 2008-2012 Geometer Plus <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+
+#ifndef __HTMLREADERSTREAM_H__
+#define __HTMLREADERSTREAM_H__
+
+#include <shared_ptr.h>
+#include <ZLInputStream.h>
+
+class HtmlReaderStream : public ZLInputStream {
+
+public:
+ HtmlReaderStream(shared_ptr<ZLInputStream> base, std::size_t maxSize);
+ ~HtmlReaderStream();
+
+private:
+ bool open();
+ std::size_t read(char *buffer, std::size_t maxSize);
+ void close();
+
+ void seek(int offset, bool absoluteOffset);
+ std::size_t offset() const;
+ std::size_t sizeOfOpened();
+
+private:
+ shared_ptr<ZLInputStream> myBase;
+ char *myBuffer;
+ std::size_t mySize;
+ std::size_t myOffset;
+};
+
+#endif /* __HTMLREADERSTREAM_H__ */
diff --git a/reader/src/formats/html/HtmlTagActions.h b/reader/src/formats/html/HtmlTagActions.h
new file mode 100644
index 0000000..7da3f20
--- /dev/null
+++ b/reader/src/formats/html/HtmlTagActions.h
@@ -0,0 +1,158 @@
+/*
+ * Copyright (C) 2004-2012 Geometer Plus <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+
+#ifndef __HTMLTAGACTIONS_H__
+#define __HTMLTAGACTIONS_H__
+
+#include <set>
+
+#include "HtmlBookReader.h"
+
+class HtmlTagAction {
+
+protected:
+ HtmlTagAction(HtmlBookReader &reader);
+
+public:
+ virtual ~HtmlTagAction();
+ virtual void run(const HtmlReader::HtmlTag &tag) = 0;
+ virtual void reset();
+
+protected:
+ BookReader &bookReader();
+
+protected:
+ HtmlBookReader &myReader;
+};
+
+class DummyHtmlTagAction : public HtmlTagAction {
+
+public:
+ DummyHtmlTagAction(HtmlBookReader &reader);
+ void run(const HtmlReader::HtmlTag &tag);
+};
+
+class HtmlControlTagAction : public HtmlTagAction {
+
+public:
+ HtmlControlTagAction(HtmlBookReader &reader, FBTextKind kind);
+ void run(const HtmlReader::HtmlTag &tag);
+
+private:
+ FBTextKind myKind;
+};
+
+class HtmlHeaderTagAction : public HtmlTagAction {
+
+public:
+ HtmlHeaderTagAction(HtmlBookReader &reader, FBTextKind kind);
+ void run(const HtmlReader::HtmlTag &tag);
+
+private:
+ FBTextKind myKind;
+};
+
+class HtmlIgnoreTagAction : public HtmlTagAction {
+
+public:
+ HtmlIgnoreTagAction(HtmlBookReader &reader);
+ void run(const HtmlReader::HtmlTag &tag);
+
+private:
+ std::set<std::string> myTagNames;
+};
+
+class HtmlHrefTagAction : public HtmlTagAction {
+
+public:
+ HtmlHrefTagAction(HtmlBookReader &reader);
+ void run(const HtmlReader::HtmlTag &tag);
+ void reset();
+
+protected:
+ FBTextKind hyperlinkType() const;
+ void setHyperlinkType(FBTextKind hyperlinkType);
+
+private:
+ FBTextKind myHyperlinkType;
+};
+
+class HtmlImageTagAction : public HtmlTagAction {
+
+public:
+ HtmlImageTagAction(HtmlBookReader &reader);
+ void run(const HtmlReader::HtmlTag &tag);
+};
+
+class HtmlBreakTagAction : public HtmlTagAction {
+
+public:
+ enum BreakType {
+ BREAK_AT_START = 1,
+ BREAK_AT_END = 2,
+ BREAK_AT_START_AND_AT_END = BREAK_AT_START | BREAK_AT_END
+ };
+ HtmlBreakTagAction(HtmlBookReader &reader, BreakType breakType);
+ void run(const HtmlReader::HtmlTag &tag);
+
+private:
+ BreakType myBreakType;
+};
+
+class HtmlPreTagAction : public HtmlTagAction {
+
+public:
+ HtmlPreTagAction(HtmlBookReader &reader);
+ void run(const HtmlReader::HtmlTag &tag);
+};
+
+class HtmlListTagAction : public HtmlTagAction {
+
+public:
+ HtmlListTagAction(HtmlBookReader &reader, int startIndex);
+ void run(const HtmlReader::HtmlTag &tag);
+
+private:
+ int myStartIndex;
+};
+
+class HtmlListItemTagAction : public HtmlTagAction {
+
+public:
+ HtmlListItemTagAction(HtmlBookReader &reader);
+ void run(const HtmlReader::HtmlTag &tag);
+};
+
+class HtmlTableTagAction : public HtmlTagAction {
+
+public:
+ HtmlTableTagAction(HtmlBookReader &reader);
+ void run(const HtmlReader::HtmlTag &tag);
+};
+
+class HtmlStyleTagAction : public HtmlTagAction {
+
+public:
+ HtmlStyleTagAction(HtmlBookReader &reader);
+ void run(const HtmlReader::HtmlTag &tag);
+};
+
+inline BookReader &HtmlTagAction::bookReader() { return myReader.myBookReader; }
+
+#endif /* __HTMLTAGACTIONS_H__ */