diff options
Diffstat (limited to 'reader/src/formats/html')
-rw-r--r-- | reader/src/formats/html/HtmlBookReader.cpp | 583 | ||||
-rw-r--r-- | reader/src/formats/html/HtmlBookReader.h | 101 | ||||
-rw-r--r-- | reader/src/formats/html/HtmlDescriptionReader.cpp | 82 | ||||
-rw-r--r-- | reader/src/formats/html/HtmlDescriptionReader.h | 48 | ||||
-rw-r--r-- | reader/src/formats/html/HtmlEntityCollection.cpp | 71 | ||||
-rw-r--r-- | reader/src/formats/html/HtmlEntityCollection.h | 38 | ||||
-rw-r--r-- | reader/src/formats/html/HtmlPlugin.cpp | 83 | ||||
-rw-r--r-- | reader/src/formats/html/HtmlPlugin.h | 42 | ||||
-rw-r--r-- | reader/src/formats/html/HtmlReader.cpp | 373 | ||||
-rw-r--r-- | reader/src/formats/html/HtmlReader.h | 92 | ||||
-rw-r--r-- | reader/src/formats/html/HtmlReaderStream.cpp | 128 | ||||
-rw-r--r-- | reader/src/formats/html/HtmlReaderStream.h | 48 | ||||
-rw-r--r-- | reader/src/formats/html/HtmlTagActions.h | 158 |
13 files changed, 1847 insertions, 0 deletions
diff --git a/reader/src/formats/html/HtmlBookReader.cpp b/reader/src/formats/html/HtmlBookReader.cpp new file mode 100644 index 0000000..321913d --- /dev/null +++ b/reader/src/formats/html/HtmlBookReader.cpp @@ -0,0 +1,583 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#include <cctype> + +#include <ZLFile.h> +#include <ZLFileImage.h> +#include <ZLStringUtil.h> + +#include "HtmlBookReader.h" +#include "HtmlTagActions.h" +#include "../txt/PlainTextFormat.h" +#include "../util/MiscUtil.h" +#include "../../bookmodel/BookModel.h" +#include "../css/StyleSheetParser.h" + +HtmlTagAction::HtmlTagAction(HtmlBookReader &reader) : myReader(reader) { +} + +HtmlTagAction::~HtmlTagAction() { +} + +void HtmlTagAction::reset() { +} + +DummyHtmlTagAction::DummyHtmlTagAction(HtmlBookReader &reader) : HtmlTagAction(reader) { +} + +void DummyHtmlTagAction::run(const HtmlReader::HtmlTag&) { +} + +HtmlControlTagAction::HtmlControlTagAction(HtmlBookReader &reader, FBTextKind kind) : HtmlTagAction(reader), myKind(kind) { +} + +void HtmlControlTagAction::run(const HtmlReader::HtmlTag &tag) { + std::vector<FBTextKind> &list = myReader.myKindList; + int index; + for (index = list.size() - 1; index >= 0; --index) { + if (list[index] == myKind) { + break; + } + } + if (tag.Start) { + if (index == -1) { + bookReader().pushKind(myKind); + myReader.myKindList.push_back(myKind); + bookReader().addControl(myKind, true); + } + } else { + if (index >= 0) { + for (int i = list.size() - 1; i >= index; --i) { + bookReader().addControl(list[i], false); + bookReader().popKind(); + } + for (unsigned int j = index + 1; j < list.size(); ++j) { + bookReader().addControl(list[j], true); + bookReader().pushKind(list[j]); + } + list.erase(list.begin() + index); + } + } +} + +HtmlHeaderTagAction::HtmlHeaderTagAction(HtmlBookReader &reader, FBTextKind kind) : HtmlTagAction(reader), myKind(kind) { +} + +void HtmlHeaderTagAction::run(const HtmlReader::HtmlTag &tag) { + myReader.myIsStarted = false; + if (tag.Start) { + if (myReader.myBuildTableOfContent && !myReader.myIgnoreTitles) { + if (!bookReader().contentsParagraphIsOpen()) { + bookReader().insertEndOfSectionParagraph(); + bookReader().enterTitle(); + bookReader().beginContentsParagraph(); + } + } + bookReader().pushKind(myKind); + } else { + bookReader().popKind(); + if (myReader.myBuildTableOfContent && !myReader.myIgnoreTitles) { + bookReader().endContentsParagraph(); + bookReader().exitTitle(); + } + } + bookReader().beginParagraph(); +} + +HtmlIgnoreTagAction::HtmlIgnoreTagAction(HtmlBookReader &reader) : HtmlTagAction(reader) { +} + +void HtmlIgnoreTagAction::run(const HtmlReader::HtmlTag &tag) { + if (tag.Start) { + if (myTagNames.find(tag.Name) == myTagNames.end()) { + ++myReader.myIgnoreDataCounter; + myTagNames.insert(tag.Name); + } + } else { + if (myTagNames.find(tag.Name) != myTagNames.end()) { + --myReader.myIgnoreDataCounter; + myTagNames.erase(tag.Name); + } + } +} + +HtmlHrefTagAction::HtmlHrefTagAction(HtmlBookReader &reader) : HtmlTagAction(reader) { +} + +void HtmlHrefTagAction::run(const HtmlReader::HtmlTag &tag) { + if (tag.Start) { + for (unsigned int i = 0; i < tag.Attributes.size(); ++i) { + if (tag.Attributes[i].Name == "NAME") { + bookReader().addHyperlinkLabel(tag.Attributes[i].Value); + } else if ((hyperlinkType() == REGULAR) && (tag.Attributes[i].Name == "HREF")) { + std::string value = tag.Attributes[i].Value; + if (!myReader.myFileName.empty() && + (value.length() > myReader.myFileName.length()) && + (value.substr(0, myReader.myFileName.length()) == myReader.myFileName)) { + value = value.substr(myReader.myFileName.length()); + } + if (!value.empty()) { + if (value[0] == '#') { + setHyperlinkType(INTERNAL_HYPERLINK); + bookReader().addHyperlinkControl(INTERNAL_HYPERLINK, value.substr(1)); + } else { + FBTextKind hyperlinkType = MiscUtil::referenceType(value); + if (hyperlinkType != INTERNAL_HYPERLINK) { + setHyperlinkType(hyperlinkType); + bookReader().addHyperlinkControl(hyperlinkType, value); + } + } + } + } + } + } else if (hyperlinkType() != REGULAR) { + bookReader().addControl(hyperlinkType(), false); + setHyperlinkType(REGULAR); + } +} + +void HtmlHrefTagAction::reset() { + setHyperlinkType(REGULAR); +} + +FBTextKind HtmlHrefTagAction::hyperlinkType() const { + return myHyperlinkType; +} + +void HtmlHrefTagAction::setHyperlinkType(FBTextKind hyperlinkType) { + myHyperlinkType = hyperlinkType; +} + +HtmlImageTagAction::HtmlImageTagAction(HtmlBookReader &reader) : HtmlTagAction(reader) { +} + +void HtmlImageTagAction::run(const HtmlReader::HtmlTag &tag) { + if (tag.Start) { + bookReader().endParagraph(); + for (unsigned int i = 0; i < tag.Attributes.size(); ++i) { + if (tag.Attributes[i].Name == "SRC") { + const std::string fileName = MiscUtil::decodeHtmlURL(tag.Attributes[i].Value); + const ZLFile file(myReader.myBaseDirPath + fileName); + if (file.exists()) { + bookReader().addImageReference(fileName); + bookReader().addImage(fileName, new ZLFileImage(file, 0)); + } + break; + } + } + bookReader().beginParagraph(); + } +} + +HtmlBreakTagAction::HtmlBreakTagAction(HtmlBookReader &reader, BreakType breakType) : HtmlTagAction(reader), myBreakType(breakType) { +} + +void HtmlBreakTagAction::run(const HtmlReader::HtmlTag &tag) { + if (myReader.myDontBreakParagraph) { + myReader.myDontBreakParagraph = false; + return; + } + + if ((tag.Start && (myBreakType & BREAK_AT_START)) || + (!tag.Start && (myBreakType & BREAK_AT_END))) { + bookReader().endParagraph(); + if (bookReader().isKindStackEmpty()) { + bookReader().pushKind(REGULAR); + } + bookReader().beginParagraph(); + } +} + +HtmlPreTagAction::HtmlPreTagAction(HtmlBookReader &reader) : HtmlTagAction(reader) { +} + +void HtmlPreTagAction::run(const HtmlReader::HtmlTag &tag) { + bookReader().endParagraph(); + myReader.myIsPreformatted = tag.Start; + myReader.mySpaceCounter = -1; + myReader.myBreakCounter = 0; + if (myReader.myFormat.breakType() == PlainTextFormat::BREAK_PARAGRAPH_AT_NEW_LINE) { + if (tag.Start) { + bookReader().pushKind(PREFORMATTED); + } else { + bookReader().popKind(); + } + } + bookReader().beginParagraph(); +} + +HtmlListTagAction::HtmlListTagAction(HtmlBookReader &reader, int startIndex) : HtmlTagAction(reader), myStartIndex(startIndex) { +} + +void HtmlListTagAction::run(const HtmlReader::HtmlTag &tag) { + if (tag.Start) { + myReader.myListNumStack.push(myStartIndex); + } else if (!myReader.myListNumStack.empty()) { + myReader.myListNumStack.pop(); + } +} + +HtmlListItemTagAction::HtmlListItemTagAction(HtmlBookReader &reader) : HtmlTagAction(reader) { +} + +void HtmlListItemTagAction::run(const HtmlReader::HtmlTag &tag) { + if (tag.Start) { + bookReader().endParagraph(); + bookReader().beginParagraph(); + if (!myReader.myListNumStack.empty()) { + bookReader().addFixedHSpace(3 * myReader.myListNumStack.size()); + int &index = myReader.myListNumStack.top(); + if (index == 0) { + myReader.addConvertedDataToBuffer("\342\200\242 ", 4, false); + } else { + std::string number; + ZLStringUtil::appendNumber(number, index++); + number += ". "; + myReader.addConvertedDataToBuffer(number.data(), number.length(), false); + } + myReader.myDontBreakParagraph = true; + } + } else { + myReader.myDontBreakParagraph = false; + } +} + +HtmlTableTagAction::HtmlTableTagAction(HtmlBookReader &reader) : HtmlTagAction(reader) { +} + +void HtmlTableTagAction::run(const HtmlReader::HtmlTag &tag) { + if (tag.Start) { + myReader.myIgnoreTitles = true; + } else { + myReader.myIgnoreTitles = false; + } +} + +HtmlStyleTagAction::HtmlStyleTagAction(HtmlBookReader &reader) : HtmlTagAction(reader) { +} + +void HtmlStyleTagAction::run(const HtmlReader::HtmlTag &tag) { + myReader.myStyleSheetParser = tag.Start ? new StyleSheetTableParser(myReader.myStyleSheetTable) : 0; + /* + if (!tag.Start) { + myReader.myStyleSheetTable.dump(); + } + */ +} + +shared_ptr<HtmlTagAction> HtmlBookReader::createAction(const std::string &tag) { + if (tag == "EM") { + return new HtmlControlTagAction(*this, EMPHASIS); + } else if (tag == "STRONG") { + return new HtmlControlTagAction(*this, STRONG); + } else if (tag == "B") { + return new HtmlControlTagAction(*this, BOLD); + } else if (tag == "I") { + return new HtmlControlTagAction(*this, ITALIC); + } else if (tag == "TT") { + return new HtmlControlTagAction(*this, CODE); + } else if (tag == "CODE") { + return new HtmlControlTagAction(*this, CODE); + } else if (tag == "CITE") { + return new HtmlControlTagAction(*this, CITE); + } else if (tag == "SUB") { + return new HtmlControlTagAction(*this, SUB); + } else if (tag == "SUP") { + return new HtmlControlTagAction(*this, SUP); + } else if (tag == "H1") { + return new HtmlHeaderTagAction(*this, H1); + } else if (tag == "H2") { + return new HtmlHeaderTagAction(*this, H2); + } else if (tag == "H3") { + return new HtmlHeaderTagAction(*this, H3); + } else if (tag == "H4") { + return new HtmlHeaderTagAction(*this, H4); + } else if (tag == "H5") { + return new HtmlHeaderTagAction(*this, H5); + } else if (tag == "H6") { + return new HtmlHeaderTagAction(*this, H6); + } else if (tag == "HEAD") { + return new HtmlIgnoreTagAction(*this); + } else if (tag == "TITLE") { + return new HtmlIgnoreTagAction(*this); + } else if (tag == "STYLE") { + return new HtmlStyleTagAction(*this); + } else if (tag == "SELECT") { + return new HtmlIgnoreTagAction(*this); + } else if (tag == "SCRIPT") { + return new HtmlIgnoreTagAction(*this); + } else if (tag == "A") { + return new HtmlHrefTagAction(*this); + } else if (tag == "TD") { + //return new HtmlBreakTagAction(*this, HtmlBreakTagAction::BREAK_AT_END); + } else if (tag == "TR") { + return new HtmlBreakTagAction(*this, HtmlBreakTagAction::BREAK_AT_END); + } else if (tag == "DIV") { + return new HtmlBreakTagAction(*this, HtmlBreakTagAction::BREAK_AT_END); + } else if (tag == "DT") { + return new HtmlBreakTagAction(*this, HtmlBreakTagAction::BREAK_AT_START); + } else if (tag == "P") { + return new HtmlBreakTagAction(*this, HtmlBreakTagAction::BREAK_AT_START_AND_AT_END); + } else if (tag == "BR") { + return new HtmlBreakTagAction(*this, HtmlBreakTagAction::BREAK_AT_START_AND_AT_END); + } else if (tag == "IMG") { + return new HtmlImageTagAction(*this); + } else if (tag == "UL") { + return new HtmlListTagAction(*this, 0); + } else if (tag == "MENU") { + return new HtmlListTagAction(*this, 0); + } else if (tag == "DIR") { + return new HtmlListTagAction(*this, 0); + } else if (tag == "OL") { + return new HtmlListTagAction(*this, 1); + } else if (tag == "LI") { + return new HtmlListItemTagAction(*this); + } else if (tag == "PRE") { + if (myProcessPreTag) { + return new HtmlPreTagAction(*this); + } + } else if (tag == "TABLE") { + return new HtmlTableTagAction(*this); + } + /* + } else if (tag == "DD") { + return 0; + } else if (tag == "DL") { + return 0; + } else if (tag == "DFN") { + return 0; + } else if (tag == "SAMP") { + return 0; + } else if (tag == "KBD") { + return 0; + } else if (tag == "VAR") { + return 0; + } else if (tag == "ABBR") { + return 0; + } else if (tag == "ACRONYM") { + return 0; + } else if (tag == "BLOCKQUOTE") { + return 0; + } else if (tag == "Q") { + return 0; + } else if (tag == "INS") { + return 0; + } else if (tag == "DEL") { + return 0; + } else if (tag == "BODY") { + return 0; + */ + return new DummyHtmlTagAction(*this); +} + +void HtmlBookReader::setBuildTableOfContent(bool build) { + myBuildTableOfContent = build; +} + +void HtmlBookReader::setProcessPreTag(bool process) { + myProcessPreTag = process; +} + +HtmlBookReader::HtmlBookReader(const std::string &baseDirectoryPath, BookModel &model, const PlainTextFormat &format, const std::string &encoding) : HtmlReader(encoding), myBookReader(model), myBaseDirPath(baseDirectoryPath), myFormat(format), myBuildTableOfContent(true), myProcessPreTag(true) { +} + +HtmlBookReader::~HtmlBookReader() { +} + +void HtmlBookReader::addConvertedDataToBuffer(const char *text, std::size_t len, bool convert) { + if (len > 0) { + if (myDontBreakParagraph) { + while (len > 0 && std::isspace(*text)) { + --len; + ++text; + } + if (len == 0) { + return; + } + } + if (convert) { + myConverter->convert(myConverterBuffer, text, text + len); + myBookReader.addData(myConverterBuffer); + myBookReader.addContentsData(myConverterBuffer); + myConverterBuffer.erase(); + } else { + std::string strText(text, len); + myBookReader.addData(strText); + myBookReader.addContentsData(strText); + } + myDontBreakParagraph = false; + } +} + +bool HtmlBookReader::tagHandler(const HtmlTag &tag) { + myConverter->reset(); + + for (unsigned int i = 0; i < tag.Attributes.size(); ++i) { + if (tag.Attributes[i].Name == "ID") { + myBookReader.addHyperlinkLabel(tag.Attributes[i].Value); + break; + } + } + shared_ptr<HtmlTagAction> action = myActionMap[tag.Name]; + if (action.isNull()) { + action = createAction(tag.Name); + myActionMap[tag.Name] = action; + } + action->run(tag); + + return true; +} + +void HtmlBookReader::preformattedCharacterDataHandler(const char *text, std::size_t len, bool convert) { + const char *start = text; + const char *end = text + len; + + int breakType = myFormat.breakType(); + if (breakType & PlainTextFormat::BREAK_PARAGRAPH_AT_NEW_LINE) { + for (const char *ptr = text; ptr != end; ++ptr) { + if (*ptr == '\n') { + mySpaceCounter = 0; + if (start < ptr) { + addConvertedDataToBuffer(start, ptr - start, convert); + } else { + static const std::string SPACE = " "; + myBookReader.addData(SPACE); + } + myBookReader.endParagraph(); + myBookReader.beginParagraph(); + start = ptr + 1; + } else if (mySpaceCounter >= 0) { + if (std::isspace((unsigned char)*ptr)) { + ++mySpaceCounter; + } else { + myBookReader.addFixedHSpace(mySpaceCounter); + mySpaceCounter = -1; + } + } + } + addConvertedDataToBuffer(start, end - start, convert); + } else if (breakType & PlainTextFormat::BREAK_PARAGRAPH_AT_LINE_WITH_INDENT) { + for (const char *ptr = text; ptr != end; ++ptr) { + if (std::isspace((unsigned char)*ptr)) { + if (*ptr == '\n') { + mySpaceCounter = 0; + } else if (mySpaceCounter >= 0) { + ++mySpaceCounter; + } + } else { + if (mySpaceCounter > myFormat.ignoredIndent()) { + if (ptr - start > mySpaceCounter) { + addConvertedDataToBuffer(start, ptr - start - mySpaceCounter, convert); + myBookReader.endParagraph(); + myBookReader.beginParagraph(); + } + start = ptr; + } + mySpaceCounter = -1; + } + } + mySpaceCounter = std::max(mySpaceCounter, 0); + if (end - start > mySpaceCounter) { + addConvertedDataToBuffer(start, end - start - mySpaceCounter, convert); + } + } else if (breakType & PlainTextFormat::BREAK_PARAGRAPH_AT_EMPTY_LINE) { + for (const char *ptr = start; ptr != end; ++ptr) { + if (std::isspace((unsigned char)*ptr)) { + if (*ptr == '\n') { + ++myBreakCounter; + } + } else { + if (myBreakCounter > 1) { + addConvertedDataToBuffer(start, ptr - start, convert); + myBookReader.endParagraph(); + myBookReader.beginParagraph(); + start = ptr; + } + myBreakCounter = 0; + } + } + addConvertedDataToBuffer(start, end - start, convert); + } +} + +bool HtmlBookReader::characterDataHandler(const char *text, std::size_t len, bool convert) { + if (!myStyleSheetParser.isNull()) { + myStyleSheetParser->parse(text, len); + return true; + } + + if (myIgnoreDataCounter != 0) { + return true; + } + + if (myIsPreformatted) { + preformattedCharacterDataHandler(text, len, convert); + return true; + } + + const char *ptr = text; + const char *end = text + len; + if (!myIsStarted) { + for (; ptr != end; ++ptr) { + if (!std::isspace((unsigned char)*ptr)) { + myIsStarted = true; + break; + } + } + } + if (myIsStarted) { + addConvertedDataToBuffer(ptr, end - ptr, convert); + } + return true; +} + +void HtmlBookReader::startDocumentHandler() { + while (!myListNumStack.empty()) { + myListNumStack.pop(); + } + myConverterBuffer.erase(); + myKindList.clear(); + + myBookReader.reset(); + myBookReader.setMainTextModel(); + myBookReader.pushKind(REGULAR); + myBookReader.beginParagraph(); + myIgnoreDataCounter = 0; + myIsPreformatted = false; + myDontBreakParagraph = false; + for (std::map<std::string,shared_ptr<HtmlTagAction> >::const_iterator it = myActionMap.begin(); it != myActionMap.end(); ++it) { + it->second->reset(); + } + myIsStarted = false; + myIgnoreTitles = false; + + myStyleSheetParser = 0; + + mySpaceCounter = -1; + myBreakCounter = 0; +} + +void HtmlBookReader::endDocumentHandler() { + myBookReader.endParagraph(); +} + +void HtmlBookReader::setFileName(const std::string fileName) { + myFileName = fileName; +} diff --git a/reader/src/formats/html/HtmlBookReader.h b/reader/src/formats/html/HtmlBookReader.h new file mode 100644 index 0000000..c8d4e32 --- /dev/null +++ b/reader/src/formats/html/HtmlBookReader.h @@ -0,0 +1,101 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#ifndef __HTMLBOOKREADER_H__ +#define __HTMLBOOKREADER_H__ + +#include <stack> + +#include <shared_ptr.h> + +#include "HtmlReader.h" +#include "../../bookmodel/BookReader.h" +#include "../css/StyleSheetTable.h" + +class BookModel; +class PlainTextFormat; +class StyleSheetParser; + +class HtmlTagAction; + +class HtmlBookReader : public HtmlReader { + +public: + HtmlBookReader(const std::string &baseDirectoryPath, BookModel &model, const PlainTextFormat &format, const std::string &encoding); + ~HtmlBookReader(); + void setFileName(const std::string fileName); + +protected: + virtual shared_ptr<HtmlTagAction> createAction(const std::string &tag); + void setBuildTableOfContent(bool build); + void setProcessPreTag(bool process); + +protected: + void startDocumentHandler(); + void endDocumentHandler(); + bool tagHandler(const HtmlTag &tag); + bool characterDataHandler(const char *text, std::size_t len, bool convert); + +private: + void preformattedCharacterDataHandler(const char *text, std::size_t len, bool convert); + void addConvertedDataToBuffer(const char *text, std::size_t len, bool convert); + +protected: + BookReader myBookReader; + std::string myBaseDirPath; + +private: + const PlainTextFormat &myFormat; + int myIgnoreDataCounter; + bool myIsPreformatted; + bool myDontBreakParagraph; + + bool myIsStarted; + bool myBuildTableOfContent; + bool myProcessPreTag; + bool myIgnoreTitles; + std::stack<int> myListNumStack; + + StyleSheetTable myStyleSheetTable; + shared_ptr<StyleSheetParser> myStyleSheetParser; + + int mySpaceCounter; + int myBreakCounter; + std::string myConverterBuffer; + + std::map<std::string,shared_ptr<HtmlTagAction> > myActionMap; + std::vector<FBTextKind> myKindList; + + std::string myFileName; + + friend class HtmlTagAction; + friend class HtmlControlTagAction; + friend class HtmlHeaderTagAction; + friend class HtmlIgnoreTagAction; + friend class HtmlHrefTagAction; + friend class HtmlImageTagAction; + friend class HtmlBreakTagAction; + friend class HtmlPreTagAction; + friend class HtmlListTagAction; + friend class HtmlListItemTagAction; + friend class HtmlTableTagAction; + friend class HtmlStyleTagAction; +}; + +#endif /* __HTMLBOOKREADER_H__ */ diff --git a/reader/src/formats/html/HtmlDescriptionReader.cpp b/reader/src/formats/html/HtmlDescriptionReader.cpp new file mode 100644 index 0000000..6ebcb8b --- /dev/null +++ b/reader/src/formats/html/HtmlDescriptionReader.cpp @@ -0,0 +1,82 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#include "HtmlDescriptionReader.h" + +#include "../../library/Book.h" + +HtmlDescriptionReader::HtmlDescriptionReader(Book &book) : HtmlReader(book.encoding()), myBook(book) { + myBook.setTitle(""); +} + +void HtmlDescriptionReader::startDocumentHandler() { + myReadTitle = false; +} + +void HtmlDescriptionReader::endDocumentHandler() { + if (!myBook.title().empty()) { + const char *titleStart = myBook.title().data(); + const char *titleEnd = titleStart + myBook.title().length(); + std::string newTitle; + myConverter->convert(newTitle, titleStart, titleEnd); + myBook.setTitle(newTitle); + } +} + +bool HtmlDescriptionReader::tagHandler(const HtmlTag &tag) { + if (tag.Name == "TITLE") { + if (myReadTitle && !tag.Start) { + myBook.setTitle(myBuffer); + myBuffer.erase(); + } + myReadTitle = tag.Start && myBook.title().empty(); + return true; + } else if (tag.Start && tag.Name == "META") { + std::vector<HtmlAttribute>::const_iterator it = tag.Attributes.begin(); + for (; it != tag.Attributes.end(); ++it) { + if (it->Name == "CONTENT") { + break; + } + } + if (it != tag.Attributes.end()) { + const std::string prefix = "charset="; + std::size_t index = it->Value.find(prefix); + if (index != std::string::npos) { + std::string charset = it->Value.substr(index + prefix.length()); + index = charset.find(';'); + if (index != std::string::npos) { + charset = charset.substr(0, index); + } + index = charset.find(' '); + if (index != std::string::npos) { + charset = charset.substr(0, index); + } + myBook.setEncoding(charset); + } + } + } + return tag.Name != "BODY"; +} + +bool HtmlDescriptionReader::characterDataHandler(const char *text, std::size_t len, bool) { + if (myReadTitle) { + myBuffer.append(text, len); + } + return true; +} diff --git a/reader/src/formats/html/HtmlDescriptionReader.h b/reader/src/formats/html/HtmlDescriptionReader.h new file mode 100644 index 0000000..159d4b0 --- /dev/null +++ b/reader/src/formats/html/HtmlDescriptionReader.h @@ -0,0 +1,48 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#ifndef __HTMLDESCRIPTIONREADER_H__ +#define __HTMLDESCRIPTIONREADER_H__ + +#include "HtmlReader.h" + +class Book; + +class HtmlDescriptionReader : public HtmlReader { + +public: + HtmlDescriptionReader(Book &book); + ~HtmlDescriptionReader(); + +protected: + void startDocumentHandler(); + void endDocumentHandler(); + + bool tagHandler(const HtmlTag &tag); + bool characterDataHandler(const char *text, std::size_t len, bool convert); + +private: + bool myReadTitle; + std::string myBuffer; + Book &myBook; +}; + +inline HtmlDescriptionReader::~HtmlDescriptionReader() {} + +#endif /* __HTMLDESCRIPTIONREADER_H__ */ diff --git a/reader/src/formats/html/HtmlEntityCollection.cpp b/reader/src/formats/html/HtmlEntityCollection.cpp new file mode 100644 index 0000000..bd1bb4e --- /dev/null +++ b/reader/src/formats/html/HtmlEntityCollection.cpp @@ -0,0 +1,71 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#include <cstdlib> +#include <cctype> + +#include <ZLibrary.h> +#include <ZLFile.h> +#include <ZLXMLReader.h> + +#include "HtmlEntityCollection.h" + +class CollectionReader : public ZLXMLReader { + +public: + CollectionReader(std::map<std::string,int> &collection); + void startElementHandler(const char *tag, const char **attributes); + +private: + std::map<std::string,int> &myCollection; +}; + +std::map<std::string,int> HtmlEntityCollection::ourCollection; + +int HtmlEntityCollection::symbolNumber(const std::string &name) { + if (ourCollection.empty()) { + CollectionReader(ourCollection).readDocument(ZLFile( + ZLibrary::ApplicationDirectory() + ZLibrary::FileNameDelimiter + + "formats" + ZLibrary::FileNameDelimiter + + "html" + ZLibrary::FileNameDelimiter + "html.ent" + )); + } + std::map<std::string,int>::const_iterator it = ourCollection.find(name); + return it == ourCollection.end() ? 0 : it->second; +} + +CollectionReader::CollectionReader(std::map<std::string,int> &collection) : myCollection(collection) { +} + +void CollectionReader::startElementHandler(const char *tag, const char **attributes) { + static const std::string ENTITY = "entity"; + + if (ENTITY == tag) { + for (int i = 0; i < 4; ++i) { + if (attributes[i] == 0) { + return; + } + } + static const std::string _name = "name"; + static const std::string _number = "number"; + if (_name == attributes[0] && _number == attributes[2]) { + myCollection[attributes[1]] = std::atoi(attributes[3]); + } + } +} diff --git a/reader/src/formats/html/HtmlEntityCollection.h b/reader/src/formats/html/HtmlEntityCollection.h new file mode 100644 index 0000000..6f70491 --- /dev/null +++ b/reader/src/formats/html/HtmlEntityCollection.h @@ -0,0 +1,38 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#ifndef __HTMLENTITYCOLLECTION_H__ +#define __HTMLENTITYCOLLECTION_H__ + +#include <string> +#include <map> + +class HtmlEntityCollection { + +public: + static int symbolNumber(const std::string &name); + +private: + static std::map<std::string,int> ourCollection; + +private: + HtmlEntityCollection(); +}; + +#endif /* __HTMLENTITYCOLLECTION_H__ */ diff --git a/reader/src/formats/html/HtmlPlugin.cpp b/reader/src/formats/html/HtmlPlugin.cpp new file mode 100644 index 0000000..279e096 --- /dev/null +++ b/reader/src/formats/html/HtmlPlugin.cpp @@ -0,0 +1,83 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#include <ZLStringUtil.h> +#include <ZLFile.h> +#include <ZLInputStream.h> + +#include "HtmlPlugin.h" +#include "HtmlDescriptionReader.h" +#include "HtmlBookReader.h" +#include "HtmlReaderStream.h" +#include "../txt/PlainTextFormat.h" +#include "../util/MiscUtil.h" +#include "../../library/Book.h" +#include "../../bookmodel/BookModel.h" + +bool HtmlPlugin::acceptsFile(const ZLFile &file) const { + const std::string &extension = file.extension(); + return ZLStringUtil::stringEndsWith(extension, "html") || (extension == "htm"); +} + +bool HtmlPlugin::readMetaInfo(Book &book) const { + shared_ptr<ZLInputStream> stream = book.file().inputStream(); + if (stream.isNull()) { + return false; + } + + shared_ptr<ZLInputStream> htmlStream = new HtmlReaderStream(stream, 50000); + detectEncodingAndLanguage(book, *htmlStream); + if (book.encoding().empty()) { + return false; + } + HtmlDescriptionReader(book).readDocument(*stream); + + return true; +} + +bool HtmlPlugin::readModel(BookModel &model) const { + const Book& book = *model.book(); + const ZLFile &file = book.file(); + shared_ptr<ZLInputStream> stream = file.inputStream(); + if (stream.isNull()) { + return false; + } + + PlainTextFormat format(file); + if (!format.initialized()) { + PlainTextFormatDetector detector; + detector.detect(*stream, format); + } + + std::string directoryPrefix = MiscUtil::htmlDirectoryPrefix(file.path()); + HtmlBookReader reader(directoryPrefix, model, format, book.encoding()); + reader.setFileName(MiscUtil::htmlFileName(file.path())); + reader.readDocument(*stream); + + return true; +} + +FormatInfoPage *HtmlPlugin::createInfoPage(ZLOptionsDialog &dialog, const ZLFile &file) { + return new PlainTextInfoPage(dialog, file, ZLResourceKey("<PRE>"), false); +} + +bool HtmlPlugin::readLanguageAndEncoding(Book &book) const { + (void)book; + return true; +} diff --git a/reader/src/formats/html/HtmlPlugin.h b/reader/src/formats/html/HtmlPlugin.h new file mode 100644 index 0000000..c66a108 --- /dev/null +++ b/reader/src/formats/html/HtmlPlugin.h @@ -0,0 +1,42 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#ifndef __HTMLPLUGIN_H__ +#define __HTMLPLUGIN_H__ + +#include "../FormatPlugin.h" + +class HtmlPlugin : public FormatPlugin { + +public: + HtmlPlugin(); + ~HtmlPlugin(); + bool providesMetaInfo() const; + bool acceptsFile(const ZLFile &file) const; + bool readMetaInfo(Book &book) const; + bool readLanguageAndEncoding(Book &book) const; + bool readModel(BookModel &model) const; + FormatInfoPage *createInfoPage(ZLOptionsDialog &dialog, const ZLFile &file); +}; + +inline HtmlPlugin::HtmlPlugin() {} +inline HtmlPlugin::~HtmlPlugin() {} +inline bool HtmlPlugin::providesMetaInfo() const { return false; } + +#endif /* __HTMLPLUGIN_H__ */ diff --git a/reader/src/formats/html/HtmlReader.cpp b/reader/src/formats/html/HtmlReader.cpp new file mode 100644 index 0000000..a5ce7fa --- /dev/null +++ b/reader/src/formats/html/HtmlReader.cpp @@ -0,0 +1,373 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#include <algorithm> +#include <cctype> + +#include <ZLInputStream.h> +#include <ZLXMLReader.h> +#include <ZLFile.h> +#include <ZLStringUtil.h> +#include <ZLUnicodeUtil.h> + +#include "HtmlReader.h" +#include "HtmlEntityCollection.h" + +HtmlReader::HtmlReader(const std::string &encoding) : EncodedTextReader(encoding) { +} + +HtmlReader::~HtmlReader() { +} + +void HtmlReader::setTag(HtmlTag &tag, const std::string &name) { + tag.Attributes.clear(); + + if (name.length() == 0) { + tag.Name = name; + return; + } + + tag.Start = name[0] != '/'; + if (tag.Start) { + tag.Name = name; + } else { + tag.Name = name.substr(1); + } + + const std::size_t len = tag.Name.length(); + for (std::size_t i = 0; i < len; ++i) { + tag.Name[i] = std::toupper(tag.Name[i]); + } +} + +enum ParseState { + PS_TEXT, + PS_TAGSTART, + PS_TAGNAME, + PS_WAIT_END_OF_TAG, + PS_ATTRIBUTENAME, + PS_ATTRIBUTEVALUE, + PS_SKIPTAG, + PS_COMMENT, + PS_SPECIAL, + PS_SPECIAL_IN_ATTRIBUTEVALUE, +}; + +enum SpecialType { + ST_UNKNOWN, + ST_NUM, + ST_NAME, + ST_DEC, + ST_HEX +}; + +static bool allowSymbol(SpecialType type, char ch) { + return + (type == ST_NAME && std::isalpha(ch)) || + (type == ST_DEC && std::isdigit(ch)) || + (type == ST_HEX && std::isxdigit(ch)); +} + +static int specialSymbolNumber(SpecialType type, const std::string &txt) { + char *end = 0; + switch (type) { + case ST_NAME: + return HtmlEntityCollection::symbolNumber(txt); + case ST_DEC: + return std::strtol(txt.c_str() + 1, &end, 10); + case ST_HEX: + return std::strtol(txt.c_str() + 2, &end, 16); + default: + return 0; + } +} + +void HtmlReader::appendString(std::string &to, std::string &from) { + if (myConverter.isNull()) { + to += from; + } else { + myConverter->convert(to, from); + myConverter->reset(); + } + from.erase(); +} + +void HtmlReader::readDocument(ZLInputStream &stream) { + if (!stream.open()) { + return; + } + + startDocumentHandler(); + + ParseState state = PS_TEXT; + SpecialType state_special = ST_UNKNOWN; + std::string currentString; + std::string attributeValueString; + std::string specialString; + int quotationCounter = 0; + HtmlTag currentTag; + char endOfComment[2] = "\0"; + + const std::size_t BUFSIZE = 2048; + char *buffer = new char[BUFSIZE]; + std::size_t length; + std::size_t offset = 0; + do { + length = stream.read(buffer, BUFSIZE); + char *start = buffer; + char *endOfBuffer = buffer + length; + for (char *ptr = buffer; ptr < endOfBuffer; ++ptr) { + switch (state) { + case PS_TEXT: + if (*ptr == '<') { + if (!characterDataHandler(start, ptr - start, true)) { + goto endOfProcessing; + } + start = ptr + 1; + state = PS_TAGSTART; + currentTag.Offset = offset + (ptr - buffer); + } + if (*ptr == '&') { + if (!characterDataHandler(start, ptr - start, true)) { + goto endOfProcessing; + } + start = ptr + 1; + state = PS_SPECIAL; + state_special = ST_UNKNOWN; + } + break; + case PS_SPECIAL: + case PS_SPECIAL_IN_ATTRIBUTEVALUE: + if (state_special == ST_UNKNOWN) { + if (*ptr == '#') { + state_special = ST_NUM; + } else if (std::isalpha(*ptr)) { + state_special = ST_NAME; + } else { + start = ptr; + state = (state == PS_SPECIAL) ? PS_TEXT : PS_ATTRIBUTEVALUE; + } + } else if (state_special == ST_NUM) { + if (*ptr == 'x') { + state_special = ST_HEX; + } else if (std::isdigit(*ptr)) { + state_special = ST_DEC; + } else { + start = ptr; + state = (state == PS_SPECIAL) ? PS_TEXT : PS_ATTRIBUTEVALUE; + } + } else { + if (*ptr == ';') { + specialString.append(start, ptr - start); + int number = specialSymbolNumber(state_special, specialString); + if ((128 <= number) && (number <= 159)) { + char ch = number; + if (state == PS_SPECIAL) { + characterDataHandler(&ch, 1, true); + } else { + myConverter->convert(attributeValueString, &ch, &ch + 1); + } + } else if (number != 0) { + char buffer[4]; + int len = ZLUnicodeUtil::ucs4ToUtf8(buffer, number); + if (state == PS_SPECIAL) { + characterDataHandler(buffer, len, false); + } else { + attributeValueString.append(buffer, len); + } + } else { + specialString = "&" + specialString + ";"; + if (state == PS_SPECIAL) { + characterDataHandler(specialString.c_str(), specialString.length(), false); + } else { + attributeValueString += specialString; + } + } + specialString.erase(); + start = ptr + 1; + state = (state == PS_SPECIAL) ? PS_TEXT : PS_ATTRIBUTEVALUE; + } else if (!allowSymbol(state_special, *ptr)) { + start = ptr; + state = (state == PS_SPECIAL) ? PS_TEXT : PS_ATTRIBUTEVALUE; + } + } + break; + case PS_TAGSTART: + state = (*ptr == '!') ? PS_COMMENT : PS_TAGNAME; + break; + case PS_COMMENT: + if ((endOfComment[0] == '\0') && (*ptr != '-')) { + state = PS_TAGNAME; + } else if ((endOfComment[0] == '-') && (endOfComment[1] == '-') && (*ptr == '>')) { + start = ptr + 1; + state = PS_TEXT; + endOfComment[0] = '\0'; + endOfComment[1] = '\0'; + } else { + endOfComment[0] = endOfComment[1]; + endOfComment[1] = *ptr; + } + break; + case PS_WAIT_END_OF_TAG: + if (*ptr == '>') { + start = ptr + 1; + state = PS_TEXT; + } + break; + case PS_TAGNAME: + if (*ptr == '>' || *ptr == '/' || std::isspace((unsigned char)*ptr)) { + currentString.append(start, ptr - start); + start = ptr + 1; + setTag(currentTag, currentString); + currentString.erase(); + if (currentTag.Name == "") { + state = *ptr == '>' ? PS_TEXT : PS_SKIPTAG; + } else { + if (*ptr == '>') { + if (!tagHandler(currentTag)) { + goto endOfProcessing; + } + state = PS_TEXT; + } else if (*ptr == '/') { + if (!tagHandler(currentTag)) { + goto endOfProcessing; + } + currentTag.Start = false; + if (!tagHandler(currentTag)) { + goto endOfProcessing; + } + state = PS_WAIT_END_OF_TAG; + } else { + state = PS_ATTRIBUTENAME; + } + } + } + break; + case PS_ATTRIBUTENAME: + if (*ptr == '>' || *ptr == '/' || *ptr == '=' || std::isspace((unsigned char)*ptr)) { + if (ptr != start || !currentString.empty()) { + currentString.append(start, ptr - start); + for (unsigned int i = 0; i < currentString.length(); ++i) { + currentString[i] = std::toupper(currentString[i]); + } + currentTag.addAttribute(currentString); + currentString.erase(); + } + start = ptr + 1; + if (*ptr == '>') { + if (!tagHandler(currentTag)) { + goto endOfProcessing; + } + state = PS_TEXT; + } else if (*ptr == '/') { + if (!tagHandler(currentTag)) { + goto endOfProcessing; + } + currentTag.Start = false; + if (!tagHandler(currentTag)) { + goto endOfProcessing; + } + state = PS_WAIT_END_OF_TAG; + } else { + state = (*ptr == '=') ? PS_ATTRIBUTEVALUE : PS_ATTRIBUTENAME; + } + } + break; + case PS_ATTRIBUTEVALUE: + if (*ptr == '"') { + if (((ptr == start) && currentString.empty()) || (quotationCounter > 0)) { + ++quotationCounter; + } + } else if (*ptr == '&') { + currentString.append(start, ptr - start); + start = ptr + 1; + appendString(attributeValueString, currentString); + state = PS_SPECIAL_IN_ATTRIBUTEVALUE; + state_special = ST_UNKNOWN; + } else if (quotationCounter != 1 && (*ptr == '>' || *ptr == '/' || std::isspace((unsigned char)*ptr))) { + if (ptr != start || !currentString.empty()) { + currentString.append(start, ptr - start); + appendString(attributeValueString, currentString); + if (attributeValueString[0] == '"') { + attributeValueString = attributeValueString.substr(1, attributeValueString.length() - 2); + } + currentTag.setLastAttributeValue(attributeValueString); + attributeValueString.erase(); + quotationCounter = 0; + } + start = ptr + 1; + if (*ptr == '>') { + if (!tagHandler(currentTag)) { + goto endOfProcessing; + } + state = PS_TEXT; + } else if (*ptr == '/') { + if (!tagHandler(currentTag)) { + goto endOfProcessing; + } + currentTag.Start = false; + if (!tagHandler(currentTag)) { + goto endOfProcessing; + } + state = PS_WAIT_END_OF_TAG; + } else { + state = PS_ATTRIBUTENAME; + } + } + break; + case PS_SKIPTAG: + if (*ptr == '>') { + start = ptr + 1; + state = PS_TEXT; + } + break; + } + } + if (start != endOfBuffer) { + switch (state) { + case PS_TEXT: + if (!characterDataHandler(start, endOfBuffer - start, true)) { + goto endOfProcessing; + } + break; + case PS_TAGNAME: + case PS_ATTRIBUTENAME: + case PS_ATTRIBUTEVALUE: + currentString.append(start, endOfBuffer - start); + break; + case PS_SPECIAL: + case PS_SPECIAL_IN_ATTRIBUTEVALUE: + specialString.append(start, endOfBuffer - start); + break; + case PS_TAGSTART: + case PS_SKIPTAG: + case PS_COMMENT: + case PS_WAIT_END_OF_TAG: + break; + } + } + offset += length; + } while (length == BUFSIZE); +endOfProcessing: + delete[] buffer; + + endDocumentHandler(); + + stream.close(); +} diff --git a/reader/src/formats/html/HtmlReader.h b/reader/src/formats/html/HtmlReader.h new file mode 100644 index 0000000..876fad8 --- /dev/null +++ b/reader/src/formats/html/HtmlReader.h @@ -0,0 +1,92 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#ifndef __HTMLREADER_H__ +#define __HTMLREADER_H__ + +#include <string> +#include <vector> + +#include <ZLEncodingConverter.h> +#include "../EncodedTextReader.h" + +class ZLInputStream; + +class HtmlReader : public EncodedTextReader { + +public: + struct HtmlAttribute { + std::string Name; + std::string Value; + bool HasValue; + + HtmlAttribute(const std::string &name); + ~HtmlAttribute(); + void setValue(const std::string &value); + }; + + struct HtmlTag { + std::string Name; + std::size_t Offset; + bool Start; + std::vector<HtmlAttribute> Attributes; + + HtmlTag(); + ~HtmlTag(); + void addAttribute(const std::string &name); + void setLastAttributeValue(const std::string &value); + + private: + HtmlTag(const HtmlTag&); + const HtmlTag &operator = (const HtmlTag&); + }; + +private: + static void setTag(HtmlTag &tag, const std::string &fullName); + +public: + virtual void readDocument(ZLInputStream &stream); + +protected: + HtmlReader(const std::string &encoding); + virtual ~HtmlReader(); + +protected: + virtual void startDocumentHandler() = 0; + virtual void endDocumentHandler() = 0; + + // returns false iff processing must be stopped + virtual bool tagHandler(const HtmlTag &tag) = 0; + // returns false iff processing must be stopped + virtual bool characterDataHandler(const char *text, std::size_t len, bool convert) = 0; + +private: + void appendString(std::string &to, std::string &from); +}; + +inline HtmlReader::HtmlAttribute::HtmlAttribute(const std::string &name) : Name(name), HasValue(false) {} +inline HtmlReader::HtmlAttribute::~HtmlAttribute() {} +inline void HtmlReader::HtmlAttribute::setValue(const std::string &value) { Value = value; HasValue = true; } + +inline HtmlReader::HtmlTag::HtmlTag() : Start(true) {} +inline HtmlReader::HtmlTag::~HtmlTag() {} +inline void HtmlReader::HtmlTag::addAttribute(const std::string &name) { Attributes.push_back(HtmlAttribute(name)); } +inline void HtmlReader::HtmlTag::setLastAttributeValue(const std::string &value) { if (!Attributes.empty()) Attributes.back().setValue(value); } + +#endif /* __HTMLREADER_H__ */ diff --git a/reader/src/formats/html/HtmlReaderStream.cpp b/reader/src/formats/html/HtmlReaderStream.cpp new file mode 100644 index 0000000..08c43ae --- /dev/null +++ b/reader/src/formats/html/HtmlReaderStream.cpp @@ -0,0 +1,128 @@ +/* + * Copyright (C) 2008-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#include <cstdlib> +#include <cstring> +#include <algorithm> + +#include "HtmlReaderStream.h" +#include "HtmlReader.h" + +class HtmlTextOnlyReader : public HtmlReader { + +public: + HtmlTextOnlyReader(char *buffer, std::size_t maxSize); + std::size_t size() const; + +private: + void startDocumentHandler(); + void endDocumentHandler(); + + bool tagHandler(const HtmlTag &tag); + bool characterDataHandler(const char *text, std::size_t len, bool convert); + +private: + char *myBuffer; + std::size_t myMaxSize; + std::size_t myFilledSize; + bool myIgnoreText; +}; + +HtmlTextOnlyReader::HtmlTextOnlyReader(char *buffer, std::size_t maxSize) : HtmlReader(std::string()), myBuffer(buffer), myMaxSize(maxSize), myFilledSize(0), myIgnoreText(false) { +} + +std::size_t HtmlTextOnlyReader::size() const { + return myFilledSize; +} + +void HtmlTextOnlyReader::startDocumentHandler() { +} + +void HtmlTextOnlyReader::endDocumentHandler() { +} + +bool HtmlTextOnlyReader::tagHandler(const HtmlTag &tag) { + if (tag.Name == "SCRIPT") { + myIgnoreText = tag.Start; + } + if ((myFilledSize < myMaxSize) && (myFilledSize > 0) && (myBuffer[myFilledSize - 1] != '\n')) { + myBuffer[myFilledSize++] = '\n'; + } + return myFilledSize < myMaxSize; +} + +bool HtmlTextOnlyReader::characterDataHandler(const char *text, std::size_t len, bool) { + if (!myIgnoreText) { + len = std::min((std::size_t)len, myMaxSize - myFilledSize); + std::memcpy(myBuffer + myFilledSize, text, len); + myFilledSize += len; + } + return myFilledSize < myMaxSize; +} + +HtmlReaderStream::HtmlReaderStream(shared_ptr<ZLInputStream> base, std::size_t maxSize) : myBase(base), myBuffer(0), mySize(maxSize) { +} + +HtmlReaderStream::~HtmlReaderStream() { + close(); +} + +bool HtmlReaderStream::open() { + if (myBase.isNull() || !myBase->open()) { + return false; + } + myBuffer = new char[mySize]; + HtmlTextOnlyReader reader(myBuffer, mySize); + reader.readDocument(*myBase); + mySize = reader.size(); + myOffset = 0; + myBase->close(); + return true; +} + +std::size_t HtmlReaderStream::read(char *buffer, std::size_t maxSize) { + maxSize = std::min(maxSize, mySize - myOffset); + if (buffer != 0) { + std::memcpy(buffer, myBuffer, maxSize); + } + myOffset += maxSize; + return maxSize; +} + +void HtmlReaderStream::close() { + if (myBuffer != 0) { + delete[] myBuffer; + myBuffer = 0; + } +} + +void HtmlReaderStream::seek(int offset, bool absoluteOffset) { + if (!absoluteOffset) { + offset += myOffset; + } + myOffset = std::min(mySize, (std::size_t)std::max(0, offset)); +} + +std::size_t HtmlReaderStream::offset() const { + return myOffset; +} + +std::size_t HtmlReaderStream::sizeOfOpened() { + return mySize; +} diff --git a/reader/src/formats/html/HtmlReaderStream.h b/reader/src/formats/html/HtmlReaderStream.h new file mode 100644 index 0000000..c5c15b8 --- /dev/null +++ b/reader/src/formats/html/HtmlReaderStream.h @@ -0,0 +1,48 @@ +/* + * Copyright (C) 2008-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#ifndef __HTMLREADERSTREAM_H__ +#define __HTMLREADERSTREAM_H__ + +#include <shared_ptr.h> +#include <ZLInputStream.h> + +class HtmlReaderStream : public ZLInputStream { + +public: + HtmlReaderStream(shared_ptr<ZLInputStream> base, std::size_t maxSize); + ~HtmlReaderStream(); + +private: + bool open(); + std::size_t read(char *buffer, std::size_t maxSize); + void close(); + + void seek(int offset, bool absoluteOffset); + std::size_t offset() const; + std::size_t sizeOfOpened(); + +private: + shared_ptr<ZLInputStream> myBase; + char *myBuffer; + std::size_t mySize; + std::size_t myOffset; +}; + +#endif /* __HTMLREADERSTREAM_H__ */ diff --git a/reader/src/formats/html/HtmlTagActions.h b/reader/src/formats/html/HtmlTagActions.h new file mode 100644 index 0000000..7da3f20 --- /dev/null +++ b/reader/src/formats/html/HtmlTagActions.h @@ -0,0 +1,158 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#ifndef __HTMLTAGACTIONS_H__ +#define __HTMLTAGACTIONS_H__ + +#include <set> + +#include "HtmlBookReader.h" + +class HtmlTagAction { + +protected: + HtmlTagAction(HtmlBookReader &reader); + +public: + virtual ~HtmlTagAction(); + virtual void run(const HtmlReader::HtmlTag &tag) = 0; + virtual void reset(); + +protected: + BookReader &bookReader(); + +protected: + HtmlBookReader &myReader; +}; + +class DummyHtmlTagAction : public HtmlTagAction { + +public: + DummyHtmlTagAction(HtmlBookReader &reader); + void run(const HtmlReader::HtmlTag &tag); +}; + +class HtmlControlTagAction : public HtmlTagAction { + +public: + HtmlControlTagAction(HtmlBookReader &reader, FBTextKind kind); + void run(const HtmlReader::HtmlTag &tag); + +private: + FBTextKind myKind; +}; + +class HtmlHeaderTagAction : public HtmlTagAction { + +public: + HtmlHeaderTagAction(HtmlBookReader &reader, FBTextKind kind); + void run(const HtmlReader::HtmlTag &tag); + +private: + FBTextKind myKind; +}; + +class HtmlIgnoreTagAction : public HtmlTagAction { + +public: + HtmlIgnoreTagAction(HtmlBookReader &reader); + void run(const HtmlReader::HtmlTag &tag); + +private: + std::set<std::string> myTagNames; +}; + +class HtmlHrefTagAction : public HtmlTagAction { + +public: + HtmlHrefTagAction(HtmlBookReader &reader); + void run(const HtmlReader::HtmlTag &tag); + void reset(); + +protected: + FBTextKind hyperlinkType() const; + void setHyperlinkType(FBTextKind hyperlinkType); + +private: + FBTextKind myHyperlinkType; +}; + +class HtmlImageTagAction : public HtmlTagAction { + +public: + HtmlImageTagAction(HtmlBookReader &reader); + void run(const HtmlReader::HtmlTag &tag); +}; + +class HtmlBreakTagAction : public HtmlTagAction { + +public: + enum BreakType { + BREAK_AT_START = 1, + BREAK_AT_END = 2, + BREAK_AT_START_AND_AT_END = BREAK_AT_START | BREAK_AT_END + }; + HtmlBreakTagAction(HtmlBookReader &reader, BreakType breakType); + void run(const HtmlReader::HtmlTag &tag); + +private: + BreakType myBreakType; +}; + +class HtmlPreTagAction : public HtmlTagAction { + +public: + HtmlPreTagAction(HtmlBookReader &reader); + void run(const HtmlReader::HtmlTag &tag); +}; + +class HtmlListTagAction : public HtmlTagAction { + +public: + HtmlListTagAction(HtmlBookReader &reader, int startIndex); + void run(const HtmlReader::HtmlTag &tag); + +private: + int myStartIndex; +}; + +class HtmlListItemTagAction : public HtmlTagAction { + +public: + HtmlListItemTagAction(HtmlBookReader &reader); + void run(const HtmlReader::HtmlTag &tag); +}; + +class HtmlTableTagAction : public HtmlTagAction { + +public: + HtmlTableTagAction(HtmlBookReader &reader); + void run(const HtmlReader::HtmlTag &tag); +}; + +class HtmlStyleTagAction : public HtmlTagAction { + +public: + HtmlStyleTagAction(HtmlBookReader &reader); + void run(const HtmlReader::HtmlTag &tag); +}; + +inline BookReader &HtmlTagAction::bookReader() { return myReader.myBookReader; } + +#endif /* __HTMLTAGACTIONS_H__ */ |