diff options
Diffstat (limited to 'reader/src/formats/doc/DocBookReader.cpp')
-rw-r--r-- | reader/src/formats/doc/DocBookReader.cpp | 377 |
1 files changed, 377 insertions, 0 deletions
diff --git a/reader/src/formats/doc/DocBookReader.cpp b/reader/src/formats/doc/DocBookReader.cpp new file mode 100644 index 0000000..99f471a --- /dev/null +++ b/reader/src/formats/doc/DocBookReader.cpp @@ -0,0 +1,377 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#include <vector> +#include <string> + +#include <ZLInputStream.h> +#include <ZLLogger.h> +#include <ZLFile.h> +#include <ZLStringUtil.h> +#include <ZLFileImage.h> + +#include "DocBookReader.h" +#include "../../bookmodel/BookModel.h" +#include "../../library/Book.h" + +#include "OleStorage.h" +#include "OleMainStream.h" + +DocBookReader::DocBookReader(BookModel &model, const std::string &encoding) : + myModelReader(model), + myPictureCounter(0), + myEncoding(encoding) { + myReadState = READ_TEXT; +} + +bool DocBookReader::readBook() { + const ZLFile &file = myModelReader.model().book()->file(); + shared_ptr<ZLInputStream> stream = file.inputStream(); + if (stream.isNull() || !stream->open()) { + return false; + } + myModelReader.setMainTextModel(); + myModelReader.pushKind(REGULAR); + myModelReader.beginParagraph(); + + if (!readDocument(stream, true)) { + return false; + } + + myModelReader.insertEndOfTextParagraph(); + return true; +} + +void DocBookReader::handleChar(ZLUnicodeUtil::Ucs2Char ucs2char) { + if (myReadState == READ_FIELD && myReadFieldState == READ_FIELD_INFO) { + myFieldInfoBuffer.push_back(ucs2char); + return; + } + if (myReadState == READ_FIELD && myReadFieldState == DONT_READ_FIELD_TEXT) { + return; + } + if (myReadState == READ_FIELD && myReadFieldState == READ_FIELD_TEXT && ucs2char == WORD_HORIZONTAL_TAB) { + //to remove pagination from TOC (from doc saved in OpenOffice) + myReadFieldState = DONT_READ_FIELD_TEXT; + return; + } + std::string utf8String; + ZLUnicodeUtil::Ucs2String ucs2String; + ucs2String.push_back(ucs2char); + ZLUnicodeUtil::ucs2ToUtf8(utf8String, ucs2String); + if (!myModelReader.paragraphIsOpen()) { + myModelReader.beginParagraph(); + } + myModelReader.addData(utf8String); +} + +void DocBookReader::handleHardLinebreak() { + if (myModelReader.paragraphIsOpen()) { + myModelReader.endParagraph(); + } + myModelReader.beginParagraph(); + if (!myCurrentStyleEntry.isNull()) { + myModelReader.addStyleEntry(*myCurrentStyleEntry); + } + for (std::size_t i = 0; i < myKindStack.size(); ++i) { + myModelReader.addControl(myKindStack.at(i), true); + } +} + +void DocBookReader::handleParagraphEnd() { + if (myModelReader.paragraphIsOpen()) { + myModelReader.endParagraph(); + } + myModelReader.beginParagraph(); + myCurrentStyleEntry = 0; +} + +void DocBookReader::handlePageBreak() { + if (myModelReader.paragraphIsOpen()) { + myModelReader.endParagraph(); + } + myCurrentStyleEntry = 0; + myModelReader.insertEndOfSectionParagraph(); + myModelReader.beginParagraph(); +} + +void DocBookReader::handleTableSeparator() { + handleChar(SPACE); + handleChar(VERTICAL_LINE); + handleChar(SPACE); +} + +void DocBookReader::handleTableEndRow() { + handleParagraphEnd(); +} + +void DocBookReader::handleFootNoteMark() { + //TODO implement +} + +void DocBookReader::handleStartField() { + if (myReadState == READ_FIELD) { //for nested fields + handleEndField(); + } + myReadState = READ_FIELD; + myReadFieldState = READ_FIELD_INFO; + myHyperlinkTypeState = NO_HYPERLINK; +} + +void DocBookReader::handleSeparatorField() { + static const std::string HYPERLINK = "HYPERLINK"; + static const std::string SEQUENCE = "SEQ"; +// static const std::string PAGE = "PAGE"; +// static const std::string PAGEREF = "PAGEREF"; +// static const std::string SHAPE = "SHAPE"; + static const std::string SPACE_DELIMETER = " "; + static const std::string LOCAL_LINK = "\\l"; + static const std::string QUOTE = "\""; + myReadFieldState = READ_FIELD_TEXT; + myHyperlinkTypeState = NO_HYPERLINK; + ZLUnicodeUtil::Ucs2String buffer = myFieldInfoBuffer; + myFieldInfoBuffer.clear(); + std::string utf8String; + ZLUnicodeUtil::ucs2ToUtf8(utf8String, buffer); + ZLUnicodeUtil::utf8Trim(utf8String); + if (utf8String.empty()) { + return; + } + std::vector<std::string> result = ZLStringUtil::split(utf8String, SPACE_DELIMETER); + //TODO split function can returns empty string, maybe fix it + std::vector<std::string> splitted; + for (std::size_t i = 0; i < result.size(); ++i) { + if (!result.at(i).empty()) { + splitted.push_back(result.at(i)); + } + } + + if (!splitted.empty() && splitted.at(0) == SEQUENCE) { + myReadFieldState = READ_FIELD_TEXT; + myHyperlinkTypeState = NO_HYPERLINK; + return; + } + + if (splitted.size() < 2 || splitted.at(0) != HYPERLINK) { + myReadFieldState = DONT_READ_FIELD_TEXT; + //to remove pagination from TOC and not hyperlink fields + return; + } + + if (splitted.at(1) == LOCAL_LINK) { + std::string link = parseLink(buffer); + if (!link.empty()) { + myModelReader.addHyperlinkControl(INTERNAL_HYPERLINK, link); + myHyperlinkTypeState = INT_HYPERLINK_INSERTED; + } + } else { + std::string link = parseLink(buffer, true); + if (!link.empty()) { + myModelReader.addHyperlinkControl(EXTERNAL_HYPERLINK, link); + myHyperlinkTypeState = EXT_HYPERLINK_INSERTED; + } + } +} + +void DocBookReader::handleEndField() { + myFieldInfoBuffer.clear(); + if (myReadState == READ_TEXT) { + return; + } + if (myHyperlinkTypeState == EXT_HYPERLINK_INSERTED) { + myModelReader.addControl(EXTERNAL_HYPERLINK, false); + } else if (myHyperlinkTypeState == INT_HYPERLINK_INSERTED) { + myModelReader.addControl(INTERNAL_HYPERLINK, false); + } + myReadState = READ_TEXT; + myHyperlinkTypeState = NO_HYPERLINK; + +} + +void DocBookReader::handleImage(const ZLFileImage::Blocks &blocks) { + std::string number; + ZLStringUtil::appendNumber(number, myPictureCounter++); + myModelReader.addImageReference(number); + ZLFile file(myModelReader.model().book()->file().path(), ZLMimeType::IMAGE_AUTO); + myModelReader.addImage(number, new ZLFileImage(file, blocks, ZLFileImage::ENCODING_NONE)); +} + +void DocBookReader::handleOtherControlChar(ZLUnicodeUtil::Ucs2Char ucs2char) { + if (ucs2char == WORD_MINUS) { + handleChar(MINUS); + } else if (ucs2char == WORD_SOFT_HYPHEN) { + //skip + } else if (ucs2char == WORD_HORIZONTAL_TAB) { + handleChar(ucs2char); + } else { +// myTextBuffer.clear(); + } +} + +void DocBookReader::handleFontStyle(unsigned int fontStyle) { + if (myReadState == READ_FIELD && myReadFieldState == READ_FIELD_TEXT && myHyperlinkTypeState != NO_HYPERLINK) { + //to fix bug with hyperlink, that's only bold and doesn't looks like hyperlink + return; + } + while (!myKindStack.empty()) { + myModelReader.addControl(myKindStack.back(), false); + myKindStack.pop_back(); + } + if (fontStyle & OleMainStream::CharInfo::FONT_BOLD) { + myKindStack.push_back(BOLD); + } + if (fontStyle & OleMainStream::CharInfo::FONT_ITALIC) { + myKindStack.push_back(ITALIC); + } + for (std::size_t i = 0; i < myKindStack.size(); ++i) { + myModelReader.addControl(myKindStack.at(i), true); + } +} + +void DocBookReader::handleParagraphStyle(const OleMainStream::Style &styleInfo) { + if (styleInfo.HasPageBreakBefore) { + handlePageBreak(); + } + shared_ptr<ZLTextStyleEntry> entry = new ZLTextStyleEntry(ZLTextStyleEntry::STYLE_OTHER_ENTRY); + + switch (styleInfo.Alignment) { + default: // in that case, use default alignment type + break; + case OleMainStream::Style::ALIGNMENT_LEFT: + entry->setAlignmentType(ALIGN_LEFT); + break; + case OleMainStream::Style::ALIGNMENT_RIGHT: + entry->setAlignmentType(ALIGN_RIGHT); + break; + case OleMainStream::Style::ALIGNMENT_CENTER: + entry->setAlignmentType(ALIGN_CENTER); + break; + case OleMainStream::Style::ALIGNMENT_JUSTIFY: + entry->setAlignmentType(ALIGN_JUSTIFY); + break; + } + + //TODO in case, where style is heading, but size is small it works wrong + const ZLTextStyleEntry::SizeUnit unit = ZLTextStyleEntry::SIZE_UNIT_PERCENT; + switch (styleInfo.StyleIdCurrent) { + default: + break; + case OleMainStream::Style::STYLE_H1: + entry->setLength(ZLTextStyleEntry::LENGTH_FONT_SIZE, 140, unit); + break; + case OleMainStream::Style::STYLE_H2: + entry->setLength(ZLTextStyleEntry::LENGTH_FONT_SIZE, 120, unit); + break; + case OleMainStream::Style::STYLE_H3: + entry->setLength(ZLTextStyleEntry::LENGTH_FONT_SIZE, 110, unit); + break; + } + myCurrentStyleEntry = entry; + myModelReader.addStyleEntry(*myCurrentStyleEntry); + + // we should have the same font style, as for the previous paragraph, + // if it has the same StyleIdCurrent + if (myCurrentStyleInfo.StyleIdCurrent != OleMainStream::Style::STYLE_INVALID && + myCurrentStyleInfo.StyleIdCurrent == styleInfo.StyleIdCurrent) { + for (std::size_t i = 0; i < myKindStack.size(); ++i) { + myModelReader.addControl(myKindStack.at(i), true); + } + } else { + myKindStack.clear(); + // fill by the fontstyle, that was got from Stylesheet + handleFontStyle(styleInfo.CurrentCharInfo.FontStyle); + } + myCurrentStyleInfo = styleInfo; +} + +void DocBookReader::handleBookmark(const std::string &name) { + myModelReader.addHyperlinkLabel(name); +} + +std::string DocBookReader::parseLink(ZLUnicodeUtil::Ucs2String s, bool urlencode) { + //TODO add support for HYPERLINK like that: + // [0x13] HYPERLINK "http://site.ru/some text" \t "_blank" [0x14] text [0x15] + //Current implementation search for last QUOTE, so, it reads \t and _blank as part of link + //Last quote searching is need to handle link like that: + // [0x13] HYPERLINK "http://yandex.ru/yandsearch?text='some text' и "some text2"" [0x14] link text [0x15] + + static const ZLUnicodeUtil::Ucs2Char QUOTE = 0x22; + std::size_t i, first = 0; + //TODO maybe functions findFirstOf and findLastOf should be in ZLUnicodeUtil class + for (i = 0; i < s.size(); ++i) { + if (s.at(i) == QUOTE) { + first = i; + break; + } + } + if (i == s.size()) { + return std::string(); + } + std::size_t j, last = 0; + for (j = s.size(); j > 0 ; --j) { + if (s.at(j - 1) == QUOTE) { + last = j - 1; + break; + } + } + if (j == 0 || last == first) { + return std::string(); + } + + ZLUnicodeUtil::Ucs2String link; + for (std::size_t k = first + 1; k < last; ++k) { + ZLUnicodeUtil::Ucs2Char ch = s.at(k); + if (urlencode && ZLUnicodeUtil::isSpace(ch)) { + //TODO maybe implement function for encoding all signs in url, not only spaces and quotes + //TODO maybe add backslash support + link.push_back('%'); + link.push_back('2'); + link.push_back('0'); + } else if (urlencode && ch == QUOTE) { + link.push_back('%'); + link.push_back('2'); + link.push_back('2'); + } else { + link.push_back(ch); + } + } + std::string utf8String; + ZLUnicodeUtil::ucs2ToUtf8(utf8String, link); + return utf8String; +} + +void DocBookReader::footnotesStartHandler() { + handlePageBreak(); +} + +void DocBookReader::ansiDataHandler(const char *buffer, std::size_t len) { + if (myConverter.isNull()) { + // lazy converter initialization + ZLEncodingCollection &collection = ZLEncodingCollection::Instance(); + ZLEncodingConverterInfoPtr info = collection.info(myEncoding); + myConverter = info.isNull() ? collection.defaultConverter() : info->createConverter(); + } + std::string utf8String; + myConverter->convert(utf8String, buffer, buffer + len); + ZLUnicodeUtil::utf8ToUcs2(myBuffer, utf8String); +} + +void DocBookReader::ucs2SymbolHandler(ZLUnicodeUtil::Ucs2Char symbol) { + myBuffer.push_back(symbol); +} |