diff options
Diffstat (limited to 'fbreader/src/formats/rtf/RtfReader.cpp')
-rw-r--r-- | fbreader/src/formats/rtf/RtfReader.cpp | 470 |
1 files changed, 470 insertions, 0 deletions
diff --git a/fbreader/src/formats/rtf/RtfReader.cpp b/fbreader/src/formats/rtf/RtfReader.cpp new file mode 100644 index 0000000..91fea0c --- /dev/null +++ b/fbreader/src/formats/rtf/RtfReader.cpp @@ -0,0 +1,470 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#include <cstdlib> +#include <cctype> + +#include <ZLFile.h> +#include <ZLInputStream.h> + +#include "RtfReader.h" + +std::map<std::string, RtfCommand*> RtfReader::ourKeywordMap; + +static const int rtfStreamBufferSize = 4096; + +RtfReader::RtfReader(const std::string &encoding) : EncodedTextReader(encoding) { + myNextImageMimeType = ZLMimeType::EMPTY; +} + +RtfReader::~RtfReader() { +} + +RtfCommand::~RtfCommand() { +} + +void RtfDummyCommand::run(RtfReader&, int*) const { +} + +void RtfNewParagraphCommand::run(RtfReader &reader, int*) const { + reader.newParagraph(); +} + +RtfFontPropertyCommand::RtfFontPropertyCommand(RtfReader::FontProperty property) : myProperty(property) { +} + +void RtfFontPropertyCommand::run(RtfReader &reader, int *parameter) const { + const bool start = (parameter == 0) || (*parameter != 0); + switch (myProperty) { + case RtfReader::FONT_BOLD: + if (reader.myState.Bold != start) { + reader.myState.Bold = start; + reader.setFontProperty(RtfReader::FONT_BOLD); + } + break; + case RtfReader::FONT_ITALIC: + if (reader.myState.Italic != start) { + reader.myState.Italic = start; + reader.setFontProperty(RtfReader::FONT_ITALIC); + } + break; + case RtfReader::FONT_UNDERLINED: + if (reader.myState.Underlined != start) { + reader.myState.Underlined = start; + reader.setFontProperty(RtfReader::FONT_UNDERLINED); + } + break; + } +} + +RtfAlignmentCommand::RtfAlignmentCommand(ZLTextAlignmentType alignment) : myAlignment(alignment) { +} + +void RtfAlignmentCommand::run(RtfReader &reader, int*) const { + if (reader.myState.Alignment != myAlignment) { + reader.myState.Alignment = myAlignment; + reader.setAlignment(); + } +} + +RtfCharCommand::RtfCharCommand(const std::string &chr) : myChar(chr) { +} + +void RtfCharCommand::run(RtfReader &reader, int*) const { + reader.processCharData(myChar.data(), myChar.length(), false); +} + +RtfDestinationCommand::RtfDestinationCommand(RtfReader::DestinationType destination) : myDestination(destination) { +} + +void RtfDestinationCommand::run(RtfReader &reader, int*) const { + if (reader.myState.Destination == myDestination) { + return; + } + reader.myState.Destination = myDestination; + if (myDestination == RtfReader::DESTINATION_PICTURE) { + reader.myState.ReadDataAsHex = true; + reader.myNextImageMimeType = ZLMimeType::EMPTY; + } + reader.switchDestination(myDestination, true); +} + +void RtfStyleCommand::run(RtfReader &reader, int*) const { + if (reader.myState.Destination == RtfReader::DESTINATION_STYLESHEET) { + //std::cerr << "Add style index: " << val << "\n"; + + //sprintf(style_attributes[0], "%i", val); + } else /*if (myState.Destination == rdsContent)*/ { + //std::cerr << "Set style index: " << val << "\n"; + + //sprintf(style_attributes[0], "%i", val); + } +} + +void RtfCodepageCommand::run(RtfReader &reader, int *parameter) const { + if (parameter != 0) { + reader.setEncoding(*parameter); + } +} + +void RtfSpecialCommand::run(RtfReader &reader, int*) const { + reader.mySpecialMode = true; +} + +RtfPictureCommand::RtfPictureCommand(shared_ptr<ZLMimeType> mimeType) : myMimeType(mimeType) { +} + +void RtfPictureCommand::run(RtfReader &reader, int*) const { + reader.myNextImageMimeType = myMimeType; +} + +void RtfFontResetCommand::run(RtfReader &reader, int*) const { + if (reader.myState.Bold) { + reader.myState.Bold = false; + reader.setFontProperty(RtfReader::FONT_BOLD); + } + if (reader.myState.Italic) { + reader.myState.Italic = false; + reader.setFontProperty(RtfReader::FONT_ITALIC); + } + if (reader.myState.Underlined) { + reader.myState.Underlined = false; + reader.setFontProperty(RtfReader::FONT_UNDERLINED); + } +} + +void RtfReader::addAction(const std::string &tag, RtfCommand *command) { + ourKeywordMap.insert(std::make_pair(tag, command)); +} + +void RtfReader::fillKeywordMap() { + if (ourKeywordMap.empty()) { + addAction("*", new RtfSpecialCommand()); + addAction("ansicpg", new RtfCodepageCommand()); + + static const char *keywordsToSkip[] = {"buptim", "colortbl", "comment", "creatim", "doccomm", "fonttbl", "footer", "footerf", "footerl", "footerr", "ftncn", "ftnsep", "ftnsepc", "header", "headerf", "headerl", "headerr", "keywords", "operator", "printim", "private1", "revtim", "rxe", "subject", "tc", "txe", "xe", 0}; + RtfCommand *skipCommand = new RtfDestinationCommand(RtfReader::DESTINATION_SKIP); + for (const char **i = keywordsToSkip; *i != 0; ++i) { + addAction(*i, skipCommand); + } + addAction("shppict", new RtfDummyCommand()); + addAction("info", new RtfDestinationCommand(RtfReader::DESTINATION_INFO)); + addAction("title", new RtfDestinationCommand(RtfReader::DESTINATION_TITLE)); + addAction("author", new RtfDestinationCommand(RtfReader::DESTINATION_AUTHOR)); + addAction("pict", new RtfDestinationCommand(RtfReader::DESTINATION_PICTURE)); + addAction("stylesheet", new RtfDestinationCommand(RtfReader::DESTINATION_STYLESHEET)); + addAction("footnote", new RtfDestinationCommand(RtfReader::DESTINATION_FOOTNOTE)); + + RtfCommand *newParagraphCommand = new RtfNewParagraphCommand(); + addAction("\n", newParagraphCommand); + addAction("\r", newParagraphCommand); + addAction("par", newParagraphCommand); + + addAction("\x09", new RtfCharCommand("\x09")); + addAction("_", new RtfCharCommand("-")); + addAction("\\", new RtfCharCommand("\\")); + addAction("{", new RtfCharCommand("{")); + addAction("}", new RtfCharCommand("}")); + addAction("bullet", new RtfCharCommand("\xE2\x80\xA2")); // • + addAction("endash", new RtfCharCommand("\xE2\x80\x93")); // – + addAction("emdash", new RtfCharCommand("\xE2\x80\x94")); // — + addAction("~", new RtfCharCommand("\xC0\xA0")); // + addAction("enspace", new RtfCharCommand("\xE2\x80\x82")); //   + addAction("emspace", new RtfCharCommand("\xE2\x80\x83")); //   + addAction("lquote", new RtfCharCommand("\xE2\x80\x98")); // ‘ + addAction("rquote", new RtfCharCommand("\xE2\x80\x99")); // ’ + addAction("ldblquote", new RtfCharCommand("\xE2\x80\x9C")); // “ + addAction("rdblquote", new RtfCharCommand("\xE2\x80\x9D")); // ” + + addAction("jpegblip", new RtfPictureCommand(ZLMimeType::IMAGE_JPEG)); + addAction("pngblip", new RtfPictureCommand(ZLMimeType::IMAGE_PNG)); + + addAction("s", new RtfStyleCommand()); + + addAction("qc", new RtfAlignmentCommand(ALIGN_CENTER)); + addAction("ql", new RtfAlignmentCommand(ALIGN_LEFT)); + addAction("qr", new RtfAlignmentCommand(ALIGN_RIGHT)); + addAction("qj", new RtfAlignmentCommand(ALIGN_JUSTIFY)); + addAction("pard", new RtfAlignmentCommand(ALIGN_UNDEFINED)); + + addAction("b", new RtfFontPropertyCommand(RtfReader::FONT_BOLD)); + addAction("i", new RtfFontPropertyCommand(RtfReader::FONT_ITALIC)); + addAction("u", new RtfFontPropertyCommand(RtfReader::FONT_UNDERLINED)); + addAction("plain", new RtfFontResetCommand()); + } +} + +bool RtfReader::parseDocument() { + enum { + READ_NORMAL_DATA, + READ_BINARY_DATA, + READ_HEX_SYMBOL, + READ_KEYWORD, + READ_KEYWORD_PARAMETER, + READ_END_OF_FILE + } parserState = READ_NORMAL_DATA; + + std::string keyword; + std::string parameterString; + std::string hexString; + int imageStartOffset = -1; + + while (!myIsInterrupted) { + const char *ptr = myStreamBuffer; + const char *end = myStreamBuffer + myStream->read(myStreamBuffer, rtfStreamBufferSize); + if (ptr == end) { + break; + } + const char *dataStart = ptr; + bool readNextChar = true; + while (ptr != end) { + switch (parserState) { + case READ_END_OF_FILE: + if (*ptr != '}' && !std::isspace(*ptr)) { + return false; + } + break; + case READ_BINARY_DATA: + // TODO: optimize + processCharData(ptr, 1); + --myBinaryDataSize; + if (myBinaryDataSize == 0) { + parserState = READ_NORMAL_DATA; + } + break; + case READ_NORMAL_DATA: + switch (*ptr) { + case '{': + if (ptr > dataStart) { + processCharData(dataStart, ptr - dataStart); + } + dataStart = ptr + 1; + myStateStack.push(myState); + myState.ReadDataAsHex = false; + break; + case '}': + { + if (ptr > dataStart) { + processCharData(dataStart, ptr - dataStart); + } + dataStart = ptr + 1; + + if (imageStartOffset >= 0) { + if (ZLMimeType::EMPTY != myNextImageMimeType) { + const int imageSize = myStream->offset() + (ptr - end) - imageStartOffset; + insertImage(myNextImageMimeType, myFileName, imageStartOffset, imageSize); + } + imageStartOffset = -1; + } + + if (myStateStack.empty()) { + parserState = READ_END_OF_FILE; + break; + } + + if (myState.Destination != myStateStack.top().Destination) { + switchDestination(myState.Destination, false); + switchDestination(myStateStack.top().Destination, true); + } + + bool oldItalic = myState.Italic; + bool oldBold = myState.Bold; + bool oldUnderlined = myState.Underlined; + ZLTextAlignmentType oldAlignment = myState.Alignment; + myState = myStateStack.top(); + myStateStack.pop(); + + if (myState.Italic != oldItalic) { + setFontProperty(RtfReader::FONT_ITALIC); + } + if (myState.Bold != oldBold) { + setFontProperty(RtfReader::FONT_BOLD); + } + if (myState.Underlined != oldUnderlined) { + setFontProperty(RtfReader::FONT_UNDERLINED); + } + if (myState.Alignment != oldAlignment) { + setAlignment(); + } + + break; + } + case '\\': + if (ptr > dataStart) { + processCharData(dataStart, ptr - dataStart); + } + dataStart = ptr + 1; + keyword.erase(); + parserState = READ_KEYWORD; + break; + case 0x0d: + case 0x0a: // cr and lf are noise characters... + if (ptr > dataStart) { + processCharData(dataStart, ptr - dataStart); + } + dataStart = ptr + 1; + break; + default: + if (myState.ReadDataAsHex) { + if (imageStartOffset == -1) { + imageStartOffset = myStream->offset() + (ptr - end); + } + } + break; + } + break; + case READ_HEX_SYMBOL: + hexString += *ptr; + if (hexString.size() == 2) { + char ch = std::strtol(hexString.c_str(), 0, 16); + hexString.erase(); + processCharData(&ch, 1); + parserState = READ_NORMAL_DATA; + dataStart = ptr + 1; + } + break; + case READ_KEYWORD: + if (!std::isalpha(*ptr)) { + if ((ptr == dataStart) && (keyword.empty())) { + if (*ptr == '\'') { + parserState = READ_HEX_SYMBOL; + } else { + keyword = *ptr; + processKeyword(keyword); + parserState = READ_NORMAL_DATA; + } + dataStart = ptr + 1; + } else { + keyword.append(dataStart, ptr - dataStart); + if (*ptr == '-' || std::isdigit(*ptr)) { + dataStart = ptr; + parserState = READ_KEYWORD_PARAMETER; + } else { + readNextChar = *ptr == ' '; + processKeyword(keyword); + parserState = READ_NORMAL_DATA; + dataStart = readNextChar ? ptr + 1 : ptr; + } + } + } + break; + case READ_KEYWORD_PARAMETER: + if (!std::isdigit(*ptr)) { + parameterString.append(dataStart, ptr - dataStart); + int parameter = std::atoi(parameterString.c_str()); + parameterString.erase(); + readNextChar = *ptr == ' '; + if ((keyword == "bin") && (parameter > 0)) { + myBinaryDataSize = parameter; + parserState = READ_BINARY_DATA; + } else { + processKeyword(keyword, ¶meter); + parserState = READ_NORMAL_DATA; + } + dataStart = readNextChar ? ptr + 1 : ptr; + } + break; + } + if (readNextChar) { + ++ptr; + } else { + readNextChar = true; + } + } + if (dataStart < end) { + switch (parserState) { + case READ_NORMAL_DATA: + processCharData(dataStart, end - dataStart); + case READ_KEYWORD: + keyword.append(dataStart, end - dataStart); + break; + case READ_KEYWORD_PARAMETER: + parameterString.append(dataStart, end - dataStart); + break; + default: + break; + } + } + } + + return myIsInterrupted || myStateStack.empty(); +} + +void RtfReader::processKeyword(const std::string &keyword, int *parameter) { + const bool wasSpecialMode = mySpecialMode; + mySpecialMode = false; + if (myState.Destination == RtfReader::DESTINATION_SKIP) { + return; + } + + std::map<std::string, RtfCommand*>::const_iterator it = ourKeywordMap.find(keyword); + + if (it == ourKeywordMap.end()) { + if (wasSpecialMode) { + myState.Destination = RtfReader::DESTINATION_SKIP; + } + return; + } + + it->second->run(*this, parameter); +} + +void RtfReader::processCharData(const char *data, std::size_t len, bool convert) { + if (myState.Destination != RtfReader::DESTINATION_SKIP) { + addCharData(data, len, convert); + } +} + +void RtfReader::interrupt() { + myIsInterrupted = true; +} + +bool RtfReader::readDocument(const ZLFile &file) { + myFileName = file.path(); + myStream = file.inputStream(); + if (myStream.isNull() || !myStream->open()) { + return false; + } + + fillKeywordMap(); + + myStreamBuffer = new char[rtfStreamBufferSize]; + + myIsInterrupted = false; + + mySpecialMode = false; + + myState.Alignment = ALIGN_UNDEFINED; + myState.Italic = false; + myState.Bold = false; + myState.Underlined = false; + myState.Destination = RtfReader::DESTINATION_NONE; + myState.ReadDataAsHex = false; + + bool code = parseDocument(); + + while (!myStateStack.empty()) { + myStateStack.pop(); + } + + delete[] myStreamBuffer; + myStream->close(); + + return code; +} |