summaryrefslogtreecommitdiffstats
path: root/fbreader/src/formats/doc/OleMainStream.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'fbreader/src/formats/doc/OleMainStream.cpp')
-rw-r--r--fbreader/src/formats/doc/OleMainStream.cpp1085
1 files changed, 1085 insertions, 0 deletions
diff --git a/fbreader/src/formats/doc/OleMainStream.cpp b/fbreader/src/formats/doc/OleMainStream.cpp
new file mode 100644
index 0000000..fe829e6
--- /dev/null
+++ b/fbreader/src/formats/doc/OleMainStream.cpp
@@ -0,0 +1,1085 @@
+/*
+ * Copyright (C) 2004-2012 Geometer Plus <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+
+#include <string>
+
+#include <ZLLogger.h>
+#include <ZLUnicodeUtil.h>
+
+#include "OleUtil.h"
+#include "OleStorage.h"
+
+#include "DocInlineImageReader.h"
+
+#include "OleMainStream.h"
+
+OleMainStream::Style::Style() :
+ StyleIdCurrent(STYLE_INVALID),
+ StyleIdNext(STYLE_INVALID),
+ HasPageBreakBefore(false),
+ BeforeParagraphIndent(0),
+ AfterParagraphIndent(0),
+ LeftIndent(0),
+ FirstLineIndent(0),
+ RightIndent(0),
+ Alignment(ALIGNMENT_DEFAULT) {
+}
+
+OleMainStream::CharInfo::CharInfo() : FontStyle(FONT_REGULAR), FontSize(20) {
+}
+
+OleMainStream::SectionInfo::SectionInfo() : CharPosition(0), IsNewPage(true) {
+}
+
+OleMainStream::InlineImageInfo::InlineImageInfo() : DataPosition(0) {
+}
+
+OleMainStream::FloatImageInfo::FloatImageInfo() : ShapeId(0) {
+}
+
+OleMainStream::OleMainStream(shared_ptr<OleStorage> storage, OleEntry oleEntry, shared_ptr<ZLInputStream> stream) : OleStream(storage, oleEntry, stream) {
+}
+
+bool OleMainStream::open(bool doReadFormattingData) {
+ if (OleStream::open() == false) {
+ return false;
+ }
+
+ static const std::size_t HEADER_SIZE = 768; //size of data in header of main stream
+ char headerBuffer[HEADER_SIZE];
+ seek(0, true);
+
+ if (read(headerBuffer, HEADER_SIZE) != HEADER_SIZE) {
+ return false;
+ }
+
+ bool result = readFIB(headerBuffer);
+ if (!result) {
+ return false;
+ }
+
+ // determining table stream number
+ unsigned int tableNumber = (OleUtil::getU2Bytes(headerBuffer, 0xA) & 0x0200) ? 1 : 0;
+ std::string tableName = tableNumber == 0 ? "0" : "1";
+ tableName += "Table";
+ OleEntry tableEntry;
+ result = myStorage->getEntryByName(tableName, tableEntry);
+
+ if (!result) {
+ // cant't find table stream (that can be only in case if file format is below Word 7/8), so building simple table stream
+ // TODO: CHECK may be not all old documents have ANSI
+ ZLLogger::Instance().println("DocPlugin", "cant't find table stream, building own simple piece table, that includes all charachters");
+ Piece piece = {myStartOfText, myEndOfText - myStartOfText, true, Piece::PIECE_TEXT, 0};
+ myPieces.push_back(piece);
+ return true;
+ }
+
+ result = readPieceTable(headerBuffer, tableEntry);
+
+ if (!result) {
+ ZLLogger::Instance().println("DocPlugin", "error during reading piece table");
+ return false;
+ }
+
+ if (!doReadFormattingData) {
+ return true;
+ }
+
+ OleEntry dataEntry;
+ if (myStorage->getEntryByName("Data", dataEntry)) {
+ myDataStream = new OleStream(myStorage, dataEntry, myBaseStream);
+ }
+
+ //result of reading following structures doesn't check, because all these
+ //problems can be ignored, and document can be showed anyway, maybe with wrong formatting
+ readBookmarks(headerBuffer, tableEntry);
+ readStylesheet(headerBuffer, tableEntry);
+ //readSectionsInfoTable(headerBuffer, tableEntry); //it isn't used now
+ readParagraphStyleTable(headerBuffer, tableEntry);
+ readCharInfoTable(headerBuffer, tableEntry);
+ readFloatingImages(headerBuffer, tableEntry);
+ return true;
+}
+
+const OleMainStream::Pieces &OleMainStream::getPieces() const {
+ return myPieces;
+}
+
+const OleMainStream::CharInfoList &OleMainStream::getCharInfoList() const {
+ return myCharInfoList;
+}
+
+const OleMainStream::StyleInfoList &OleMainStream::getStyleInfoList() const {
+ return myStyleInfoList;
+}
+
+const OleMainStream::BookmarksList &OleMainStream::getBookmarks() const {
+ return myBookmarks;
+}
+
+const OleMainStream::InlineImageInfoList &OleMainStream::getInlineImageInfoList() const {
+ return myInlineImageInfoList;
+}
+
+const OleMainStream::FloatImageInfoList &OleMainStream::getFloatImageInfoList() const {
+ return myFloatImageInfoList;
+}
+
+ZLFileImage::Blocks OleMainStream::getFloatImage(unsigned int shapeId) const {
+ if (myFLoatImageReader.isNull()) {
+ return ZLFileImage::Blocks();
+ }
+ return myFLoatImageReader->getBlocksForShapeId(shapeId);
+}
+
+ZLFileImage::Blocks OleMainStream::getInlineImage(unsigned int dataPosition) const {
+ if (myDataStream.isNull()) {
+ return ZLFileImage::Blocks();
+ }
+ DocInlineImageReader imageReader(myDataStream);
+ return imageReader.getImagePieceInfo(dataPosition);
+}
+
+bool OleMainStream::readFIB(const char *headerBuffer) {
+ int flags = OleUtil::getU2Bytes(headerBuffer, 0xA); //offset for flags
+
+ if (flags & 0x0004) { //flag for complex format
+ ZLLogger::Instance().println("DocPlugin", "This was fast-saved. Some information is lost");
+ //lostInfo = (flags & 0xF0) >> 4);
+ }
+
+ if (flags & 0x1000) { //flag for using extending charset
+ ZLLogger::Instance().println("DocPlugin", "File uses extended character set (get_word8_char)");
+ } else {
+ ZLLogger::Instance().println("DocPlugin", "File uses get_8bit_char character set");
+ }
+
+ if (flags & 0x100) { //flag for encrypted files
+ ZLLogger::Instance().println("DocPlugin", "File is encrypted");
+ // Encryption key = %08lx ; NumUtil::get4Bytes(header, 14)
+ return false;
+ }
+
+ unsigned int charset = OleUtil::getU2Bytes(headerBuffer, 0x14); //offset for charset number
+ if (charset && charset != 0x100) { //0x100 = default charset
+ ZLLogger::Instance().println("DocPlugin", "Using not default character set %d");
+ } else {
+ ZLLogger::Instance().println("DocPlugin", "Using default character set");
+ }
+
+ myStartOfText = OleUtil::get4Bytes(headerBuffer, 0x18); //offset for start of text value
+ myEndOfText = OleUtil::get4Bytes(headerBuffer, 0x1c); //offset for end of text value
+ return true;
+}
+
+void OleMainStream::splitPieces(const Pieces &s, Pieces &dest1, Pieces &dest2, Piece::PieceType type1, Piece::PieceType type2, int boundary) {
+ Pieces source = s;
+ dest1.clear();
+ dest2.clear();
+
+ int sumLength = 0;
+ std::size_t i = 0;
+ for (i = 0; i < source.size(); ++i) {
+ Piece piece = source.at(i);
+ if (piece.Length + sumLength >= boundary) {
+ Piece piece2 = piece;
+
+ piece.Length = boundary - sumLength;
+ piece.Type = type1;
+
+ piece2.Type = type2;
+ piece2.Offset += piece.Length * 2;
+ piece2.Length -= piece.Length;
+
+ if (piece.Length > 0) {
+ dest1.push_back(piece);
+ }
+ if (piece2.Length > 0) {
+ dest2.push_back(piece2);
+ }
+ ++i;
+ break;
+ }
+ sumLength += piece.Length;
+ piece.Type = type1;
+ dest1.push_back(piece);
+ }
+ for (; i < source.size(); ++i) {
+ Piece piece = source.at(i);
+ piece.Type = type2;
+ dest2.push_back(piece);
+ }
+
+}
+
+std::string OleMainStream::getPiecesTableBuffer(const char *headerBuffer, OleStream &tableStream) {
+ unsigned int clxOffset = OleUtil::getU4Bytes(headerBuffer, 0x01A2); //offset for CLX structure
+ unsigned int clxLength = OleUtil::getU4Bytes(headerBuffer, 0x01A6); //offset for value of CLX structure length
+
+ //1 step : loading CLX table from table stream
+ char *clxBuffer = new char[clxLength];
+ if (!tableStream.seek(clxOffset, true)) {
+ ZLLogger::Instance().println("DocPlugin", "getPiecesTableBuffer -- error for seeking to CLX structure");
+ return std::string();
+ }
+ if (tableStream.read(clxBuffer, clxLength) != clxLength) {
+ ZLLogger::Instance().println("DocPlugin", "getPiecesTableBuffer -- CLX structure length is invalid");
+ return std::string();
+ }
+ std::string clx(clxBuffer, clxLength);
+ delete[] clxBuffer;
+
+ //2 step: searching for pieces table buffer at CLX
+ //(determines it by 0x02 as start symbol)
+ std::size_t from = 0;
+ std::size_t i;
+ std::string pieceTableBuffer;
+ while ((i = clx.find_first_of(0x02, from)) != std::string::npos) {
+ if (clx.size() < i + 1 + 4) {
+ ZLLogger::Instance().println("DocPlugin", "getPiecesTableBuffer -- CLX structure has invalid format");
+ return std::string();
+ }
+ unsigned int pieceTableLength = OleUtil::getU4Bytes(clx.c_str(), i + 1);
+ pieceTableBuffer = std::string(clx, i + 1 + 4);
+ if (pieceTableBuffer.length() != pieceTableLength) {
+ from = i + 1;
+ continue;
+ }
+ break;
+ }
+ return pieceTableBuffer;
+}
+
+
+bool OleMainStream::readPieceTable(const char *headerBuffer, const OleEntry &tableEntry) {
+ OleStream tableStream(myStorage, tableEntry, myBaseStream);
+ std::string piecesTableBuffer = getPiecesTableBuffer(headerBuffer, tableStream);
+
+ if (piecesTableBuffer.empty()) {
+ return false;
+ }
+
+ //getting count of Character Positions for different types of subdocuments in Main Stream
+ int ccpText = OleUtil::get4Bytes(headerBuffer, 0x004C); //text
+ int ccpFtn = OleUtil::get4Bytes(headerBuffer, 0x0050); //footnote subdocument
+ int ccpHdd = OleUtil::get4Bytes(headerBuffer, 0x0054); //header subdocument
+ int ccpMcr = OleUtil::get4Bytes(headerBuffer, 0x0058); //macro subdocument
+ int ccpAtn = OleUtil::get4Bytes(headerBuffer, 0x005C); //comment subdocument
+ int ccpEdn = OleUtil::get4Bytes(headerBuffer, 0x0060); //endnote subdocument
+ int ccpTxbx = OleUtil::get4Bytes(headerBuffer, 0x0064); //textbox subdocument
+ int ccpHdrTxbx = OleUtil::get4Bytes(headerBuffer, 0x0068); //textbox subdocument of the header
+ int lastCP = ccpFtn + ccpHdd + ccpMcr + ccpAtn + ccpEdn + ccpTxbx + ccpHdrTxbx;
+ if (lastCP != 0) {
+ ++lastCP;
+ }
+ lastCP += ccpText;
+
+ //getting the CP (character positions) and CP descriptors
+ std::vector<int> cp; //array of character positions for pieces
+ unsigned int j = 0;
+ for (j = 0; ; j += 4) {
+ if (piecesTableBuffer.size() < j + 4) {
+ ZLLogger::Instance().println("DocPlugin", "invalid piece table, cp ends not with a lastcp");
+ break;
+ }
+ int curCP = OleUtil::get4Bytes(piecesTableBuffer.c_str(), j);
+ cp.push_back(curCP);
+ if (curCP == lastCP) {
+ break;
+ }
+ }
+
+ if (cp.size() < 2) {
+ ZLLogger::Instance().println("DocPlugin", "invalid piece table, < 2 pieces");
+ return false;
+ }
+
+ std::vector<std::string> descriptors;
+ for (std::size_t k = 0; k < cp.size() - 1; ++k) {
+ //j + 4, because it should be taken after CP in PiecesTable Buffer
+ //k * 8, because it should be taken 8 byte for each descriptor
+ std::size_t substrFrom = j + 4 + k * 8;
+ if (piecesTableBuffer.size() < substrFrom + 8) {
+ ZLLogger::Instance().println("DocPlugin", "invalid piece table, problems with descriptors reading");
+ break;
+ }
+ descriptors.push_back(piecesTableBuffer.substr(substrFrom, 8));
+ }
+
+ //filling the Pieces vector
+ std::size_t minValidSize = std::min(cp.size() - 1, descriptors.size());
+ if (minValidSize == 0) {
+ ZLLogger::Instance().println("DocPlugin", "invalid piece table, there are no pieces");
+ return false;
+ }
+
+ for (std::size_t i = 0; i < minValidSize; ++i) {
+ //4byte integer with offset and ANSI flag
+ int fcValue = OleUtil::get4Bytes(descriptors.at(i).c_str(), 0x2); //offset for piece structure
+ Piece piece;
+ piece.IsANSI = (fcValue & 0x40000000) == 0x40000000; //ansi flag
+ piece.Offset = fcValue & 0x3FFFFFFF; //gettting offset for current piece
+ piece.Length = cp.at(i + 1) - cp.at(i);
+ myPieces.push_back(piece);
+ }
+
+ //split pieces into different types
+ Pieces piecesText, piecesFootnote, piecesOther;
+ splitPieces(myPieces, piecesText, piecesFootnote, Piece::PIECE_TEXT, Piece::PIECE_FOOTNOTE, ccpText);
+ splitPieces(piecesFootnote, piecesFootnote, piecesOther, Piece::PIECE_FOOTNOTE, Piece::PIECE_OTHER, ccpFtn);
+
+ myPieces.clear();
+ for (std::size_t i = 0; i < piecesText.size(); ++i) {
+ myPieces.push_back(piecesText.at(i));
+ }
+ for (std::size_t i = 0; i < piecesFootnote.size(); ++i) {
+ myPieces.push_back(piecesFootnote.at(i));
+ }
+ for (std::size_t i = 0; i < piecesOther.size(); ++i) {
+ myPieces.push_back(piecesOther.at(i));
+ }
+
+ //converting length and offset depending on isANSI
+ for (std::size_t i = 0; i < myPieces.size(); ++i) {
+ Piece &piece = myPieces.at(i);
+ if (!piece.IsANSI) {
+ piece.Length *= 2;
+ } else {
+ piece.Offset /= 2;
+ }
+ }
+
+ //filling startCP field
+ unsigned int curStartCP = 0;
+ for (std::size_t i = 0; i < myPieces.size(); ++i) {
+ Piece &piece = myPieces.at(i);
+ piece.startCP = curStartCP;
+ if (piece.IsANSI) {
+ curStartCP += piece.Length;
+ } else {
+ curStartCP += piece.Length / 2;
+ }
+ }
+ return true;
+}
+
+bool OleMainStream::readBookmarks(const char *headerBuffer, const OleEntry &tableEntry) {
+ //SttbfBkmk structure is a table of bookmark name strings
+ unsigned int beginNamesInfo = OleUtil::getU4Bytes(headerBuffer, 0x142); // address of SttbfBkmk structure
+ std::size_t namesInfoLength = (std::size_t)OleUtil::getU4Bytes(headerBuffer, 0x146); // length of SttbfBkmk structure
+
+ if (namesInfoLength == 0) {
+ return true; //there's no bookmarks
+ }
+
+ OleStream tableStream(myStorage, tableEntry, myBaseStream);
+ std::string buffer;
+ if (!readToBuffer(buffer, beginNamesInfo, namesInfoLength, tableStream)) {
+ return false;
+ }
+
+ unsigned int recordsNumber = OleUtil::getU2Bytes(buffer.c_str(), 0x2); //count of records
+
+ std::vector<std::string> names;
+ unsigned int offset = 0x6; //initial offset
+ for (unsigned int i = 0; i < recordsNumber; ++i) {
+ if (buffer.size() < offset + 2) {
+ ZLLogger::Instance().println("DocPlugin", "problmes with reading bookmarks names");
+ break;
+ }
+ unsigned int length = OleUtil::getU2Bytes(buffer.c_str(), offset) * 2; //length of string in bytes
+ ZLUnicodeUtil::Ucs2String name;
+ for (unsigned int j = 0; j < length; j+=2) {
+ char ch1 = buffer.at(offset + 2 + j);
+ char ch2 = buffer.at(offset + 2 + j + 1);
+ ZLUnicodeUtil::Ucs2Char ucs2Char = (unsigned int)ch1 | ((unsigned int)ch2 << 8);
+ name.push_back(ucs2Char);
+ }
+ std::string utf8Name;
+ ZLUnicodeUtil::ucs2ToUtf8(utf8Name, name);
+ names.push_back(utf8Name);
+ offset += length + 2;
+ }
+
+ //plcfBkmkf structure is table recording beginning CPs of bookmarks
+ unsigned int beginCharPosInfo = OleUtil::getU4Bytes(headerBuffer, 0x14A); // address of plcfBkmkf structure
+ std::size_t charPosInfoLen = (std::size_t)OleUtil::getU4Bytes(headerBuffer, 0x14E); // length of plcfBkmkf structure
+
+ if (charPosInfoLen == 0) {
+ return true; //there's no bookmarks
+ }
+
+ if (!readToBuffer(buffer, beginCharPosInfo, charPosInfoLen, tableStream)) {
+ return false;
+ }
+
+ static const unsigned int BKF_SIZE = 4;
+ std::size_t size = calcCountOfPLC(charPosInfoLen, BKF_SIZE);
+ std::vector<unsigned int> charPage;
+ for (std::size_t index = 0, offset = 0; index < size; ++index, offset += 4) {
+ charPage.push_back(OleUtil::getU4Bytes(buffer.c_str(), offset));
+ }
+
+ for (std::size_t i = 0; i < names.size(); ++i) {
+ if (i >= charPage.size()) {
+ break; //for the case if something in these structures goes wrong, to not to lose all bookmarks
+ }
+ Bookmark bookmark;
+ bookmark.CharPosition = charPage.at(i);
+ bookmark.Name = names.at(i);
+ myBookmarks.push_back(bookmark);
+ }
+
+ return true;
+}
+
+bool OleMainStream::readStylesheet(const char *headerBuffer, const OleEntry &tableEntry) {
+ //STSH structure is a stylesheet
+ unsigned int beginStshInfo = OleUtil::getU4Bytes(headerBuffer, 0xa2); // address of STSH structure
+ std::size_t stshInfoLength = (std::size_t)OleUtil::getU4Bytes(headerBuffer, 0xa6); // length of STSH structure
+
+ OleStream tableStream(myStorage, tableEntry, myBaseStream);
+ char *buffer = new char[stshInfoLength];
+ if (!tableStream.seek(beginStshInfo, true)) {
+ ZLLogger::Instance().println("DocPlugin", "problems with reading STSH structure");
+ return false;
+ }
+ if (tableStream.read(buffer, stshInfoLength) != stshInfoLength) {
+ ZLLogger::Instance().println("DocPlugin", "problems with reading STSH structure, invalid length");
+ return false;
+ }
+
+ std::size_t stdCount = (std::size_t)OleUtil::getU2Bytes(buffer, 2);
+ std::size_t stdBaseInFile = (std::size_t)OleUtil::getU2Bytes(buffer, 4);
+ myStyleSheet.resize(stdCount);
+
+ std::vector<bool> isFilled;
+ isFilled.resize(stdCount, false);
+
+ std::size_t stdLen = 0;
+ bool styleSheetWasChanged = false;
+ do { //make it in while loop, because some base style can be after their successors
+ styleSheetWasChanged = false;
+ for (std::size_t index = 0, offset = 2 + (std::size_t)OleUtil::getU2Bytes(buffer, 0); index < stdCount; index++, offset += 2 + stdLen) {
+ stdLen = (std::size_t)OleUtil::getU2Bytes(buffer, offset);
+ if (isFilled.at(index)) {
+ continue;
+ }
+
+ if (stdLen == 0) {
+ //if record is empty, left it default
+ isFilled[index] = true;
+ continue;
+ }
+
+ Style styleInfo = myStyleSheet.at(index);
+
+ const unsigned int styleAndBaseType = OleUtil::getU2Bytes(buffer, offset + 4);
+ const unsigned int styleType = styleAndBaseType % 16;
+ const unsigned int baseStyleId = styleAndBaseType / 16;
+ if (baseStyleId == Style::STYLE_NIL || baseStyleId == Style::STYLE_USER) {
+ //if based on nil or user style, left default
+ } else {
+ int baseStyleIndex = getStyleIndex(baseStyleId, isFilled, myStyleSheet);
+ if (baseStyleIndex < 0) {
+ //this base style is not filled yet, so pass it at some time
+ continue;
+ }
+ styleInfo = myStyleSheet.at(baseStyleIndex);
+ styleInfo.StyleIdCurrent = Style::STYLE_INVALID;
+ }
+
+ // parse STD structure
+ unsigned int tmp = OleUtil::getU2Bytes(buffer, offset + 6);
+ unsigned int upxCount = tmp % 16;
+ styleInfo.StyleIdNext = tmp / 16;
+
+ //adding current style
+ myStyleSheet[index] = styleInfo;
+ isFilled[index] = true;
+ styleSheetWasChanged = true;
+
+ std::size_t pos = 2 + stdBaseInFile;
+ std::size_t nameLen = (std::size_t)OleUtil::getU2Bytes(buffer, offset + pos);
+ nameLen = nameLen * 2 + 2; //from Unicode characters to bytes + Unicode null charachter length
+ pos += 2 + nameLen;
+ if (pos % 2 != 0) {
+ ++pos;
+ }
+ if (pos >= stdLen) {
+ continue;
+ }
+ std::size_t upxLen = (std::size_t)OleUtil::getU2Bytes(buffer, offset + pos);
+ if (pos + upxLen > stdLen) {
+ //UPX length too large
+ continue;
+ }
+ //for style info styleType must be equal 1
+ if (styleType == 1 && upxCount >= 1) {
+ if (upxLen >= 2) {
+ styleInfo.StyleIdCurrent = OleUtil::getU2Bytes(buffer, offset + pos + 2);
+ getStyleInfo(0, buffer + offset + pos + 4, upxLen - 2, styleInfo);
+ myStyleSheet[index] = styleInfo;
+ }
+ pos += 2 + upxLen;
+ if (pos % 2 != 0) {
+ ++pos;
+ }
+ upxLen = (std::size_t)OleUtil::getU2Bytes(buffer, offset + pos);
+ }
+ if (upxLen == 0 || pos + upxLen > stdLen) {
+ //too small/too large
+ continue;
+ }
+ //for char info styleType can be equal 1 or 2
+ if ((styleType == 1 && upxCount >= 2) || (styleType == 2 && upxCount >= 1)) {
+ CharInfo charInfo;
+ getCharInfo(0, Style::STYLE_INVALID, buffer + offset + pos + 2, upxLen, charInfo);
+ styleInfo.CurrentCharInfo = charInfo;
+ myStyleSheet[index] = styleInfo;
+ }
+ }
+ } while (styleSheetWasChanged);
+ delete[] buffer;
+ return true;
+}
+
+bool OleMainStream::readCharInfoTable(const char *headerBuffer, const OleEntry &tableEntry) {
+ //PlcfbteChpx structure is table with formatting for particular run of text
+ unsigned int beginCharInfo = OleUtil::getU4Bytes(headerBuffer, 0xfa); // address of PlcfbteChpx structure
+ std::size_t charInfoLength = (std::size_t)OleUtil::getU4Bytes(headerBuffer, 0xfe); // length of PlcfbteChpx structure
+ if (charInfoLength < 4) {
+ return false;
+ }
+
+ OleStream tableStream(myStorage, tableEntry, myBaseStream);
+ std::string buffer;
+ if (!readToBuffer(buffer, beginCharInfo, charInfoLength, tableStream)) {
+ return false;
+ }
+
+ static const unsigned int CHPX_SIZE = 4;
+ std::size_t size = calcCountOfPLC(charInfoLength, CHPX_SIZE);
+ std::vector<unsigned int> charBlocks;
+ for (std::size_t index = 0, offset = (size + 1) * 4; index < size; ++index, offset += CHPX_SIZE) {
+ charBlocks.push_back(OleUtil::getU4Bytes(buffer.c_str(), offset));
+ }
+
+ char *formatPageBuffer = new char[OleStorage::BBD_BLOCK_SIZE];
+ for (std::size_t index = 0; index < charBlocks.size(); ++index) {
+ seek(charBlocks.at(index) * OleStorage::BBD_BLOCK_SIZE, true);
+ if (read(formatPageBuffer, OleStorage::BBD_BLOCK_SIZE) != OleStorage::BBD_BLOCK_SIZE) {
+ return false;
+ }
+ unsigned int crun = OleUtil::getU1Byte(formatPageBuffer, 0x1ff); //offset with crun (count of 'run of text')
+ for (unsigned int index2 = 0; index2 < crun; ++index2) {
+ unsigned int offset = OleUtil::getU4Bytes(formatPageBuffer, index2 * 4);
+ unsigned int chpxOffset = 2 * OleUtil::getU1Byte(formatPageBuffer, (crun + 1) * 4 + index2);
+ unsigned int len = OleUtil::getU1Byte(formatPageBuffer, chpxOffset);
+ unsigned int charPos = 0;
+ if (!offsetToCharPos(offset, charPos, myPieces)) {
+ continue;
+ }
+ unsigned int styleId = getStyleIdByCharPos(charPos, myStyleInfoList);
+
+ CharInfo charInfo = getStyleFromStylesheet(styleId, myStyleSheet).CurrentCharInfo;
+ if (chpxOffset != 0) {
+ getCharInfo(chpxOffset, styleId, formatPageBuffer + 1, len - 1, charInfo);
+ }
+ myCharInfoList.push_back(CharPosToCharInfo(charPos, charInfo));
+
+ if (chpxOffset != 0) {
+ InlineImageInfo pictureInfo;
+ if (getInlineImageInfo(chpxOffset, formatPageBuffer + 1, len - 1, pictureInfo)) {
+ myInlineImageInfoList.push_back(CharPosToInlineImageInfo(charPos, pictureInfo));
+ }
+ }
+
+ }
+ }
+ delete[] formatPageBuffer;
+ return true;
+}
+
+bool OleMainStream::readFloatingImages(const char *headerBuffer, const OleEntry &tableEntry) {
+ //Plcspa structure is a table with information for FSPA (File Shape Address)
+ unsigned int beginPicturesInfo = OleUtil::getU4Bytes(headerBuffer, 0x01DA); // address of Plcspa structure
+ if (beginPicturesInfo == 0) {
+ return true; //there's no office art objects
+ }
+ unsigned int picturesInfoLength = OleUtil::getU4Bytes(headerBuffer, 0x01DE); // length of Plcspa structure
+ if (picturesInfoLength < 4) {
+ return false;
+ }
+
+ OleStream tableStream(myStorage, tableEntry, myBaseStream);
+ std::string buffer;
+ if (!readToBuffer(buffer, beginPicturesInfo, picturesInfoLength, tableStream)) {
+ return false;
+ }
+
+ static const unsigned int SPA_SIZE = 26;
+ std::size_t size = calcCountOfPLC(picturesInfoLength, SPA_SIZE);
+
+ std::vector<unsigned int> picturesBlocks;
+ for (std::size_t index = 0, tOffset = 0; index < size; ++index, tOffset += 4) {
+ picturesBlocks.push_back(OleUtil::getU4Bytes(buffer.c_str(), tOffset));
+ }
+
+ for (std::size_t index = 0, tOffset = (size + 1) * 4; index < size; ++index, tOffset += SPA_SIZE) {
+ unsigned int spid = OleUtil::getU4Bytes(buffer.c_str(), tOffset);
+ FloatImageInfo info;
+ unsigned int charPos = picturesBlocks.at(index);
+ info.ShapeId = spid;
+ myFloatImageInfoList.push_back(CharPosToFloatImageInfo(charPos, info));
+ }
+
+ //DggInfo structure is office art object table data
+ unsigned int beginOfficeArtContent = OleUtil::getU4Bytes(headerBuffer, 0x22A); // address of DggInfo structure
+ if (beginOfficeArtContent == 0) {
+ return true; //there's no office art objects
+ }
+ unsigned int officeArtContentLength = OleUtil::getU4Bytes(headerBuffer, 0x022E); // length of DggInfo structure
+ if (officeArtContentLength < 4) {
+ return false;
+ }
+
+ shared_ptr<OleStream> newTableStream = new OleStream(myStorage, tableEntry, myBaseStream);
+ shared_ptr<OleStream> newMainStream = new OleStream(myStorage, myOleEntry, myBaseStream);
+ if (newTableStream->open() && newMainStream->open()) {
+ myFLoatImageReader = new DocFloatImageReader(beginOfficeArtContent, officeArtContentLength, newTableStream, newMainStream);
+ myFLoatImageReader->readAll();
+ }
+ return true;
+}
+
+bool OleMainStream::readParagraphStyleTable(const char *headerBuffer, const OleEntry &tableEntry) {
+ //PlcBtePapx structure is table with formatting for all paragraphs
+ unsigned int beginParagraphInfo = OleUtil::getU4Bytes(headerBuffer, 0x102); // address of PlcBtePapx structure
+ std::size_t paragraphInfoLength = (std::size_t)OleUtil::getU4Bytes(headerBuffer, 0x106); // length of PlcBtePapx structure
+ if (paragraphInfoLength < 4) {
+ return false;
+ }
+
+ OleStream tableStream(myStorage, tableEntry, myBaseStream);
+ std::string buffer;
+ if (!readToBuffer(buffer, beginParagraphInfo, paragraphInfoLength, tableStream)) {
+ return false;
+ }
+
+ static const unsigned int PAPX_SIZE = 4;
+ std::size_t size = calcCountOfPLC(paragraphInfoLength, PAPX_SIZE);
+
+ std::vector<unsigned int> paragraphBlocks;
+ for (std::size_t index = 0, tOffset = (size + 1) * 4; index < size; ++index, tOffset += PAPX_SIZE) {
+ paragraphBlocks.push_back(OleUtil::getU4Bytes(buffer.c_str(), tOffset));
+ }
+
+ char *formatPageBuffer = new char[OleStorage::BBD_BLOCK_SIZE];
+ for (std::size_t index = 0; index < paragraphBlocks.size(); ++index) {
+ seek(paragraphBlocks.at(index) * OleStorage::BBD_BLOCK_SIZE, true);
+ if (read(formatPageBuffer, OleStorage::BBD_BLOCK_SIZE) != OleStorage::BBD_BLOCK_SIZE) {
+ return false;
+ }
+ const unsigned int paragraphsCount = OleUtil::getU1Byte(formatPageBuffer, 0x1ff); //offset with 'cpara' value (count of paragraphs)
+ for (unsigned int index2 = 0; index2 < paragraphsCount; ++index2) {
+ const unsigned int offset = OleUtil::getU4Bytes(formatPageBuffer, index2 * 4);
+ unsigned int papxOffset = OleUtil::getU1Byte(formatPageBuffer, (paragraphsCount + 1) * 4 + index2 * 13) * 2;
+ if (papxOffset <= 0) {
+ continue;
+ }
+ unsigned int len = OleUtil::getU1Byte(formatPageBuffer, papxOffset) * 2;
+ if (len == 0) {
+ ++papxOffset;
+ len = OleUtil::getU1Byte(formatPageBuffer, papxOffset) * 2;
+ }
+
+ const unsigned int styleId = OleUtil::getU2Bytes(formatPageBuffer, papxOffset + 1);
+ Style styleInfo = getStyleFromStylesheet(styleId, myStyleSheet);
+
+ if (len >= 3) {
+ getStyleInfo(papxOffset, formatPageBuffer + 3, len - 3, styleInfo);
+ }
+
+ unsigned int charPos = 0;
+ if (!offsetToCharPos(offset, charPos, myPieces)) {
+ continue;
+ }
+ myStyleInfoList.push_back(CharPosToStyle(charPos, styleInfo));
+ }
+ }
+ delete[] formatPageBuffer;
+ return true;
+}
+
+bool OleMainStream::readSectionsInfoTable(const char *headerBuffer, const OleEntry &tableEntry) {
+ //PlcfSed structure is a section table
+ unsigned int beginOfText = OleUtil::getU4Bytes(headerBuffer, 0x18); //address of text's begin in main stream
+ unsigned int beginSectInfo = OleUtil::getU4Bytes(headerBuffer, 0xca); //address if PlcfSed structure
+
+ std::size_t sectInfoLen = (std::size_t)OleUtil::getU4Bytes(headerBuffer, 0xce); //length of PlcfSed structure
+ if (sectInfoLen < 4) {
+ return false;
+ }
+
+ OleStream tableStream(myStorage, tableEntry, myBaseStream);
+ std::string buffer;
+ if (!readToBuffer(buffer, beginSectInfo, sectInfoLen, tableStream)) {
+ return false;
+ }
+
+ static const unsigned int SED_SIZE = 12;
+ std::size_t decriptorsCount = calcCountOfPLC(sectInfoLen, SED_SIZE);
+
+ //saving the section offsets (in character positions)
+ std::vector<unsigned int> charPos;
+ for (std::size_t index = 0, tOffset = 0; index < decriptorsCount; ++index, tOffset += 4) {
+ unsigned int ulTextOffset = OleUtil::getU4Bytes(buffer.c_str(), tOffset);
+ charPos.push_back(beginOfText + ulTextOffset);
+ }
+
+ //saving sepx offsets
+ std::vector<unsigned int> sectPage;
+ for (std::size_t index = 0, tOffset = (decriptorsCount + 1) * 4; index < decriptorsCount; ++index, tOffset += SED_SIZE) {
+ sectPage.push_back(OleUtil::getU4Bytes(buffer.c_str(), tOffset + 2));
+ }
+
+ //reading the section properties
+ char tmpBuffer[2];
+ for (std::size_t index = 0; index < sectPage.size(); ++index) {
+ if (sectPage.at(index) == 0xffffffffUL) { //check for invalid record, to make default section info
+ SectionInfo sectionInfo;
+ sectionInfo.CharPosition = charPos.at(index);
+ mySectionInfoList.push_back(sectionInfo);
+ continue;
+ }
+ //getting number of bytes to read
+ if (!seek(sectPage.at(index), true)) {
+ continue;
+ }
+ if (read(tmpBuffer, 2) != 2) {
+ continue;
+ }
+ std::size_t bytes = 2 + (std::size_t)OleUtil::getU2Bytes(tmpBuffer, 0);
+
+ if (!seek(sectPage.at(index), true)) {
+ continue;
+ }
+ char *formatPageBuffer = new char[bytes];
+ if (read(formatPageBuffer, bytes) != bytes) {
+ delete[] formatPageBuffer;
+ continue;
+ }
+ SectionInfo sectionInfo;
+ sectionInfo.CharPosition = charPos.at(index);
+ getSectionInfo(formatPageBuffer + 2, bytes - 2, sectionInfo);
+ mySectionInfoList.push_back(sectionInfo);
+ delete[] formatPageBuffer;
+ }
+ return true;
+}
+
+void OleMainStream::getStyleInfo(unsigned int papxOffset, const char *grpprlBuffer, unsigned int bytes, Style &styleInfo) {
+ int tmp, toDelete, toAdd;
+ unsigned int offset = 0;
+ while (bytes >= offset + 2) {
+ unsigned int curPrlLength = 0;
+ switch (OleUtil::getU2Bytes(grpprlBuffer, papxOffset + offset)) {
+ case 0x2403:
+ styleInfo.Alignment = (Style::AlignmentType)OleUtil::getU1Byte(grpprlBuffer, papxOffset + offset + 2);
+ break;
+ case 0x4610:
+ styleInfo.LeftIndent += OleUtil::getU2Bytes(grpprlBuffer, papxOffset + offset + 2);
+ if (styleInfo.LeftIndent < 0) {
+ styleInfo.LeftIndent = 0;
+ }
+ break;
+ case 0xc60d: // ChgTabsPapx
+ case 0xc615: // ChgTabs
+ tmp = OleUtil::get1Byte(grpprlBuffer, papxOffset + offset + 2);
+ if (tmp < 2) {
+ curPrlLength = 1;
+ break;
+ }
+ toDelete = OleUtil::getU1Byte(grpprlBuffer, papxOffset + offset + 3);
+ if (tmp < 2 + 2 * toDelete) {
+ curPrlLength = 1;
+ break;
+ }
+ toAdd = OleUtil::getU1Byte(grpprlBuffer, papxOffset + offset + 4 + 2 * toDelete);
+ if (tmp < 2 + 2 * toDelete + 2 * toAdd) {
+ curPrlLength = 1;
+ break;
+ }
+ break;
+ case 0x840e:
+ styleInfo.RightIndent = (int)OleUtil::getU2Bytes(grpprlBuffer, papxOffset + offset + 2);
+ break;
+ case 0x840f:
+ styleInfo.LeftIndent = (int)OleUtil::getU2Bytes(grpprlBuffer, papxOffset + offset + 2);
+ break;
+ case 0x8411:
+ styleInfo.FirstLineIndent = (int)OleUtil::getU2Bytes(grpprlBuffer, papxOffset + offset + 2);
+ break;
+ case 0xa413:
+ styleInfo.BeforeParagraphIndent = OleUtil::getU2Bytes(grpprlBuffer, papxOffset + offset + 2);
+ break;
+ case 0xa414:
+ styleInfo.AfterParagraphIndent = OleUtil::getU2Bytes(grpprlBuffer, papxOffset + offset + 2);
+ break;
+ case 0x2407:
+ styleInfo.HasPageBreakBefore = OleUtil::getU1Byte(grpprlBuffer, papxOffset + offset + 2) == 0x01;
+ break;
+ default:
+ break;
+ }
+ if (curPrlLength == 0) {
+ curPrlLength = getPrlLength(grpprlBuffer, papxOffset + offset);
+ }
+ offset += curPrlLength;
+ }
+
+}
+
+void OleMainStream::getCharInfo(unsigned int chpxOffset, unsigned int /*styleId*/, const char *grpprlBuffer, unsigned int bytes, CharInfo &charInfo) {
+ unsigned int sprm = 0; //single propery modifier
+ unsigned int offset = 0;
+ while (bytes >= offset + 2) {
+ switch (OleUtil::getU2Bytes(grpprlBuffer, chpxOffset + offset)) {
+ case 0x0835: //bold
+ sprm = OleUtil::getU1Byte(grpprlBuffer, chpxOffset + offset + 2);
+ switch (sprm) {
+ case UNSET:
+ charInfo.FontStyle &= ~CharInfo::FONT_BOLD;
+ break;
+ case SET:
+ charInfo.FontStyle |= CharInfo::FONT_BOLD;
+ break;
+ case UNCHANGED:
+ break;
+ case NEGATION:
+ charInfo.FontStyle ^= CharInfo::FONT_BOLD;
+ break;
+ default:
+ break;
+ }
+ break;
+ case 0x0836: //italic
+ sprm = OleUtil::getU1Byte(grpprlBuffer, chpxOffset + offset + 2);
+ switch (sprm) {
+ case UNSET:
+ charInfo.FontStyle &= ~CharInfo::FONT_ITALIC;
+ break;
+ case SET:
+ charInfo.FontStyle |= CharInfo::FONT_ITALIC;
+ break;
+ case UNCHANGED:
+ break;
+ case NEGATION:
+ charInfo.FontStyle ^= CharInfo::FONT_ITALIC;
+ break;
+ default:
+ break;
+ }
+ break;
+ case 0x4a43: //size of font
+ charInfo.FontSize = OleUtil::getU2Bytes(grpprlBuffer, chpxOffset + offset + 2);
+ break;
+ default:
+ break;
+ }
+ offset += getPrlLength(grpprlBuffer, chpxOffset + offset);
+ }
+
+}
+
+void OleMainStream::getSectionInfo(const char *grpprlBuffer, std::size_t bytes, SectionInfo &sectionInfo) {
+ unsigned int tmp;
+ std::size_t offset = 0;
+ while (bytes >= offset + 2) {
+ switch (OleUtil::getU2Bytes(grpprlBuffer, offset)) {
+ case 0x3009: //new page
+ tmp = OleUtil::getU1Byte(grpprlBuffer, offset + 2);
+ sectionInfo.IsNewPage = (tmp != 0 && tmp != 1);
+ break;
+ default:
+ break;
+ }
+ offset += getPrlLength(grpprlBuffer, offset);
+ }
+}
+
+bool OleMainStream::getInlineImageInfo(unsigned int chpxOffset, const char *grpprlBuffer, unsigned int bytes, InlineImageInfo &pictureInfo) {
+ //p. 105 of [MS-DOC] documentation
+ unsigned int offset = 0;
+ bool isFound = false;
+ while (bytes >= offset + 2) {
+ switch (OleUtil::getU2Bytes(grpprlBuffer, chpxOffset + offset)) {
+ case 0x080a: // ole object, p.107 [MS-DOC]
+ if (OleUtil::getU1Byte(grpprlBuffer, chpxOffset + offset + 2) == 0x01) {
+ return false;
+ }
+ break;
+ case 0x0806: // is not a picture, but a binary data? (sprmCFData, p.106 [MS-DOC])
+ if (OleUtil::getU4Bytes(grpprlBuffer, chpxOffset + offset + 2) == 0x01) {
+ return false;
+ }
+ break;
+// case 0x0855: // sprmCFSpec, p.117 [MS-DOC], MUST BE applied with a value of 1 (see p.105 [MS-DOC])
+// if (OleUtil::getU1Byte(grpprlBuffer, chpxOffset + offset + 2) != 0x01) {
+// return false;
+// }
+// break;
+ case 0x6a03: // location p.105 [MS-DOC]
+ pictureInfo.DataPosition = OleUtil::getU4Bytes(grpprlBuffer, chpxOffset + offset + 2);
+ isFound = true;
+ break;
+ default:
+ break;
+ }
+ offset += getPrlLength(grpprlBuffer, chpxOffset + offset);
+ }
+ return isFound;
+}
+
+OleMainStream::Style OleMainStream::getStyleFromStylesheet(unsigned int styleId, const StyleSheet &stylesheet) {
+ //TODO optimize it: StyleSheet can be map structure with styleId key
+ Style style;
+ if (styleId != Style::STYLE_INVALID && styleId != Style::STYLE_NIL && styleId != Style::STYLE_USER) {
+ for (std::size_t index = 0; index < stylesheet.size(); ++index) {
+ if (stylesheet.at(index).StyleIdCurrent == styleId) {
+ return stylesheet.at(index);
+ }
+ }
+ }
+ style.StyleIdCurrent = styleId;
+ return style;
+}
+
+int OleMainStream::getStyleIndex(unsigned int styleId, const std::vector<bool> &isFilled, const StyleSheet &stylesheet) {
+ //TODO optimize it: StyleSheet can be map structure with styleId key
+ //in that case, this method will be excess
+ if (styleId == Style::STYLE_INVALID) {
+ return -1;
+ }
+ for (int index = 0; index < (int)stylesheet.size(); ++index) {
+ if (isFilled.at(index) && stylesheet.at(index).StyleIdCurrent == styleId) {
+ return index;
+ }
+ }
+ return -1;
+}
+
+unsigned int OleMainStream::getStyleIdByCharPos(unsigned int charPos, const StyleInfoList &styleInfoList) {
+ unsigned int styleId = Style::STYLE_INVALID;
+ for (std::size_t i = 0; i < styleInfoList.size(); ++i) {
+ const Style &info = styleInfoList.at(i).second;
+ if (i == styleInfoList.size() - 1) { //if last
+ styleId = info.StyleIdCurrent;
+ break;
+ }
+ unsigned int curOffset = styleInfoList.at(i).first;
+ unsigned int nextOffset = styleInfoList.at(i + 1).first;
+ if (charPos >= curOffset && charPos < nextOffset) {
+ styleId = info.StyleIdCurrent;
+ break;
+ }
+ }
+ return styleId;
+}
+
+bool OleMainStream::offsetToCharPos(unsigned int offset, unsigned int &charPos, const Pieces &pieces) {
+ if (pieces.empty()) {
+ return false;
+ }
+ if ((unsigned int)pieces.front().Offset > offset) {
+ charPos = 0;
+ return true;
+ }
+ if ((unsigned int)(pieces.back().Offset + pieces.back().Length) <= offset) {
+ return false;
+ }
+
+ std::size_t pieceNumber = 0;
+ for (std::size_t i = 0; i < pieces.size(); ++i) {
+ if (i == pieces.size() - 1) { //if last
+ pieceNumber = i;
+ break;
+ }
+ unsigned int curOffset = pieces.at(i).Offset;
+ unsigned int nextOffset = pieces.at(i + 1).Offset;
+ if (offset >= curOffset && offset < nextOffset) {
+ pieceNumber = i;
+ break;
+ }
+ }
+
+ const Piece &piece = pieces.at(pieceNumber);
+ unsigned int diffOffset = offset - piece.Offset;
+ if (!piece.IsANSI) {
+ diffOffset /= 2;
+ }
+ charPos = piece.startCP + diffOffset;
+ return true;
+}
+
+bool OleMainStream::readToBuffer(std::string &result, unsigned int offset, std::size_t length, OleStream &stream) {
+ char *buffer = new char[length];
+ stream.seek(offset, true);
+ if (stream.read(buffer, length) != length) {
+ return false;
+ }
+ result = std::string(buffer, length);
+ delete[] buffer;
+ return true;
+}
+
+unsigned int OleMainStream::calcCountOfPLC(unsigned int totalSize, unsigned int elementSize) {
+ //calculates count of elements in PLC structure, formula from p.30 [MS-DOC]
+ return (totalSize - 4) / (4 + elementSize);
+}
+
+unsigned int OleMainStream::getPrlLength(const char *grpprlBuffer, unsigned int byteNumber) {
+ unsigned int tmp;
+ unsigned int opCode = OleUtil::getU2Bytes(grpprlBuffer, byteNumber);
+ switch (opCode & 0xe000) {
+ case 0x0000:
+ case 0x2000:
+ return 3;
+ case 0x4000:
+ case 0x8000:
+ case 0xA000:
+ return 4;
+ case 0xE000:
+ return 5;
+ case 0x6000:
+ return 6;
+ case 0xC000:
+ //counting of info length
+ tmp = OleUtil::getU1Byte(grpprlBuffer, byteNumber + 2);
+ if (opCode == 0xc615 && tmp == 255) {
+ unsigned int del = OleUtil::getU1Byte(grpprlBuffer, byteNumber + 3);
+ unsigned int add = OleUtil::getU1Byte(grpprlBuffer, byteNumber + 4 + del * 4);
+ tmp = 2 + del * 4 + add * 3;
+ }
+ return 3 + tmp;
+ default:
+ return 1;
+ }
+}