diff options
author | Michele Calgaro <[email protected]> | 2024-06-07 23:30:05 +0900 |
---|---|---|
committer | Michele Calgaro <[email protected]> | 2024-06-07 23:30:05 +0900 |
commit | 17b259df9cb6b28779d4881b2b6c805ee2e48eea (patch) | |
tree | 5ed61937459cb7081089111b0242c01ec178f1f3 /reader/src/formats | |
parent | 1cba8bce178eb2d6719c6f7f21e2c9352c5513a6 (diff) | |
download | tde-ebook-reader-17b259df9cb6b28779d4881b2b6c805ee2e48eea.tar.gz tde-ebook-reader-17b259df9cb6b28779d4881b2b6c805ee2e48eea.zip |
Rename to tde-ebook-reader
Signed-off-by: Michele Calgaro <[email protected]>
Diffstat (limited to 'reader/src/formats')
201 files changed, 23831 insertions, 0 deletions
diff --git a/reader/src/formats/EncodedTextReader.cpp b/reader/src/formats/EncodedTextReader.cpp new file mode 100644 index 0000000..12102c1 --- /dev/null +++ b/reader/src/formats/EncodedTextReader.cpp @@ -0,0 +1,29 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#include "EncodedTextReader.h" + +EncodedTextReader::EncodedTextReader(const std::string &encoding) { + ZLEncodingCollection &collection = ZLEncodingCollection::Instance(); + ZLEncodingConverterInfoPtr info = collection.info(encoding); + myConverter = !info.isNull() ? info->createConverter() : collection.defaultConverter(); +} + +EncodedTextReader::~EncodedTextReader() { +} diff --git a/reader/src/formats/EncodedTextReader.h b/reader/src/formats/EncodedTextReader.h new file mode 100644 index 0000000..8035508 --- /dev/null +++ b/reader/src/formats/EncodedTextReader.h @@ -0,0 +1,37 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#ifndef __ENCODEDTEXTREADER_H__ +#define __ENCODEDTEXTREADER_H__ + +#include <string> + +#include <ZLEncodingConverter.h> + +class EncodedTextReader { + +protected: + EncodedTextReader(const std::string &encoding); + virtual ~EncodedTextReader(); + +protected: + shared_ptr<ZLEncodingConverter> myConverter; +}; + +#endif /* __ENCODEDTEXTREADER_H__ */ diff --git a/reader/src/formats/FormatPlugin.cpp b/reader/src/formats/FormatPlugin.cpp new file mode 100644 index 0000000..059a53b --- /dev/null +++ b/reader/src/formats/FormatPlugin.cpp @@ -0,0 +1,106 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ */ + +#include <ZLInputStream.h> +#include <ZLLanguageDetector.h> +#include <ZLImage.h> + +#include "FormatPlugin.h" + +#include "../library/Book.h" + +bool FormatPlugin::detectEncodingAndLanguage(Book &book, ZLInputStream &stream, bool force) { + std::string language = book.language(); + std::string encoding = book.encoding(); + if (!force && !encoding.empty() && !language.empty()) { + return true; + } + + bool detected = false; + + PluginCollection &collection = PluginCollection::Instance(); + if (language.empty()) { + language = collection.DefaultLanguageOption.value(); + } + if (encoding.empty()) { + encoding = collection.DefaultEncodingOption.value(); + } + if (collection.LanguageAutoDetectOption.value() && stream.open()) { + static const int BUFSIZE = 65536; + char *buffer = new char[BUFSIZE]; + const std::size_t size = stream.read(buffer, BUFSIZE); + stream.close(); + shared_ptr<ZLLanguageDetector::LanguageInfo> info = + ZLLanguageDetector().findInfo(buffer, size); + delete[] buffer; + if (!info.isNull()) { + detected = true; + if (!info->Language.empty()) { + language = info->Language; + } + encoding = info->Encoding; + if (encoding == "US-ASCII" || encoding == "ISO-8859-1") { + encoding = "windows-1252"; + } + } + } + book.setEncoding(encoding); + book.setLanguage(language); + return detected; +} + +bool FormatPlugin::detectLanguage(Book &book, ZLInputStream &stream, const std::string &encoding, bool force) { + std::string language = book.language(); + if (!force && !language.empty()) { + return true; + } + + bool detected = false; + + PluginCollection &collection = PluginCollection::Instance(); + if (language.empty()) { + language = collection.DefaultLanguageOption.value(); + } + if (collection.LanguageAutoDetectOption.value() && stream.open()) { + static const int BUFSIZE = 65536; + char *buffer = new char[BUFSIZE]; + const std::size_t size = stream.read(buffer, BUFSIZE); + stream.close(); + shared_ptr<ZLLanguageDetector::LanguageInfo> info = + 
ZLLanguageDetector().findInfoForEncoding(encoding, buffer, size, -20000); + delete[] buffer; + if (!info.isNull()) { + if (!info->Language.empty()) { + detected = true; + language = info->Language; + } + } + } + book.setLanguage(language); + return detected; +} + +const std::string &FormatPlugin::tryOpen(const ZLFile&) const { + static const std::string EMPTY = ""; + return EMPTY; +} + +shared_ptr<const ZLImage> FormatPlugin::coverImage(const ZLFile &file) const { + return 0; +} diff --git a/reader/src/formats/FormatPlugin.h b/reader/src/formats/FormatPlugin.h new file mode 100644 index 0000000..5e1075e --- /dev/null +++ b/reader/src/formats/FormatPlugin.h @@ -0,0 +1,99 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ */ + +#ifndef __FORMATPLUGIN_H__ +#define __FORMATPLUGIN_H__ + +#include <string> +#include <vector> + +#include <shared_ptr.h> +#include <ZLOptions.h> + +class Book; +class BookModel; +class ZLOptionsDialog; +class ZLOptionsDialogTab; +class ZLFile; +class ZLInputStream; +class ZLImage; + +class FormatInfoPage { + +protected: + FormatInfoPage(); + +public: + virtual ~FormatInfoPage(); +}; + +class FormatPlugin { + +protected: + FormatPlugin(); + +public: + virtual ~FormatPlugin(); + + virtual bool providesMetaInfo() const = 0; + virtual bool acceptsFile(const ZLFile &file) const = 0; + virtual FormatInfoPage *createInfoPage(ZLOptionsDialog &dialog, const ZLFile &file); + + virtual const std::string &tryOpen(const ZLFile &file) const; + virtual bool readMetaInfo(Book &book) const = 0; + virtual bool readLanguageAndEncoding(Book &book) const = 0; + virtual bool readModel(BookModel &model) const = 0; + virtual shared_ptr<const ZLImage> coverImage(const ZLFile &file) const; + +protected: + static bool detectEncodingAndLanguage(Book &book, ZLInputStream &stream, bool force = false); + static bool detectLanguage(Book &book, ZLInputStream &stream, const std::string &encoding, bool force = false); +}; + +class PluginCollection { + +public: + ZLBooleanOption LanguageAutoDetectOption; + ZLStringOption DefaultLanguageOption; + ZLStringOption DefaultEncodingOption; + +public: + static PluginCollection &Instance(); + static void deleteInstance(); + +private: + PluginCollection(); + +public: + shared_ptr<FormatPlugin> plugin(const ZLFile &file, bool strong); + shared_ptr<FormatPlugin> plugin(const Book &book); + +private: + static PluginCollection *ourInstance; + + std::vector<shared_ptr<FormatPlugin> > myPlugins; +}; + +inline FormatInfoPage::FormatInfoPage() {} +inline FormatInfoPage::~FormatInfoPage() {} +inline FormatPlugin::FormatPlugin() {} +inline FormatPlugin::~FormatPlugin() {} +inline FormatInfoPage *FormatPlugin::createInfoPage(ZLOptionsDialog&, const ZLFile&) { 
return 0; } + +#endif /* __FORMATPLUGIN_H__ */ diff --git a/reader/src/formats/PluginCollection.cpp b/reader/src/formats/PluginCollection.cpp new file mode 100644 index 0000000..d120de1 --- /dev/null +++ b/reader/src/formats/PluginCollection.cpp @@ -0,0 +1,89 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ */ + +#include <ZLibrary.h> +#include <ZLFile.h> + +#include "FormatPlugin.h" + +#include "../library/Book.h" + +#include "fb2/FB2Plugin.h" +//#include "docbook/DocBookPlugin.h" +#include "html/HtmlPlugin.h" +#include "txt/TxtPlugin.h" +#include "pdb/PdbPlugin.h" +#include "tcr/TcrPlugin.h" +#include "oeb/OEBPlugin.h" +#include "chm/CHMPlugin.h" +#include "rtf/RtfPlugin.h" +#include "openreader/OpenReaderPlugin.h" +#include "doc/DocPlugin.h" +//#include "pdf/PdfPlugin.h" + +PluginCollection *PluginCollection::ourInstance = 0; + +PluginCollection &PluginCollection::Instance() { + if (ourInstance == 0) { + ourInstance = new PluginCollection(); + ourInstance->myPlugins.push_back(new FB2Plugin()); + //ourInstance->myPlugins.push_back(new DocBookPlugin()); + ourInstance->myPlugins.push_back(new HtmlPlugin()); + ourInstance->myPlugins.push_back(new TxtPlugin()); + ourInstance->myPlugins.push_back(new PluckerPlugin()); + ourInstance->myPlugins.push_back(new PalmDocPlugin()); + ourInstance->myPlugins.push_back(new MobipocketPlugin()); + ourInstance->myPlugins.push_back(new EReaderPlugin()); + ourInstance->myPlugins.push_back(new ZTXTPlugin()); + ourInstance->myPlugins.push_back(new TcrPlugin()); + ourInstance->myPlugins.push_back(new CHMPlugin()); + ourInstance->myPlugins.push_back(new OEBPlugin()); + ourInstance->myPlugins.push_back(new RtfPlugin()); + ourInstance->myPlugins.push_back(new OpenReaderPlugin()); + ourInstance->myPlugins.push_back(new DocPlugin()); + //ourInstance->myPlugins.push_back(new PdfPlugin()); + } + return *ourInstance; +} + +void PluginCollection::deleteInstance() { + if (ourInstance != 0) { + delete ourInstance; + ourInstance = 0; + } +} + +PluginCollection::PluginCollection() : + LanguageAutoDetectOption(ZLCategoryKey::CONFIG, "Format", "AutoDetect", true), + DefaultLanguageOption(ZLCategoryKey::CONFIG, "Format", "DefaultLanguageS", ZLibrary::Language()), + DefaultEncodingOption(ZLCategoryKey::CONFIG, "Format", "DefaultEncoding", "UTF-8") { +} + 
+shared_ptr<FormatPlugin> PluginCollection::plugin(const Book &book) { + return plugin(book.file(), false); +} + +shared_ptr<FormatPlugin> PluginCollection::plugin(const ZLFile &file, bool strong) { + for (std::vector<shared_ptr<FormatPlugin> >::const_iterator it = myPlugins.begin(); it != myPlugins.end(); ++it) { + if ((!strong || (*it)->providesMetaInfo()) && (*it)->acceptsFile(file)) { + return *it; + } + } + return 0; +} diff --git a/reader/src/formats/chm/BitStream.cpp b/reader/src/formats/chm/BitStream.cpp new file mode 100644 index 0000000..bf6c642 --- /dev/null +++ b/reader/src/formats/chm/BitStream.cpp @@ -0,0 +1,44 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ */ + +#include <cstring> + +#include "BitStream.h" + +const int BitStream::BufferSize = sizeof(unsigned int) * 8; + +unsigned int BitStream::get4BytesDirect() { + if (myByteStream + 4 > myByteStreamEnd) { + return 0; + } + unsigned int bytes = *myByteStream++ << 24; + bytes += *myByteStream++ << 16; + bytes += *myByteStream++ << 8; + bytes += *myByteStream++; + return bytes; +} + +bool BitStream::getBytesDirect(unsigned char *buffer, unsigned int length) { + if (myByteStream + length > myByteStreamEnd) { + return false; + } + std::memcpy(buffer, myByteStream, length); + myByteStream += length; + return true; +} diff --git a/reader/src/formats/chm/BitStream.h b/reader/src/formats/chm/BitStream.h new file mode 100644 index 0000000..80c1e25 --- /dev/null +++ b/reader/src/formats/chm/BitStream.h @@ -0,0 +1,111 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
/*
 * BitStream.h: bit-level reader over an in-memory byte string.
 */

#ifndef __BITSTREAM_H__
#define __BITSTREAM_H__

#include <string>

// Reads bit fields from a byte string. Input is consumed in 16-bit
// little-endian words that are stacked into a bit buffer from the most
// significant end; bits are handed out from the top of that buffer.
//
// The stream does not copy the data passed to setData(); the string must
// outlive all reads.
class BitStream {

public:
	// Width of the bit buffer in bits (defined in BitStream.cpp).
	static const int BufferSize;

public:
	BitStream();
	void setData(const std::string &data);
	void reset();
	unsigned int peek(unsigned char length);
	void remove(unsigned char length);
	unsigned int get(unsigned char length);
	unsigned int bytesLeft() const;

	unsigned int get4BytesDirect();
	bool getBytesDirect(unsigned char *buffer, unsigned int length);

private:
	bool ensure(unsigned char length);

private:
	unsigned int myBuffer;
	unsigned char myBitCounter;
	const unsigned char *myByteStream;
	const unsigned char *myByteStreamEnd;

private:
	// Non-copyable.
	BitStream(const BitStream&);
	const BitStream &operator = (const BitStream&);
};

// Starts with no data attached. The pointers are nulled so bytesLeft()
// is 0, and ensure() never dereferences them, before the first setData().
inline BitStream::BitStream() : myBuffer(0), myBitCounter(0), myByteStream(0), myByteStreamEnd(0) {
}

// Attaches the stream to `data` and clears the bit buffer.
inline void BitStream::setData(const std::string &data) {
	myByteStream = (const unsigned char*)data.data();
	myByteStreamEnd = myByteStream + data.length();
	myBuffer = 0;
	myBitCounter = 0;
}

// Drops the buffered bits, first moving the byte position back by the
// number of whole bytes that were still buffered.
inline void BitStream::reset() {
	myByteStream -= myBitCounter / 8;
	myBuffer = 0;
	myBitCounter = 0;
}

// Refills the buffer, 16 bits at a time, until it holds at least `length`
// bits or fewer than two bytes are left. Returns whether `length` bits
// are available.
inline bool BitStream::ensure(unsigned char length) {
	while ((myBitCounter < length) && (bytesLeft() >= 2)) {
		// Build the word as unsigned so the shift below never moves a
		// set bit into the sign bit of an int.
		const unsigned int word =
			(static_cast<unsigned int>(myByteStream[1]) << 8) | myByteStream[0];
		myBuffer |= word << (BitStream::BufferSize - 16 - myBitCounter);
		myBitCounter += 16;
		myByteStream += 2;
	}
	return myBitCounter >= length;
}

// Returns the top `length` bits of the buffer without consuming them
// (0 for length == 0).
inline unsigned int BitStream::peek(unsigned char length) {
	ensure(length);
	return (length > 0) ? (myBuffer >> (BufferSize - length)) : 0;
}

// Consumes `length` bits; does nothing if that many are not available.
inline void BitStream::remove(unsigned char length) {
	if (ensure(length)) {
		myBuffer <<= length;
		myBitCounter -= length;
	}
}

// Reads and consumes `length` bits. Requests wider than 16 bits are split
// into a high part of (length - 16) bits followed by a 16-bit low part.
inline unsigned int BitStream::get(unsigned char length) {
	unsigned int bits;
	if (length > 16) {
		bits = peek(length - 16) << 16;
		remove(length - 16);
		bits += peek(16);
		remove(16);
	} else {
		bits = peek(length);
		remove(length);
	}
	return bits;
}

// Number of bytes not yet moved into the bit buffer.
inline unsigned int BitStream::bytesLeft() const {
	return myByteStreamEnd - myByteStream;
}

#endif /* __BITSTREAM_H__ */

// diff --git a/reader/src/formats/chm/CHMFile.cpp b/reader/src/formats/chm/CHMFile.cpp new file mode 100644 index 0000000..8c62bca --- /dev/null +++ b/reader/src/formats/chm/CHMFile.cpp @@ -0,0 +1,490 @@
/*
 * Copyright (C) 2004-2012 Geometer Plus <[email protected]>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA.
 */
+ */ + +#include <cstring> + +#include <ZLFile.h> +#include <ZLStringUtil.h> +#include <ZLUnicodeUtil.h> +#include <ZLInputStream.h> + +#include "CHMFile.h" +#include "CHMReferenceCollection.h" + +#include "LZXDecompressor.h" + +static std::string readString(ZLInputStream &stream, std::size_t length) { + std::string string(length, ' '); + stream.read(const_cast<char*>(string.data()), length); + return string; +} + +static unsigned short readUnsignedWord(ZLInputStream &stream) { + unsigned char buffer[2]; + stream.read((char*)buffer, 2); + unsigned short result = buffer[1]; + result = result << 8; + result += buffer[0]; + return result; +} + +static unsigned long readUnsignedDWord(ZLInputStream &stream) { + unsigned long lowPart = readUnsignedWord(stream); + unsigned long highPart = readUnsignedWord(stream); + return (highPart << 16) + lowPart; +} + +static unsigned long long readUnsignedQWord(ZLInputStream &stream) { + unsigned long long lowPart = readUnsignedDWord(stream); + unsigned long long highPart = readUnsignedDWord(stream); + return (highPart << 32) + lowPart; +} + +static unsigned long long readEncodedInteger(ZLInputStream &stream) { + unsigned long long result = 0; + char part; + do { + result = result << 7; + stream.read(&part, 1); + result += part & 0x7F; + } while (part & -0x80); + return result; +} + +CHMInputStream::CHMInputStream(shared_ptr<ZLInputStream> base, const CHMFileInfo::SectionInfo §ionInfo, std::size_t offset, std::size_t size) : myBase(base), mySectionInfo(sectionInfo), mySize(size) { + myBaseStartIndex = offset / 0x8000; + myBaseStartIndex -= myBaseStartIndex % sectionInfo.ResetInterval; + myBytesToSkip = offset - myBaseStartIndex * 0x8000; + myOutData = new unsigned char[0x8000]; +} + +CHMInputStream::~CHMInputStream() { + close(); + delete[] myOutData; +} + +bool CHMInputStream::open() { + myOffset = 0; + myDoSkip = true; + myBaseIndex = myBaseStartIndex; + if (myDecompressor.isNull()) { + myDecompressor = new 
LZXDecompressor(mySectionInfo.WindowSizeIndex); + } else { + myDecompressor->reset(); + } + myOutDataOffset = 0; + myOutDataLength = 0; + return true; +} + +std::size_t CHMInputStream::read(char *buffer, std::size_t maxSize) { + if (myDoSkip) { + do_read(0, myBytesToSkip); + myDoSkip = false; + } + std::size_t realSize = do_read(buffer, std::min(maxSize, mySize - myOffset)); + myOffset += realSize; + return realSize; +} + +std::size_t CHMInputStream::do_read(char *buffer, std::size_t maxSize) { + std::size_t realSize = 0; + do { + if (myOutDataLength == 0) { + if (myBaseIndex >= mySectionInfo.ResetTable.size()) { + break; + } + const bool isTail = myBaseIndex + 1 == mySectionInfo.ResetTable.size(); + const std::size_t start = mySectionInfo.ResetTable[myBaseIndex]; + const std::size_t end = isTail ? mySectionInfo.CompressedSize : mySectionInfo.ResetTable[myBaseIndex + 1]; + myOutDataLength = isTail ? mySectionInfo.UncompressedSize % 0x8000 : 0x8000; + myOutDataOffset = 0; + + myInData.erase(); + myInData.append(end - start, '\0'); + myBase->seek(mySectionInfo.Offset + start, true); + myBase->read((char*)myInData.data(), myInData.length()); + if (myBaseIndex % mySectionInfo.ResetInterval == 0) { + myDecompressor->reset(); + } + ++myBaseIndex; + + if (!myDecompressor->decompress(myInData, myOutData, myOutDataLength)) { + break; + } + } + const std::size_t partSize = std::min(myOutDataLength, maxSize); + if (buffer != 0) { + std::memcpy(buffer + realSize, myOutData + myOutDataOffset, partSize); + } + maxSize -= partSize; + realSize += partSize; + myOutDataLength -= partSize; + myOutDataOffset += partSize; + } while (maxSize != 0); + return realSize; +} + +void CHMInputStream::close() { + myDecompressor = 0; +} + +void CHMInputStream::seek(int offset, bool absoluteOffset) { + if (absoluteOffset) { + offset -= myOffset; + } + if (offset > 0) { + read(0, offset); + } else if (offset < 0) { + open(); + read(0, std::max(offset + (int)myOffset, 0)); + } +} + +std::size_t 
CHMInputStream::offset() const { + return myOffset; +} + +std::size_t CHMInputStream::sizeOfOpened() { + return mySize; +} + +shared_ptr<ZLInputStream> CHMFileInfo::entryStream(shared_ptr<ZLInputStream> base, const std::string &name) const { + RecordMap::const_iterator it = myRecords.find(ZLUnicodeUtil::toLower(name)); + if (it == myRecords.end()) { + return 0; + } + const RecordInfo &recordInfo = it->second; + if (recordInfo.Length == 0) { + return 0; + } + if (recordInfo.Section == 0) { + // TODO: implement + return 0; + } + if (recordInfo.Section > mySectionInfos.size()) { + return 0; + } + const SectionInfo §ionInfo = mySectionInfos[recordInfo.Section - 1]; + if (recordInfo.Offset + recordInfo.Length > sectionInfo.UncompressedSize) { + return 0; + } + + return new CHMInputStream(base, sectionInfo, recordInfo.Offset, recordInfo.Length); +} + +CHMFileInfo::CHMFileInfo(const ZLFile &file) : myFilePath(file.path()) { +} + +bool CHMFileInfo::moveToEntry(ZLInputStream &stream, const std::string &entryName) { + RecordMap::const_iterator it = myRecords.find(entryName); + if (it == myRecords.end()) { + return false; + } + RecordInfo recordInfo = it->second; + if (recordInfo.Section > mySectionInfos.size()) { + return false; + } + if (recordInfo.Section != 0) { + // TODO: ??? 
+ return false; + } + + stream.seek(mySection0Offset + recordInfo.Offset, true); + return true; +} + +bool CHMFileInfo::init(ZLInputStream &stream) { + { + // header start + if (readString(stream, 4) != "ITSF") { + return false; + } + + unsigned long version = readUnsignedDWord(stream); + + // DWORD total length + // DWORD unknown + // DWORD timestamp + // DWORD language id + // 0x10 bytes 1st GUID + // 0x10 bytes 2nd GUID + // QWORD section 0 offset + // QWORD section 0 length + stream.seek(4 * 4 + 2 * 0x10 + 2 * 8, false); + + unsigned long long sectionOffset1 = readUnsignedQWord(stream); + unsigned long long sectionLength1 = readUnsignedQWord(stream); + mySection0Offset = sectionOffset1 + sectionLength1; + // header end + + // additional header data start + if (version > 2) { + mySection0Offset = readUnsignedQWord(stream); + } + // additional header data end + + stream.seek(sectionOffset1, true); + // header section 1 start + // directory header start + if (readString(stream, 4) != "ITSP") { + return false; + } + + // DWORD version + // DWORD length + // DWORD 0x000A + // DWORD chunk size + // DWORD density + // DWORD depth + // DWORD root chunk number + // DWORD first chunk number + // DWORD last chunk number + // DWORD -1 + stream.seek(10 * 4, false); + unsigned long dirChunkNumber = readUnsignedDWord(stream); + // ... 
+ stream.seek(36, false); + // header section 1 end + + std::size_t nextOffset = stream.offset(); + for (unsigned long i = 0; i < dirChunkNumber; ++i) { + nextOffset += 4096; + std::string header = readString(stream, 4); + if (header == "PMGL") { + unsigned long quickRefAreaSize = readUnsignedDWord(stream) % 4096; + stream.seek(12, false); + std::size_t startOffset = stream.offset(); + std::size_t oldOffset = startOffset; + while (startOffset < nextOffset - quickRefAreaSize) { + int nameLength = readEncodedInteger(stream); + std::string name = readString(stream, nameLength); + int contentSection = readEncodedInteger(stream); + int offset = readEncodedInteger(stream); + int length = readEncodedInteger(stream); + if (name.substr(0, 2) != "::") { + name = ZLUnicodeUtil::toLower(name); + } + myRecords.insert( + std::make_pair( + name, + CHMFileInfo::RecordInfo(contentSection, offset, length) + ) + ); + startOffset = stream.offset(); + if (oldOffset == startOffset) { + break; + } + oldOffset = startOffset; + } + } else if (header == "PMGI") { + unsigned long quickRefAreaSize = readUnsignedDWord(stream); + std::size_t startOffset = stream.offset(); + std::size_t oldOffset = startOffset; + while (startOffset < nextOffset - quickRefAreaSize) { + int nameLength = readEncodedInteger(stream); + std::string name = readString(stream, nameLength); + // chunk number + readEncodedInteger(stream); + startOffset = stream.offset(); + if (oldOffset == startOffset) { + break; + } + oldOffset = startOffset; + } + } + stream.seek(nextOffset, true); + if (stream.offset() != nextOffset) { + break; + } + } + } + + { + if (!moveToEntry(stream, "::DataSpace/NameList")) { + return false; + } + stream.seek(2, false); + const int sectionNumber = readUnsignedWord(stream); + for (int i = 0; i < sectionNumber; ++i) { + const int length = readUnsignedWord(stream); + std::string sectionName; + sectionName.reserve(length); + for (int j = 0; j < length; ++j) { + sectionName += 
(char)readUnsignedWord(stream); + } + stream.seek(2, false); + mySectionNames.push_back(sectionName); + } + } + + { + for (unsigned int i = 1; i < mySectionNames.size(); ++i) { + RecordMap::const_iterator it = + myRecords.find("::DataSpace/Storage/" + mySectionNames[i] + "/Content"); + if (it == myRecords.end()) { + return false; + } + RecordInfo recordInfo = it->second; + if (recordInfo.Section != 0) { + return false; + } + mySectionInfos.push_back(SectionInfo()); + SectionInfo &info = mySectionInfos.back(); + info.Offset = mySection0Offset + recordInfo.Offset; + info.Length = recordInfo.Length; + + if (!moveToEntry(stream, "::DataSpace/Storage/" + mySectionNames[i] + "/ControlData")) { + return false; + } + stream.seek(4, false); + std::string lzxc = readString(stream, 4); + if (lzxc != "LZXC") { + return false; + } + const int version = readUnsignedDWord(stream); + if ((version <= 0) || (version > 2)) { + return false; + } + info.ResetInterval = readUnsignedDWord(stream); + if (version == 1) { + info.ResetInterval /= 0x8000; + } + info.WindowSizeIndex = (version == 1) ? 
0 : 15; + { + int ws = readUnsignedDWord(stream); + if (ws > 0) { + while ((ws & 1) == 0) { + ws >>= 1; + info.WindowSizeIndex++; + } + } + } + + if (!moveToEntry(stream, "::DataSpace/Storage/" + mySectionNames[i] + "/Transform/{7FC28940-9D31-11D0-9B27-00A0C91E9C7C}/InstanceData/ResetTable")) { + return false; + } + stream.seek(4, false); + const std::size_t entriesNumber = readUnsignedDWord(stream); + if (entriesNumber == 0) { + return false; + } + if (entriesNumber > 2048) { + // file size is greater than 60 Mb + return false; + } + info.ResetTable.reserve(entriesNumber); + stream.seek(8, false); + info.UncompressedSize = readUnsignedQWord(stream); + if ((info.UncompressedSize - 1) / 0x8000 != entriesNumber - 1) { + return false; + } + info.CompressedSize = readUnsignedQWord(stream); + stream.seek(8, false); + std::size_t previous = 0; + for (std::size_t j = 0; j < entriesNumber; ++j) { + std::size_t value = readUnsignedQWord(stream); + if ((j > 0) == (value <= previous)) { + return false; + } + info.ResetTable.push_back(value); + previous = value; + } + } + } + + return true; +} + +static std::string readNTString(ZLInputStream &stream) { + std::string s; + char c; + while (stream.read(&c, 1) == 1) { + if (c == '\0') { + break; + } else { + s += c; + } + } + return CHMReferenceCollection::fullReference("/", s); +} + +bool CHMFileInfo::FileNames::empty() const { + return Start.empty() && TOC.empty() && Home.empty() && Index.empty(); +} + +CHMFileInfo::FileNames CHMFileInfo::sectionNames(shared_ptr<ZLInputStream> base) const { + FileNames names; + shared_ptr<ZLInputStream> stringsStream = entryStream(base, "/#STRINGS"); + if (!stringsStream.isNull() && stringsStream->open()) { + std::vector<std::string> fileNames; + int tocIndex = -1; + int indexIndex = -1; + for (int i = 0; i < 12; ++i) { + std::string argument = readNTString(*stringsStream); + if (argument.empty() || (argument[argument.length() - 1] == '/')) { + continue; + } + if (myRecords.find(argument) == 
myRecords.end()) { + continue; + } + if ((tocIndex == -1) && ZLStringUtil::stringEndsWith(argument, ".hhc")) { + tocIndex = fileNames.size(); + names.TOC = argument; + } else if ((indexIndex == -1) && ZLStringUtil::stringEndsWith(argument, ".hhk")) { + indexIndex = fileNames.size(); + names.Index = argument; + } + fileNames.push_back(argument); + } + std::size_t startIndex = std::max(3, std::max(tocIndex, indexIndex) + 1); + if (startIndex < 11) { + if (startIndex < fileNames.size()) { + names.Start = fileNames[startIndex]; + } + if (startIndex + 1 < fileNames.size()) { + names.Home = fileNames[startIndex + 1]; + } + } + stringsStream->close(); + } + if (names.TOC.empty()) { + for (RecordMap::const_iterator it = myRecords.begin(); it != myRecords.end(); ++it) { + if (ZLStringUtil::stringEndsWith(it->first, ".hhc")) { + names.TOC = it->first; + break; + } + } + } + if (names.empty()) { + for (RecordMap::const_iterator it = myRecords.begin(); it != myRecords.end(); ++it) { + if ((ZLStringUtil::stringEndsWith(it->first, ".htm")) || + (ZLStringUtil::stringEndsWith(it->first, ".html"))) { + names.Start = it->first; + break; + } + } + } + + return names; +} + +const std::string CHMFileInfo::filePath() const { + return myFilePath; +} diff --git a/reader/src/formats/chm/CHMFile.h b/reader/src/formats/chm/CHMFile.h new file mode 100644 index 0000000..d98bd84 --- /dev/null +++ b/reader/src/formats/chm/CHMFile.h @@ -0,0 +1,128 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#ifndef __CHMFILE_H__ +#define __CHMFILE_H__ + +#include <string> +#include <map> +#include <vector> + +#include <shared_ptr.h> +#include <ZLInputStream.h> + +class ZLFile; + +class LZXDecompressor; + +class CHMFileInfo { + +public: + struct FileNames { + std::string TOC; + std::string Index; + std::string Start; + std::string Home; + + bool empty() const; + }; + +public: + CHMFileInfo(const ZLFile &file); + bool init(ZLInputStream &stream); + // We assume that base exists and is already open + shared_ptr<ZLInputStream> entryStream(shared_ptr<ZLInputStream> base, const std::string &name) const; + // We assume that base exists and is already open + FileNames sectionNames(shared_ptr<ZLInputStream> base) const; + const std::string filePath() const; + +private: + bool moveToEntry(ZLInputStream &stream, const std::string &entryName); + +private: + unsigned long long mySection0Offset; + + struct RecordInfo { + RecordInfo(int section, int offset, int length) : Section(section), Offset(offset), Length(length) {} + std::size_t Section; + std::size_t Offset; + std::size_t Length; + }; + + typedef std::map<std::string,RecordInfo> RecordMap; + RecordMap myRecords; + std::vector<std::string> mySectionNames; + + struct SectionInfo { + std::size_t WindowSizeIndex; + std::size_t ResetInterval; + std::size_t Offset; + std::size_t Length; + std::size_t CompressedSize; + std::size_t UncompressedSize; + std::vector<std::size_t> ResetTable; + }; + std::vector<SectionInfo> mySectionInfos; + + const std::string myFilePath; + +private: + CHMFileInfo(const CHMFileInfo&); + const CHMFileInfo &operator = (const CHMFileInfo&); + +friend class CHMInputStream; +}; + +class CHMInputStream : public ZLInputStream 
{ + +public: + CHMInputStream(shared_ptr<ZLInputStream> base, const CHMFileInfo::SectionInfo §ionInfo, std::size_t offset, std::size_t size); + ~CHMInputStream(); + + bool open(); + std::size_t read(char *buffer, std::size_t maxSize); + void close(); + + void seek(int offset, bool absoluteOffset); + std::size_t offset() const; + std::size_t sizeOfOpened(); + +private: + std::size_t do_read(char *buffer, std::size_t maxSize); + +private: + shared_ptr<ZLInputStream> myBase; + const CHMFileInfo::SectionInfo mySectionInfo; + std::size_t myBaseStartIndex; + std::size_t myBaseIndex; + std::size_t myBytesToSkip; + const std::size_t mySize; + + std::size_t myOffset; + bool myDoSkip; + + shared_ptr<LZXDecompressor> myDecompressor; + std::string myInData; + + unsigned char *myOutData; + std::size_t myOutDataOffset; + std::size_t myOutDataLength; +}; + +#endif /* __CHMFILE_H__ */ diff --git a/reader/src/formats/chm/CHMFileImage.cpp b/reader/src/formats/chm/CHMFileImage.cpp new file mode 100644 index 0000000..a2b58f0 --- /dev/null +++ b/reader/src/formats/chm/CHMFileImage.cpp @@ -0,0 +1,33 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ */ + +#include <ZLFile.h> + +#include "CHMFileImage.h" + +CHMFileImage::CHMFileImage(shared_ptr<CHMFileInfo> info, const std::string &entry) : ZLStreamImage(ZLMimeType::IMAGE_AUTO, 0, 0), myInfo(info), myEntry(entry) { +} + +shared_ptr<ZLInputStream> CHMFileImage::inputStream() const { + shared_ptr<ZLInputStream> baseStream = ZLFile(myInfo->filePath()).inputStream(); + if (baseStream.isNull() || !baseStream->open()) { + return 0; + } + return myInfo->entryStream(baseStream, myEntry); +} diff --git a/reader/src/formats/chm/CHMFileImage.h b/reader/src/formats/chm/CHMFileImage.h new file mode 100644 index 0000000..bacb6aa --- /dev/null +++ b/reader/src/formats/chm/CHMFileImage.h @@ -0,0 +1,40 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
// Image whose data is stored as an entry inside a CHM container.
class CHMFileImage : public ZLStreamImage {

public:
	// info describes the container, entry is the name of the image entry.
	CHMFileImage(shared_ptr<CHMFileInfo> info, const std::string &entry);

private:
	// Returns a stream over the image entry, or a null pointer when the
	// container file cannot be opened.
	shared_ptr<ZLInputStream> inputStream() const;

private:
	shared_ptr<CHMFileInfo> myInfo;
	std::string myEntry;
};
+ */ + +#include <ZLUnicodeUtil.h> +#include <ZLStringUtil.h> +#include <ZLFile.h> +#include <ZLInputStream.h> + +#include "CHMPlugin.h" +#include "CHMFile.h" +#include "CHMFileImage.h" +#include "CHMReferenceCollection.h" +#include "HHCReader.h" +#include "HHCReferenceCollector.h" +#include "../txt/PlainTextFormat.h" +#include "HtmlSectionReader.h" +#include "../util/MergedStream.h" +#include "../html/HtmlReaderStream.h" + +#include "../../bookmodel/BookModel.h" +#include "../../library/Book.h" + +bool CHMPlugin::acceptsFile(const ZLFile &file) const { + return file.extension() == "chm"; +} + +class CHMTextStream : public MergedStream { + +public: + CHMTextStream(CHMFileInfo &chmFile, shared_ptr<ZLInputStream> base); + +private: + void resetToStart(); + shared_ptr<ZLInputStream> nextStream(); + +private: + CHMFileInfo &myCHMFile; + shared_ptr<ZLInputStream> myBase; + std::vector<std::string> myEntryNames; + std::size_t myIndex; +}; + +CHMTextStream::CHMTextStream(CHMFileInfo &chmFile, shared_ptr<ZLInputStream> base) : myCHMFile(chmFile), myBase(base) { +} + +void CHMTextStream::resetToStart() { + myIndex = 0; + + if (!myEntryNames.empty()) { + return; + } + + CHMFileInfo::FileNames names = myCHMFile.sectionNames(myBase); + if (names.empty()) { + return; + } + + CHMReferenceCollection referenceCollection; + + referenceCollection.addReference(names.Start, false); + referenceCollection.addReference(names.Home, false); + + shared_ptr<ZLInputStream> tocStream = myCHMFile.entryStream(myBase, names.TOC); + if (!tocStream.isNull() && tocStream->open()) { + referenceCollection.setPrefix(names.TOC); + HHCReferenceCollector(referenceCollection).readDocument(*tocStream); + } + + while (referenceCollection.containsNonProcessedReferences()) { + myEntryNames.push_back(referenceCollection.nextReference()); + } +} + +shared_ptr<ZLInputStream> CHMTextStream::nextStream() { + while (myIndex < myEntryNames.size()) { + shared_ptr<ZLInputStream> stream = myCHMFile.entryStream(myBase, 
myEntryNames[myIndex++]); + if (!stream.isNull()) { + return new HtmlReaderStream(stream, 50000); + } + } + return 0; +} + +bool CHMPlugin::readMetaInfo(Book &book) const { + const ZLFile &file = book.file(); + shared_ptr<ZLInputStream> stream = file.inputStream(); + if (stream.isNull() || !stream->open()) { + return false; + } + + CHMFileInfo chmFile(file); + if (!chmFile.init(*stream)) { + return false; + } + + CHMFileInfo::FileNames names = chmFile.sectionNames(stream); + if (names.empty()) { + return false; + } + + /* + shared_ptr<ZLInputStream> entryStream = chmFile.entryStream(stream, names.Start); + if (entryStream.isNull()) { + entryStream = chmFile.entryStream(stream, names.Home); + } + if (entryStream.isNull()) { + entryStream = chmFile.entryStream(stream, names.TOC); + } + / * + if (entryStream.isNull()) { + chmFile.entryStream(stream, names.Index); + } + * / + if (entryStream.isNull()) { + return false; + } + */ + + CHMTextStream textStream(chmFile, stream); + detectEncodingAndLanguage(book, textStream); + if (book.encoding().empty()) { + return false; + } + + return true; +} + +bool CHMPlugin::readLanguageAndEncoding(Book &book) const { + (void)book; + return true; +} + +class CHMHyperlinkMatcher : public BookModel::HyperlinkMatcher { + +public: + BookModel::Label match(const std::map<std::string,BookModel::Label> &lMap, const std::string &id) const; +}; + +BookModel::Label CHMHyperlinkMatcher::match(const std::map<std::string,BookModel::Label> &lMap, const std::string &id) const { + std::map<std::string,BookModel::Label>::const_iterator it = lMap.find(id); + if (it != lMap.end()) { + return it->second; + } + std::size_t index = id.find('#'); + if (index != std::string::npos) { + it = lMap.find(id.substr(0, index)); + } + return (it != lMap.end()) ? 
it->second : BookModel::Label(0, -1); +} + +bool CHMPlugin::readModel(BookModel &model) const { + model.setHyperlinkMatcher(new CHMHyperlinkMatcher()); + + const Book &book = *model.book(); + const ZLFile &file = book.file(); + + shared_ptr<ZLInputStream> stream = file.inputStream(); + if (stream.isNull() || !stream->open()) { + return false; + } + + shared_ptr<CHMFileInfo> info = new CHMFileInfo(file); + if (!info->init(*stream)) { + return false; + } + + CHMFileInfo::FileNames names = info->sectionNames(stream); + if (names.empty()) { + return false; + } + + CHMReferenceCollection referenceCollection; + + referenceCollection.addReference(names.Start, false); + referenceCollection.addReference(names.Home, false); + + const std::string &encoding = book.encoding(); + + shared_ptr<ZLInputStream> tocStream = info->entryStream(stream, names.TOC); + HHCReader hhcReader(referenceCollection, model, encoding); + if (!tocStream.isNull() && tocStream->open()) { + referenceCollection.setPrefix(names.TOC); + hhcReader.readDocument(*tocStream); + } + + /* + if (!tocStream.isNull() && tocStream->open()) { + std::string buf; + buf.append(tocStream->sizeOfOpened(), '\0'); + tocStream->read((char*)buf.data(), buf.length()); + std::cerr << "[ " << names.TOC << " ]\n" << buf << "\n"; + } + */ + + int contentCounter = 0; + PlainTextFormat format(file); + HtmlSectionReader reader(model, format, encoding, info, referenceCollection); + while (referenceCollection.containsNonProcessedReferences()) { + const std::string fileName = referenceCollection.nextReference(); + if (ZLStringUtil::stringEndsWith(fileName, ".jpg") || + ZLStringUtil::stringEndsWith(fileName, ".gif")) { + std::string lowerCasedFileName = ZLUnicodeUtil::toLower(fileName); + BookReader bookReader(model); + bookReader.setMainTextModel(); + bookReader.addHyperlinkLabel(lowerCasedFileName); + bookReader.pushKind(REGULAR); + bookReader.beginParagraph(); + bookReader.addImageReference(lowerCasedFileName); + 
bookReader.addImage(fileName, new CHMFileImage(info, fileName)); + bookReader.endParagraph(); + bookReader.insertEndOfTextParagraph(); + } else { + shared_ptr<ZLInputStream> entryStream = info->entryStream(stream, fileName); + if (!entryStream.isNull() && entryStream->open()) { + /* + std::string buf; + buf.append(entryStream->sizeOfOpened(), '\0'); + entryStream->read((char*)buf.data(), buf.length()); + std::cerr << "[ " << fileName << " ]\n" << buf << "\n"; + entryStream->open(); + */ + reader.setSectionName(fileName); + reader.readDocument(*entryStream); + ++contentCounter; + } + } + } + if (contentCounter == 0) { + return false; + } + + hhcReader.setReferences(); + + + return true; +} diff --git a/reader/src/formats/chm/CHMPlugin.h b/reader/src/formats/chm/CHMPlugin.h new file mode 100644 index 0000000..0d38e62 --- /dev/null +++ b/reader/src/formats/chm/CHMPlugin.h @@ -0,0 +1,41 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
// Format plugin for CHM files.
class CHMPlugin : public FormatPlugin {

public:
	CHMPlugin();
	~CHMPlugin();
	// Always false for this plugin (see the inline definition below).
	bool providesMetaInfo() const;
	bool acceptsFile(const ZLFile &file) const;
	bool readMetaInfo(Book &book) const;
	bool readLanguageAndEncoding(Book &book) const;
	bool readModel(BookModel &model) const;
};

// Construction and destruction need no work of their own.
inline CHMPlugin::CHMPlugin() {}
inline CHMPlugin::~CHMPlugin() {}
inline bool CHMPlugin::providesMetaInfo() const { return false; }
+ */ + +#include <ZLUnicodeUtil.h> + +#include "CHMReferenceCollection.h" +#include "../util/MiscUtil.h" + +std::string CHMReferenceCollection::fullReference(const std::string &prefix, std::string reference) { + reference = MiscUtil::decodeHtmlURL(reference); + if ((reference.length() > 0) && (reference[0] == '/')) { + return reference; + } + const int index = reference.rfind("::"); + if (index != -1) { + return reference.substr(index + 2); + } + + int counter = 0; + while (reference.substr(counter * 3, 3) == "../") { + ++counter; + } + + int slashIndex = prefix.length() - 1; + for (int i = 0; (i < counter) && (slashIndex > 0); ++i) { + slashIndex = prefix.rfind('/', slashIndex - 1); + } + return prefix.substr(0, slashIndex + 1) + reference.substr(counter * 3); +} + +CHMReferenceCollection::CHMReferenceCollection() : myPrefix("/") { +} + +const std::string &CHMReferenceCollection::addReference(const std::string &reference, bool doConvert) { + if (reference.empty()) { + return reference; + } + std::string fullRef = doConvert ? 
fullReference(myPrefix, reference) : MiscUtil::decodeHtmlURL(reference); + + const int index = fullRef.find('#'); + if (index == -1) { + fullRef = ZLUnicodeUtil::toLower(fullRef); + } else { + fullRef = ZLUnicodeUtil::toLower(fullRef.substr(0, index)); + } + std::set<std::string>::const_iterator it = myReferences.find(fullRef); + if (it != myReferences.end()) { + return *it; + } + + myReferences.insert(fullRef); + myReferenceQueue.push(fullRef); + return myReferenceQueue.back(); +} + +bool CHMReferenceCollection::containsNonProcessedReferences() const { + return !myReferenceQueue.empty(); +} + +const std::string CHMReferenceCollection::nextReference() { + if (myReferenceQueue.empty()) { + return ""; + } + const std::string front = myReferenceQueue.front(); + myReferenceQueue.pop(); + return front; +} + +void CHMReferenceCollection::setPrefix(const std::string &fileName) { + myPrefix = MiscUtil::decodeHtmlURL(fileName.substr(0, fileName.rfind('/') + 1)); +} + +const std::string &CHMReferenceCollection::prefix() const { + return myPrefix; +} diff --git a/reader/src/formats/chm/CHMReferenceCollection.h b/reader/src/formats/chm/CHMReferenceCollection.h new file mode 100644 index 0000000..6a53c45 --- /dev/null +++ b/reader/src/formats/chm/CHMReferenceCollection.h @@ -0,0 +1,50 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
// Set of normalized entry references plus a FIFO queue of those that still
// have to be processed.
class CHMReferenceCollection {

public:
	// Resolves reference against prefix: absolute references (leading '/')
	// are kept, "a::b" yields "b", and leading "../" groups walk up the
	// prefix.
	static std::string fullReference(const std::string &prefix, std::string reference);

public:
	// The initial prefix is "/".
	CHMReferenceCollection();
	// Registers a reference (queued only if not seen before) and returns
	// its normalized, lower-cased form without "#anchor". An empty
	// reference is returned unchanged.
	const std::string &addReference(const std::string &reference, bool doConvert);
	bool containsNonProcessedReferences() const;
	// Pops the oldest queued reference; returns "" when none is left.
	const std::string nextReference();
	// Uses the directory part of fileName as the new prefix.
	void setPrefix(const std::string &fileName);
	const std::string &prefix() const;

private:
	std::string myPrefix;
	std::set<std::string> myReferences;
	std::queue<std::string> myReferenceQueue;

private:
	// Copying is disabled: declared but intentionally not defined.
	CHMReferenceCollection(const CHMReferenceCollection&);
	const CHMReferenceCollection &operator = (const CHMReferenceCollection&);
};
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#include "LZXDecompressor.h" + +void LZXDecompressor::E8Decoder::reset(unsigned int fileSize) { + myFileSize = fileSize; + myFramesCounter = 0; + myPosition = 0; +} + +void LZXDecompressor::E8Decoder::decode(unsigned char *buffer, const std::size_t size) { + if (myFramesCounter >= 32768) { + return; + } + ++myFramesCounter; + if (myFileSize == 0) { + return; + } + + myPosition += size; + + if (size <= 10) { + return; + } + + const unsigned char *end = buffer + size - 10; + + for (unsigned char *ptr = buffer; ptr < end; ) { + if (*ptr == 0xE8) { + int absoluteOffset = + ptr[1] + (ptr[2] << 8) + (ptr[3] << 16) + (ptr[4] << 24); + int relativeOffset = + (absoluteOffset >= 0) ? + absoluteOffset - (ptr - buffer) : absoluteOffset + myFileSize; + ptr[1] = (unsigned char)relativeOffset; + ptr[2] = (unsigned char)(relativeOffset >> 8); + ptr[3] = (unsigned char)(relativeOffset >> 16); + ptr[4] = (unsigned char)(relativeOffset >> 24); + ptr += 5; + } else { + ++ptr; + } + } +} diff --git a/reader/src/formats/chm/HHCReader.cpp b/reader/src/formats/chm/HHCReader.cpp new file mode 100644 index 0000000..4fd3105 --- /dev/null +++ b/reader/src/formats/chm/HHCReader.cpp @@ -0,0 +1,107 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#include <ZLUnicodeUtil.h> + +#include "HHCReader.h" +#include "CHMReferenceCollection.h" + +HHCReader::HHCReader(CHMReferenceCollection &collection, BookModel &model, const std::string &encoding) : HtmlReader(encoding), myReferenceCollection(collection), myBookReader(model) { +} + +HHCReader::~HHCReader() { +} + +void HHCReader::startDocumentHandler() { + myBookReader.setMainTextModel(); +} + +void HHCReader::endDocumentHandler() { + std::string tmp0; + myText.swap(tmp0); + std::string tmp1; + myReference.swap(tmp1); +} + +static const std::string UL = "UL"; +static const std::string LI = "LI"; +static const std::string OBJECT = "OBJECT"; +static const std::string PARAM = "PARAM"; +static const std::string NAME = "NAME"; +static const std::string VALUE = "VALUE"; +static const std::string NAME_VALUE = "Name"; +static const std::string LOCAL_VALUE = "Local"; + +static bool isFirstChild = false; + +bool HHCReader::tagHandler(const HtmlTag &tag) { + if (tag.Start) { + if (tag.Name == UL) { + isFirstChild = true; + } else if (tag.Name == LI) { + } else if (tag.Name == OBJECT) { + myText.erase(); + myReference.erase(); + } else if (tag.Name == PARAM) { + std::string name; + std::string value; + for (std::vector<HtmlAttribute>::const_iterator it = tag.Attributes.begin(); it != tag.Attributes.end(); ++it) { + if (it->Name == NAME) { + name = it->Value; + } else if (it->Name == VALUE) { + value = it->Value; + } + } + if (name == NAME_VALUE) { + myText = value; + } else if (name == LOCAL_VALUE) { + myReference = myReferenceCollection.addReference(value, true); + } + } + } else { + if (tag.Name == UL) { + myBookReader.endContentsParagraph(); + } else if (tag.Name == OBJECT) { + if 
(!myText.empty() || !myReference.empty()) { + if (!isFirstChild) { + myBookReader.endContentsParagraph(); + } else { + isFirstChild = false; + } + myBookReader.beginContentsParagraph(); + if (myText.empty()) { + myText = "..."; + } + myBookReader.addContentsData(myText.empty() ? "..." : myText); + myReferenceVector.push_back(ZLUnicodeUtil::toLower(myReference)); + } + } + } + return true; +} + +bool HHCReader::characterDataHandler(const char*, std::size_t, bool) { + return true; +} + +void HHCReader::setReferences() { + for (std::size_t i = 0; i < myReferenceVector.size(); ++i) { + myBookReader.setReference(i, myBookReader.model().label(myReferenceVector[i]).ParagraphNumber); + } +} diff --git a/reader/src/formats/chm/HHCReader.h b/reader/src/formats/chm/HHCReader.h new file mode 100644 index 0000000..c0e4cef --- /dev/null +++ b/reader/src/formats/chm/HHCReader.h @@ -0,0 +1,57 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
class CHMReferenceCollection;

// HTML reader for the table-of-contents entry: fills the contents tree of
// the book model and registers every referenced entry in the reference
// collection.
class HHCReader : public HtmlReader {

public:
	HHCReader(CHMReferenceCollection &collection, BookModel &model, const std::string &encoding);
	~HHCReader();

	// Binds the collected contents items to the paragraphs of their
	// target labels.
	void setReferences();

private:
	void startDocumentHandler();
	void endDocumentHandler();

	bool tagHandler(const HtmlTag &tag);
	bool characterDataHandler(const char*, std::size_t, bool);

private:
	CHMReferenceCollection &myReferenceCollection;

	// Title and target of the item that is currently being read.
	std::string myText;
	std::string myReference;

	BookReader myBookReader;

	// Lower-cased target of every emitted contents item, in order.
	std::vector<std::string> myReferenceVector;
};
+ */ + +#include <ZLUnicodeUtil.h> + +#include "HHCReferenceCollector.h" +#include "CHMReferenceCollection.h" + +HHCReferenceCollector::HHCReferenceCollector(CHMReferenceCollection &collection) : HtmlReader("US-ASCII"), myReferenceCollection(collection) { +} + +void HHCReferenceCollector::startDocumentHandler() { +} + +void HHCReferenceCollector::endDocumentHandler() { +} + +static const std::string PARAM = "PARAM"; +static const std::string NAME = "NAME"; +static const std::string VALUE = "VALUE"; +static const std::string NAME_VALUE = "Name"; +static const std::string LOCAL_VALUE = "Local"; + +bool HHCReferenceCollector::tagHandler(const HtmlTag &tag) { + if (tag.Start) { + if (tag.Name == PARAM) { + std::string name; + std::string value; + for (std::vector<HtmlAttribute>::const_iterator it = tag.Attributes.begin(); it != tag.Attributes.end(); ++it) { + if (it->Name == NAME) { + name = it->Value; + } else if (it->Name == VALUE) { + value = it->Value; + } + } + if (name == LOCAL_VALUE) { + myReferenceCollection.addReference(value, true); + } + } + } + return true; +} + +bool HHCReferenceCollector::characterDataHandler(const char*, std::size_t, bool) { + return true; +} diff --git a/reader/src/formats/chm/HHCReferenceCollector.h b/reader/src/formats/chm/HHCReferenceCollector.h new file mode 100644 index 0000000..20e58d1 --- /dev/null +++ b/reader/src/formats/chm/HHCReferenceCollector.h @@ -0,0 +1,45 @@ +/* + * Copyright (C) 2008-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
class CHMReferenceCollection;

// HTML reader that only collects the entries referenced by a
// table-of-contents document; it produces no text of its own.
class HHCReferenceCollector : public HtmlReader {

public:
	HHCReferenceCollector(CHMReferenceCollection &collection);

private:
	void startDocumentHandler();
	void endDocumentHandler();

	bool tagHandler(const HtmlTag &tag);
	bool characterDataHandler(const char*, std::size_t, bool);

private:
	// Receives every reference that is found.
	CHMReferenceCollection &myReferenceCollection;
};
+ */ + +#include <ZLUnicodeUtil.h> + +#include "HtmlSectionReader.h" +#include "CHMReferenceCollection.h" +#include "CHMFileImage.h" +#include "../util/MiscUtil.h" +#include "../html/HtmlTagActions.h" + +class HtmlSectionHrefTagAction : public HtmlHrefTagAction { + +public: + HtmlSectionHrefTagAction(HtmlSectionReader &reader); + void run(const HtmlReader::HtmlTag &tag); +}; + +class HtmlSectionImageTagAction : public HtmlTagAction { + +public: + HtmlSectionImageTagAction(HtmlSectionReader &reader); + void run(const HtmlReader::HtmlTag &tag); +}; + +shared_ptr<HtmlTagAction> HtmlSectionReader::createAction(const std::string &tag) { + if (tag == "IMG") { + return new HtmlSectionImageTagAction(*this); + } else if (tag == "A") { + return new HtmlSectionHrefTagAction(*this); + } + return HtmlBookReader::createAction(tag); +} + +HtmlSectionReader::HtmlSectionReader(BookModel &model, const PlainTextFormat &format, const std::string &encoding, shared_ptr<CHMFileInfo> info, CHMReferenceCollection &collection) : HtmlBookReader("", model, format, encoding), myInfo(info), myReferenceCollection(collection) { + setBuildTableOfContent(false); +} + +void HtmlSectionReader::setSectionName(const std::string §ionName) { + myCurrentSectionName = ZLUnicodeUtil::toLower(sectionName); + myReferenceCollection.setPrefix(myCurrentSectionName); +} + +void HtmlSectionReader::startDocumentHandler() { + HtmlBookReader::startDocumentHandler(); + myBookReader.addHyperlinkLabel(ZLUnicodeUtil::toLower(myCurrentSectionName)); +} + +void HtmlSectionReader::endDocumentHandler() { + HtmlBookReader::endDocumentHandler(); + myBookReader.insertEndOfTextParagraph(); +} + +HtmlSectionHrefTagAction::HtmlSectionHrefTagAction(HtmlSectionReader &reader) : HtmlHrefTagAction(reader) { +} + +void HtmlSectionHrefTagAction::run(const HtmlReader::HtmlTag &tag) { + if (tag.Start) { + HtmlSectionReader &reader = (HtmlSectionReader&)myReader; + for (unsigned int i = 0; i < tag.Attributes.size(); ++i) { + if 
(tag.Attributes[i].Name == "NAME") { + bookReader().addHyperlinkLabel(ZLUnicodeUtil::toLower(reader.myCurrentSectionName + '#' + tag.Attributes[i].Value)); + } else if ((hyperlinkType() == REGULAR) && (tag.Attributes[i].Name == "HREF")) { + const std::string &value = tag.Attributes[i].Value; + if (!value.empty()) { + FBTextKind referenceType = MiscUtil::referenceType(value); + if (referenceType != INTERNAL_HYPERLINK) { + bookReader().addHyperlinkControl(referenceType, value); + setHyperlinkType(referenceType); + } else { + const int index = value.find('#'); + std::string sectionName = (index == -1) ? value : value.substr(0, index); + sectionName = ZLUnicodeUtil::toLower(MiscUtil::decodeHtmlURL(sectionName)); + if (sectionName.empty()) { + sectionName = reader.myCurrentSectionName; + } else { + sectionName = reader.myReferenceCollection.addReference(sectionName, true); + } + bookReader().addHyperlinkControl( + INTERNAL_HYPERLINK, ZLUnicodeUtil::toLower((index == -1) ? sectionName : (sectionName + value.substr(index))) + ); + setHyperlinkType(INTERNAL_HYPERLINK); + } + } + } + } + } else if (hyperlinkType() != REGULAR) { + bookReader().addControl(hyperlinkType(), false); + setHyperlinkType(REGULAR); + } +} + +HtmlSectionImageTagAction::HtmlSectionImageTagAction(HtmlSectionReader &reader) : HtmlTagAction(reader) { +} + +void HtmlSectionImageTagAction::run(const HtmlReader::HtmlTag &tag) { + if (tag.Start) { + //bookReader().endParagraph(); + HtmlSectionReader &reader = (HtmlSectionReader&)myReader; + for (unsigned int i = 0; i < tag.Attributes.size(); ++i) { + if (tag.Attributes[i].Name == "SRC") { + std::string fileName = MiscUtil::decodeHtmlURL(tag.Attributes[i].Value); + fileName = CHMReferenceCollection::fullReference(reader.myReferenceCollection.prefix(), fileName); + fileName = ZLUnicodeUtil::toLower(fileName); + bookReader().addImageReference(fileName); + bookReader().addImage(fileName, new CHMFileImage(reader.myInfo, fileName)); + break; + } + } + 
//bookReader().beginParagraph(); + } +} diff --git a/reader/src/formats/chm/HtmlSectionReader.h b/reader/src/formats/chm/HtmlSectionReader.h new file mode 100644 index 0000000..424c178 --- /dev/null +++ b/reader/src/formats/chm/HtmlSectionReader.h @@ -0,0 +1,50 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ */ + +#ifndef __HTMLSECTIONREADER_H__ +#define __HTMLSECTIONREADER_H__ + +#include "../html/HtmlBookReader.h" +#include "CHMFile.h" + +class CHMReferenceCollection; + +class HtmlSectionReader : public HtmlBookReader { + +public: + HtmlSectionReader(BookModel &model, const PlainTextFormat &format, const std::string &encoding, shared_ptr<CHMFileInfo> info, CHMReferenceCollection &collection); + void setSectionName(const std::string §ionName); + +private: + void startDocumentHandler(); + void endDocumentHandler(); + +private: + shared_ptr<HtmlTagAction> createAction(const std::string &tag); + +private: + shared_ptr<CHMFileInfo> myInfo; + CHMReferenceCollection &myReferenceCollection; + std::string myCurrentSectionName; + +friend class HtmlSectionHrefTagAction; +friend class HtmlSectionImageTagAction; +}; + +#endif /* __HTMLSECTIONREADER_H__ */ diff --git a/reader/src/formats/chm/HuffmanDecoder.cpp b/reader/src/formats/chm/HuffmanDecoder.cpp new file mode 100644 index 0000000..db8718f --- /dev/null +++ b/reader/src/formats/chm/HuffmanDecoder.cpp @@ -0,0 +1,60 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ */ + +#include <algorithm> + +#include "HuffmanDecoder.h" + +HuffmanDecoder::HuffmanDecoder() : myMaxBitsNumber(0) { +} + +void HuffmanDecoder::reset() { + CodeLengths.clear(); +} + +bool HuffmanDecoder::buildTable() { + myMaxBitsNumber = 0; + for (unsigned short symbol = 0; symbol < CodeLengths.size(); symbol++) { + myMaxBitsNumber = std::max(CodeLengths[symbol], myMaxBitsNumber); + } + if (myMaxBitsNumber > 16) { + return false; + } + + unsigned int tableSize = 1 << myMaxBitsNumber; + mySymbols.clear(); + mySymbols.reserve(tableSize); + + for (unsigned char i = 1; i <= myMaxBitsNumber; ++i) { + for (unsigned short symbol = 0; symbol < CodeLengths.size(); symbol++) { + if (CodeLengths[symbol] == i) { + mySymbols.insert(mySymbols.end(), 1 << (myMaxBitsNumber - i), symbol); + if (mySymbols.size() > tableSize) { + return false; + } + } + } + } + + if (mySymbols.size() < tableSize) { + mySymbols.insert(mySymbols.end(), tableSize - mySymbols.size(), 0); + } + + return true; +} diff --git a/reader/src/formats/chm/HuffmanDecoder.h b/reader/src/formats/chm/HuffmanDecoder.h new file mode 100644 index 0000000..bd9f700 --- /dev/null +++ b/reader/src/formats/chm/HuffmanDecoder.h @@ -0,0 +1,53 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#ifndef __HUFFMANDECODER_H__ +#define __HUFFMANDECODER_H__ + +#include <vector> + +#include "BitStream.h" + +class HuffmanDecoder { + +public: + HuffmanDecoder(); + + bool buildTable(); + void reset(); + + unsigned int getSymbol(BitStream &stream) const; + +private: + unsigned char myMaxBitsNumber; + std::vector<unsigned short> mySymbols; + std::vector<unsigned char> CodeLengths; + HuffmanDecoder(const HuffmanDecoder&); + const HuffmanDecoder &operator = (const HuffmanDecoder&); + +friend class LZXDecompressor; +}; + +inline unsigned int HuffmanDecoder::getSymbol(BitStream &stream) const { + unsigned int symbol = mySymbols[stream.peek(myMaxBitsNumber)]; + stream.remove(CodeLengths[symbol]); + return symbol; +} + +#endif /* __HUFFMANDECODER_H__ */ diff --git a/reader/src/formats/chm/LZXDecompressor.cpp b/reader/src/formats/chm/LZXDecompressor.cpp new file mode 100644 index 0000000..38b4311 --- /dev/null +++ b/reader/src/formats/chm/LZXDecompressor.cpp @@ -0,0 +1,287 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#include <cstring> + +#include "LZXDecompressor.h" + +static unsigned int slotNumber(int windowSizeIndex) { + if (windowSizeIndex == 20) { + return 42; + } else if (windowSizeIndex == 21) { + return 50; + } else { + return 2 * windowSizeIndex; + } +} + +LZXDecompressor::LZXDecompressor(int windowSizeIndex) : myWindow(1 << windowSizeIndex, 0), mySlotNumber(slotNumber(windowSizeIndex)) { + reset(); +} + +void LZXDecompressor::reset() { + myCurrentBlockType = UNKNOWNN; + myReadHeader = true; + + myState.WindowIterator = myWindow.begin(); + myState.R0 = 1; + myState.R1 = 1; + myState.R2 = 1; + + myMainTree.reset(); + myLengthTree.reset(); + + myBlockBytesLeft = 0; + + myE8Decoder.reset(0); +} + +static bool fill(std::vector<unsigned char> &data, std::vector<unsigned char>::iterator &it, int num, unsigned char value) { + if (data.end() - it < num) { + return false; + } + std::vector<unsigned char>::iterator end = it + num; + while (it != end) { + *it++ = value; + } + return true; +} + +bool LZXDecompressor::readLengths(HuffmanDecoder &decoder, std::size_t from, std::size_t size) { + HuffmanDecoder preTree; + preTree.CodeLengths.reserve(20); + for (int i = 0; i < 20; i++) { + preTree.CodeLengths.push_back(myBitStream.get(4)); + } + if (!preTree.buildTable()) { + return false; + } + + std::vector<unsigned char> &lengths = decoder.CodeLengths; + if (lengths.size() < from + size) { + lengths.insert(lengths.end(), from + size - lengths.size(), 0); + } + std::vector<unsigned char>::iterator start = lengths.begin() + from; + std::vector<unsigned char>::iterator end = start + size; + for (std::vector<unsigned char>::iterator it = start; it != end; ) { + int z = preTree.getSymbol(myBitStream); + if (z == 17) { + if (!fill(lengths, it, 
myBitStream.get(4) + 4, 0)) { + return false; + } + } else if (z == 18) { + if (!fill(lengths, it, myBitStream.get(5) + 20, 0)) { + return false; + } + } else if (z == 19) { + unsigned int num = myBitStream.get(1) + 4; + z = *it - preTree.getSymbol(myBitStream); + if (!fill(lengths, it, num, (z < 0) ? z + 17 : z)) { + return false; + } + } else { + z = *it - z; + *it++ = (z < 0) ? z + 17 : z; + } + } + + return true; +} + +static const unsigned int basePosition[51] = { + 0, 1, 2, 3, 4, 6, 8, 12, + 16, 24, 32, 48, 64, 96, 128, 192, + 256, 384, 512, 768, 1024, 1536, 2048, 3072, + 4096, 6144, 8192, 12288, 16384, 24576, 32768, 49152, + 65536, 98304, 131072, 196608, 262144, 393216, 524288, 655360, + 786432, 917504, 1048576, 1179648, 1310720, 1441792, 1572864, 1703936, + 1835008, 1966080, 2097152 +}; + +bool LZXDecompressor::decodeBytes(DecodingState &state, std::size_t bytesToDecode) { + if (myCurrentBlockType == UNCOMPRESSED) { + if (!myBitStream.getBytesDirect(&*state.WindowIterator, bytesToDecode)) { + return false; + } + state.WindowIterator += bytesToDecode; + return true; + } + + while (bytesToDecode > 0) { + int symbol = myMainTree.getSymbol(myBitStream); + if (symbol < 256) { + *state.WindowIterator++ = symbol; + --bytesToDecode; + continue; + } + + std::size_t length = symbol % 8; + if (length == 7) { + length += myLengthTree.getSymbol(myBitStream); + } + length += 2; + if (length > bytesToDecode) { + return false; + } + + std::size_t offset = (symbol - 256) / 8; + switch (offset) { + case 0: + offset = state.R0; + break; + case 1: + offset = state.R1; + state.R1 = state.R0; + state.R0 = offset; + break; + case 2: + offset = state.R2; + state.R2 = state.R0; + state.R0 = offset; + break; + default: + if ((myCurrentBlockType == VERBATIM) && (offset == 3)) { + offset = 1; + } else { + if (offset > 50) { + return false; + } + const int positionFooterBits = std::max(0, std::min((int)offset / 2 - 1, 17)); + offset = basePosition[offset] - 2; + if ((myCurrentBlockType 
== VERBATIM) || (positionFooterBits == 1) || (positionFooterBits == 2)) { + offset += myBitStream.get(positionFooterBits); + } else if (positionFooterBits == 3) { + offset += myAlignedOffsetTree.getSymbol(myBitStream); + } else if (positionFooterBits > 3) { + offset += 8 * myBitStream.get(positionFooterBits - 3); + offset += myAlignedOffsetTree.getSymbol(myBitStream); + } else { + offset = 1; + } + } + state.R2 = state.R1; + state.R1 = state.R0; + state.R0 = offset; + break; + } + + if ((state.WindowIterator - myWindow.begin()) + myWindow.size() < offset) { + return false; + } + if (myWindow.size() >= offset + (myWindow.end() - state.WindowIterator)) { + offset += myWindow.size(); + if (myWindow.size() >= offset + (myWindow.end() - state.WindowIterator)) { + return false; + } + } + std::vector<unsigned char>::iterator srcIt = state.WindowIterator + (myWindow.size() - offset); + for (std::size_t i = 0; i < length; ++i) { + if (srcIt == myWindow.end()) { + srcIt -= myWindow.size(); + } + *state.WindowIterator++ = *srcIt++; + } + bytesToDecode -= length; + } + return true; +} + +bool LZXDecompressor::decompress(const std::string &data, unsigned char *outBuffer, const std::size_t outSize) { + myBitStream.setData(data); + + if (myReadHeader) { + if (myBitStream.get(1) == 1) { + myE8Decoder.reset(myBitStream.get(32)); + } + myReadHeader = false; + } + + DecodingState state = myState; + + for (std::size_t bytesToWrite = outSize; bytesToWrite > 0; ) { + if (myBlockBytesLeft == 0) { + if (myCurrentBlockType == UNCOMPRESSED) { + if (myBlockSize & 1) { + myBitStream.remove(8); + } + myBitStream.reset(); + } + + myCurrentBlockType = (BlockType)myBitStream.get(3); + myBlockSize = myBitStream.get(24); + myBlockBytesLeft = myBlockSize; + + switch (myCurrentBlockType) { + case UNCOMPRESSED: + myBitStream.reset(); + state.R0 = myBitStream.get4BytesDirect(); + state.R1 = myBitStream.get4BytesDirect(); + state.R2 = myBitStream.get4BytesDirect(); + break; + + case ALIGNED: + 
myAlignedOffsetTree.CodeLengths.clear(); + for (int i = 0; i < 8; i++) { + myAlignedOffsetTree.CodeLengths.push_back(myBitStream.get(3)); + } + if (!myAlignedOffsetTree.buildTable()) { + return false; + } + // no break; it's not a mistake + + case VERBATIM: + if (!readLengths(myMainTree, 0, 256) || + !readLengths(myMainTree, 256, 8 * mySlotNumber) || + !readLengths(myLengthTree, 0, 249) || + !myMainTree.buildTable() || + !myLengthTree.buildTable()) { + return false; + } + break; + + default: + return false; + } + } + + while ((myBlockBytesLeft > 0) && (bytesToWrite > 0)) { + std::size_t bytesToDecode = std::min(myBlockBytesLeft, bytesToWrite); + if (state.WindowIterator + bytesToDecode > myWindow.end()) { + return false; + } + + if (!decodeBytes(state, bytesToDecode)) { + return false; + } + + bytesToWrite -= bytesToDecode; + myBlockBytesLeft -= bytesToDecode; + } + } + + std::vector<unsigned char>::iterator jt = + (state.WindowIterator != myWindow.begin()) ? state.WindowIterator : myWindow.end(); + std::memcpy(outBuffer, &*(jt - outSize), outSize); + + myState = state; + + myE8Decoder.decode(outBuffer, outSize); + + return true; +} diff --git a/reader/src/formats/chm/LZXDecompressor.h b/reader/src/formats/chm/LZXDecompressor.h new file mode 100644 index 0000000..dac9e1f --- /dev/null +++ b/reader/src/formats/chm/LZXDecompressor.h @@ -0,0 +1,88 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#ifndef __LZXDECOMPRESSOR_H__ +#define __LZXDECOMPRESSOR_H__ + +#include <string> +#include <vector> + +#include "BitStream.h" +#include "HuffmanDecoder.h" + +class LZXDecompressor { + +public: + LZXDecompressor(int windowSizeIndex); + void reset(); + + bool decompress(const std::string &data, unsigned char *outBuffer, const std::size_t outSize); + +private: + struct DecodingState { + std::vector<unsigned char>::iterator WindowIterator; + unsigned int R0; + unsigned int R1; + unsigned int R2; + }; + + bool readLengths(HuffmanDecoder &decoder, std::size_t from, std::size_t size); + bool decodeBytes(DecodingState &state, std::size_t bytesToDecode); + +private: + enum BlockType { + UNKNOWNN = 0, + VERBATIM = 1, + ALIGNED = 2, + UNCOMPRESSED = 3 + }; + + BlockType myCurrentBlockType; + bool myReadHeader; + + std::vector<unsigned char> myWindow; + + DecodingState myState; + + std::size_t myBlockSize; + std::size_t myBlockBytesLeft; + + const unsigned int mySlotNumber; + HuffmanDecoder myMainTree; + HuffmanDecoder myLengthTree; + HuffmanDecoder myAlignedOffsetTree; + + BitStream myBitStream; + + class E8Decoder { + + public: + void reset(unsigned int fileSize); + void decode(unsigned char *buffer, const std::size_t size); + + private: + unsigned int myFramesCounter; + unsigned int myFileSize; + unsigned int myPosition; + }; + + E8Decoder myE8Decoder; +}; + +#endif /* __LZXDECOMPRESSOR_H__ */ diff --git a/reader/src/formats/css/StyleSheetParser.cpp b/reader/src/formats/css/StyleSheetParser.cpp new file mode 100644 index 0000000..33dc900 --- /dev/null +++ b/reader/src/formats/css/StyleSheetParser.cpp @@ -0,0 +1,244 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it 
and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#include <cctype> +#include <cstring> + +#include <ZLStringUtil.h> +#include <ZLInputStream.h> +#include <ZLLogger.h> + +#include "StyleSheetParser.h" + +StyleSheetTableParser::StyleSheetTableParser(StyleSheetTable &table) : myTable(table) { + //ZLLogger::Instance().registerClass("CSS"); +} + +void StyleSheetTableParser::storeData(const std::string &selector, const StyleSheetTable::AttributeMap &map) { + std::string s = selector; + ZLStringUtil::stripWhiteSpaces(s); + + if (s.empty()) { + return; + } + + if (s[0] == '@') { + processAtRule(s, map); + return; + } + + const std::vector<std::string> ids = ZLStringUtil::split(s, ","); + for (std::vector<std::string>::const_iterator it = ids.begin(); it != ids.end(); ++it) { + std::string id = *it; + ZLStringUtil::stripWhiteSpaces(id); + if (!id.empty()) { + const std::size_t index = id.find('.'); + if (index == std::string::npos) { + myTable.addMap(id, std::string(), map); + } else { + myTable.addMap(id.substr(0, index), id.substr(index + 1), map); + } + } + } +} + +void StyleSheetTableParser::processAtRule(const std::string &name, const StyleSheetTable::AttributeMap &map) { + (void)map; + if (name == "@font-face") { + } +} + +shared_ptr<ZLTextStyleEntry> StyleSheetSingleStyleParser::parseString(const char *text) { + myReadState = 
WAITING_FOR_ATTRIBUTE; + parse(text, std::strlen(text), true); + shared_ptr<ZLTextStyleEntry> control = StyleSheetTable::createControl(myMap); + reset(); + return control; +} + +StyleSheetParser::StyleSheetParser() { + reset(); +} + +StyleSheetParser::~StyleSheetParser() { +} + +void StyleSheetParser::reset() { + myWord.erase(); + myAttributeName.erase(); + myReadState = WAITING_FOR_SELECTOR; + myInsideComment = false; + mySelectorString.erase(); + myMap.clear(); +} + +void StyleSheetParser::parse(ZLInputStream &stream) { + if (stream.open()) { + char *buffer = new char[1024]; + while (true) { + int len = stream.read(buffer, 1024); + if (len == 0) { + break; + } + parse(buffer, len); + } + delete[] buffer; + stream.close(); + } +} + +void StyleSheetParser::parse(const char *text, int len, bool final) { + const char *start = text; + const char *end = text + len; + for (const char *ptr = start; ptr != end; ++ptr) { + if (std::isspace(*ptr)) { + if (start != ptr) { + myWord.append(start, ptr - start); + } + processWord(myWord); + myWord.erase(); + start = ptr + 1; + } else if (isControlSymbol(*ptr)) { + if (start != ptr) { + myWord.append(start, ptr - start); + } + processWord(myWord); + myWord.erase(); + processControl(*ptr); + start = ptr + 1; + } + } + if (start < end) { + myWord.append(start, end - start); + if (final) { + processWord(myWord); + myWord.erase(); + } + } +} + +bool StyleSheetParser::isControlSymbol(const char symbol) { + switch (myReadState) { + default: + case WAITING_FOR_SELECTOR: + return false; + case SELECTOR: + return symbol == '{' || symbol == ';'; + case WAITING_FOR_ATTRIBUTE: + return symbol == '}' || symbol == ':'; + case ATTRIBUTE_NAME: + return symbol == ':'; + case ATTRIBUTE_VALUE: + return symbol == '}' || symbol == ';'; + } +} + +void StyleSheetParser::storeData(const std::string&, const StyleSheetTable::AttributeMap&) { +} + +void StyleSheetParser::processAtRule(const std::string&, const StyleSheetTable::AttributeMap&) { +} + +void 
StyleSheetParser::processControl(const char control) { + switch (myReadState) { + case WAITING_FOR_SELECTOR: + break; + case SELECTOR: + switch (control) { + case '{': + myReadState = WAITING_FOR_ATTRIBUTE; + break; + case ';': + myReadState = WAITING_FOR_SELECTOR; + mySelectorString.erase(); + break; + } + break; + case WAITING_FOR_ATTRIBUTE: + if (control == '}') { + myReadState = WAITING_FOR_SELECTOR; + storeData(mySelectorString, myMap); + mySelectorString.erase(); + myMap.clear(); + } + break; + case ATTRIBUTE_NAME: + if (control == ':') { + myReadState = ATTRIBUTE_VALUE; + } + break; + case ATTRIBUTE_VALUE: + if (control == ';') { + myReadState = WAITING_FOR_ATTRIBUTE; + } else if (control == '}') { + myReadState = WAITING_FOR_SELECTOR; + storeData(mySelectorString, myMap); + mySelectorString.erase(); + myMap.clear(); + } + break; + } +} + +void StyleSheetParser::processWord(std::string &word) { + while (!word.empty()) { + int index = word.find(myInsideComment ? "*/" : "/*"); + if (!myInsideComment) { + if (index == -1) { + processWordWithoutComments(word); + } else if (index > 0) { + processWordWithoutComments(word.substr(0, index)); + } + } + if (index == -1) { + break; + } + myInsideComment = !myInsideComment; + word.erase(0, index + 2); + } +} + +void StyleSheetParser::processWordWithoutComments(const std::string &word) { + switch (myReadState) { + case WAITING_FOR_SELECTOR: + myReadState = SELECTOR; + mySelectorString = word; + break; + case SELECTOR: + mySelectorString += ' ' + word; + break; + case WAITING_FOR_ATTRIBUTE: + myReadState = ATTRIBUTE_NAME; + // go through + case ATTRIBUTE_NAME: + myAttributeName = word; + myMap[myAttributeName].clear(); + break; + case ATTRIBUTE_VALUE: + { + const std::size_t l = word.length(); + if (l >= 2 && (word[0] == '"' || word[0] == '\'') && word[0] == word[l - 1]) { + myMap[myAttributeName].push_back(word.substr(1, l - 2)); + } else { + myMap[myAttributeName].push_back(word); + } + break; + } + } +} diff --git 
a/reader/src/formats/css/StyleSheetParser.h b/reader/src/formats/css/StyleSheetParser.h new file mode 100644 index 0000000..8949823 --- /dev/null +++ b/reader/src/formats/css/StyleSheetParser.h @@ -0,0 +1,84 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ */ + +#ifndef __STYLESHEETPARSER_H__ +#define __STYLESHEETPARSER_H__ + +#include "StyleSheetTable.h" + +class ZLInputStream; + +class StyleSheetParser { + +protected: + StyleSheetParser(); + +public: + virtual ~StyleSheetParser(); + void reset(); + void parse(ZLInputStream &stream); + void parse(const char *text, int len, bool final = false); + +protected: + virtual void storeData(const std::string &selector, const StyleSheetTable::AttributeMap &map); + virtual void processAtRule(const std::string &name, const StyleSheetTable::AttributeMap &map); + +private: + bool isControlSymbol(const char symbol); + void processWord(std::string &word); + void processWordWithoutComments(const std::string &word); + void processControl(const char control); + +private: + std::string myWord; + std::string myAttributeName; + enum { + WAITING_FOR_SELECTOR, + SELECTOR, + WAITING_FOR_ATTRIBUTE, + ATTRIBUTE_NAME, + ATTRIBUTE_VALUE, + } myReadState; + bool myInsideComment; + std::string mySelectorString; + StyleSheetTable::AttributeMap myMap; + +friend class StyleSheetSingleStyleParser; +}; + +class StyleSheetTableParser : public StyleSheetParser { + +public: + StyleSheetTableParser(StyleSheetTable &table); + +private: + void storeData(const std::string &selector, const StyleSheetTable::AttributeMap &map); + void processAtRule(const std::string &name, const StyleSheetTable::AttributeMap &map); + +private: + StyleSheetTable &myTable; +}; + +class StyleSheetSingleStyleParser : public StyleSheetParser { + +public: + shared_ptr<ZLTextStyleEntry> parseString(const char *text); +}; + +#endif /* __STYLESHEETPARSER_H__ */ diff --git a/reader/src/formats/css/StyleSheetTable.cpp b/reader/src/formats/css/StyleSheetTable.cpp new file mode 100644 index 0000000..fe45a85 --- /dev/null +++ b/reader/src/formats/css/StyleSheetTable.cpp @@ -0,0 +1,267 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it 
under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#include <cstdlib> + +#include <ZLStringUtil.h> +#include <ZLLogger.h> + +#include "StyleSheetTable.h" + +bool StyleSheetTable::isEmpty() const { + return myControlMap.empty() && myPageBreakBeforeMap.empty() && myPageBreakAfterMap.empty(); +} + +void StyleSheetTable::addMap(const std::string &tag, const std::string &aClass, const AttributeMap &map) { + if ((!tag.empty() || !aClass.empty()) && !map.empty()) { + Key key(tag, aClass); + myControlMap[key] = createControl(map); + const std::vector<std::string> &pbb = values(map, "page-break-before"); + if (!pbb.empty()) { + if ((pbb[0] == "always") || + (pbb[0] == "left") || + (pbb[0] == "right")) { + myPageBreakBeforeMap[key] = true; + } else if (pbb[0] == "avoid") { + myPageBreakBeforeMap[key] = false; + } + } + const std::vector<std::string> &pba = values(map, "page-break-after"); + if (!pba.empty()) { + if ((pba[0] == "always") || + (pba[0] == "left") || + (pba[0] == "right")) { + myPageBreakAfterMap[key] = true; + } else if (pba[0] == "avoid") { + myPageBreakAfterMap[key] = false; + } + } + } +} + +static bool parseLength(const std::string &toParse, short &size, ZLTextStyleEntry::SizeUnit &unit) { + if (ZLStringUtil::stringEndsWith(toParse, "%")) { + unit = ZLTextStyleEntry::SIZE_UNIT_PERCENT; + size = std::atoi(toParse.c_str()); + return true; + } else if 
(ZLStringUtil::stringEndsWith(toParse, "em")) { + unit = ZLTextStyleEntry::SIZE_UNIT_EM_100; + size = (short)(100 * ZLStringUtil::stringToDouble(toParse, 0)); + return true; + } else if (ZLStringUtil::stringEndsWith(toParse, "ex")) { + unit = ZLTextStyleEntry::SIZE_UNIT_EX_100; + size = (short)(100 * ZLStringUtil::stringToDouble(toParse, 0)); + return true; + } else if (ZLStringUtil::stringEndsWith(toParse, "px")) { + unit = ZLTextStyleEntry::SIZE_UNIT_PIXEL; + size = std::atoi(toParse.c_str()); + return true; + } else if (ZLStringUtil::stringEndsWith(toParse, "pt")) { + unit = ZLTextStyleEntry::SIZE_UNIT_POINT; + size = std::atoi(toParse.c_str()); + return true; + } + return false; +} + +void StyleSheetTable::setLength(ZLTextStyleEntry &entry, ZLTextStyleEntry::Feature featureId, const AttributeMap &map, const std::string &attributeName) { + StyleSheetTable::AttributeMap::const_iterator it = map.find(attributeName); + if (it == map.end()) { + return; + } + const std::vector<std::string> &values = it->second; + if (!values.empty() && !values[0].empty()) { + short size; + ZLTextStyleEntry::SizeUnit unit; + if (parseLength(values[0], size, unit)) { + entry.setLength(featureId, size, unit); + } + } +} + +bool StyleSheetTable::doBreakBefore(const std::string &tag, const std::string &aClass) const { + std::map<Key,bool>::const_iterator it = myPageBreakBeforeMap.find(Key(tag, aClass)); + if (it != myPageBreakBeforeMap.end()) { + return it->second; + } + + it = myPageBreakBeforeMap.find(Key("", aClass)); + if (it != myPageBreakBeforeMap.end()) { + return it->second; + } + + it = myPageBreakBeforeMap.find(Key(tag, "")); + if (it != myPageBreakBeforeMap.end()) { + return it->second; + } + + return false; +} + +bool StyleSheetTable::doBreakAfter(const std::string &tag, const std::string &aClass) const { + std::map<Key,bool>::const_iterator it = myPageBreakAfterMap.find(Key(tag, aClass)); + if (it != myPageBreakAfterMap.end()) { + return it->second; + } + + it = 
// Builds a STYLE_CSS_ENTRY text style entry from a parsed CSS declaration
// block.
//
// `styles` maps a property name to its list of values. A property that is
// absent (values() returns an empty vector) leaves the corresponding feature
// of the entry untouched. Except for "text-decoration", only the first value
// of each property is examined.
//
// Returns the newly created entry; it is returned even when no property was
// recognised.
shared_ptr<ZLTextStyleEntry> StyleSheetTable::createControl(const AttributeMap &styles) {
	shared_ptr<ZLTextStyleEntry> entry = new ZLTextStyleEntry(ZLTextStyleEntry::STYLE_CSS_ENTRY);

	// text-align: unrecognised keywords are ignored (alignment is not set).
	const std::vector<std::string> &alignment = values(styles, "text-align");
	if (!alignment.empty()) {
		if (alignment[0] == "justify") {
			entry->setAlignmentType(ALIGN_JUSTIFY);
		} else if (alignment[0] == "left") {
			entry->setAlignmentType(ALIGN_LEFT);
		} else if (alignment[0] == "right") {
			entry->setAlignmentType(ALIGN_RIGHT);
		} else if (alignment[0] == "center") {
			entry->setAlignmentType(ALIGN_CENTER);
		}
	}

	// text-decoration: every value is processed in order, so a later value
	// can override an earlier one (e.g. "underline none").
	const std::vector<std::string> &deco = values(styles, "text-decoration");
	for (std::vector<std::string>::const_iterator it = deco.begin(); it != deco.end(); ++it) {
		if (*it == "underline") {
			entry->setFontModifier(ZLTextStyleEntry::FONT_MODIFIER_UNDERLINED, true);
		} else if (*it == "line-through") {
			entry->setFontModifier(ZLTextStyleEntry::FONT_MODIFIER_STRIKEDTHROUGH, true);
		} else if (*it == "none") {
			entry->setFontModifier(ZLTextStyleEntry::FONT_MODIFIER_UNDERLINED, false);
			entry->setFontModifier(ZLTextStyleEntry::FONT_MODIFIER_STRIKEDTHROUGH, false);
		}
	}

	// font-weight: keywords are mapped to a numeric weight, anything else is
	// parsed as an integer (with -1 passed as the fallback argument). The
	// bold modifier is set to (weight >= 600); when the weight stays at -1
	// ("bolder", "lighter", or a -1 result) the modifier is not touched.
	const std::vector<std::string> &bold = values(styles, "font-weight");
	if (!bold.empty()) {
		//ZLLogger::Instance().println(ZLLogger::DEFAULT_CLASS, "bold: " + bold[0]);
		int num = -1;
		if (bold[0] == "bold") {
			num = 700;
		} else if (bold[0] == "normal") {
			num = 400;
		} else if (bold[0] == "bolder") {
			// TODO: implement
		} else if (bold[0] == "lighter") {
			// TODO: implement
		} else {
			num = ZLStringUtil::stringToInteger(bold[0], -1);
		}
		if (num != -1) {
			entry->setFontModifier(ZLTextStyleEntry::FONT_MODIFIER_BOLD, num >= 600);
		}
	}

	// font-style: any value other than "italic" explicitly switches the
	// italic modifier off.
	const std::vector<std::string> &italic = values(styles, "font-style");
	if (!italic.empty()) {
		entry->setFontModifier(ZLTextStyleEntry::FONT_MODIFIER_ITALIC, italic[0] == "italic");
	}

	// font-variant: any value other than "small-caps" explicitly switches
	// the small-caps modifier off.
	const std::vector<std::string> &variant = values(styles, "font-variant");
	if (!variant.empty()) {
		entry->setFontModifier(ZLTextStyleEntry::FONT_MODIFIER_SMALLCAPS, variant[0] == "small-caps");
	}

	// font-family: only the first family name is used; an empty name is
	// ignored.
	const std::vector<std::string> &fontFamily = values(styles, "font-family");
	if (!fontFamily.empty() && !fontFamily[0].empty()) {
		entry->setFontFamily(fontFamily[0]);
		//ZLLogger::Instance().println(ZLLogger::DEFAULT_CLASS, "font family: " + fontFamily[0]);
	}

	// font-size: absolute keywords become a percentage; "inherit",
	// "smaller" and "larger" set a font modifier instead of a length;
	// any other value goes through parseLength(), which overwrites both
	// `size` and `unit` on success. No length is set when parsing fails.
	const std::vector<std::string> &fontSize = values(styles, "font-size");
	if (!fontSize.empty()) {
		//TODO implement FONT_MODIFIER_INHERIT, SMALLER and LARGER support
		bool doSetFontSize = true;
		short size = 100;
		ZLTextStyleEntry::SizeUnit unit = ZLTextStyleEntry::SIZE_UNIT_PERCENT;
		if (fontSize[0] == "xx-small") {
			size = 58;
		} else if (fontSize[0] == "x-small") {
			size = 69;
		} else if (fontSize[0] == "small") {
			size = 83;
		} else if (fontSize[0] == "medium") {
			size = 100;
		} else if (fontSize[0] == "large") {
			size = 120;
		} else if (fontSize[0] == "x-large") {
			size = 144;
		} else if (fontSize[0] == "xx-large") {
			size = 173;
		} else if (fontSize[0] == "inherit") {
			entry->setFontModifier(ZLTextStyleEntry::FONT_MODIFIER_INHERIT, true);
			doSetFontSize = false;
		} else if (fontSize[0] == "smaller") {
			entry->setFontModifier(ZLTextStyleEntry::FONT_MODIFIER_SMALLER, true);
			doSetFontSize = false;
		} else if (fontSize[0] == "larger") {
			entry->setFontModifier(ZLTextStyleEntry::FONT_MODIFIER_LARGER, true);
			doSetFontSize = false;
		} else if (!parseLength(fontSize[0], size, unit)) {
			doSetFontSize = false;
		}
		if (doSetFontSize) {
			entry->setLength(ZLTextStyleEntry::LENGTH_FONT_SIZE, size, unit);
		}
	}

	// Box lengths. margin-top/padding-top share LENGTH_SPACE_BEFORE and
	// margin-bottom/padding-bottom share LENGTH_SPACE_AFTER; the padding
	// call runs second, so when both properties are present and parse, the
	// padding value is the one that remains in the entry.
	setLength(*entry, ZLTextStyleEntry::LENGTH_LEFT_INDENT, styles, "margin-left");
	setLength(*entry, ZLTextStyleEntry::LENGTH_RIGHT_INDENT, styles, "margin-right");
	setLength(*entry, ZLTextStyleEntry::LENGTH_FIRST_LINE_INDENT_DELTA, styles, "text-indent");
	setLength(*entry, ZLTextStyleEntry::LENGTH_SPACE_BEFORE, styles, "margin-top");
	setLength(*entry, ZLTextStyleEntry::LENGTH_SPACE_BEFORE, styles, "padding-top");
	setLength(*entry, ZLTextStyleEntry::LENGTH_SPACE_AFTER, styles, "margin-bottom");
	setLength(*entry, ZLTextStyleEntry::LENGTH_SPACE_AFTER, styles, "padding-bottom");

	return entry;
}
// Table of CSS-derived styles, keyed by (tag name, class name).
//
// For every key it can hold a text style entry plus two optional flags that
// say whether a page break is requested before / after the element.
// Entries are added through addMap(), which is private and therefore only
// reachable from the friend class StyleSheetTableParser.
class StyleSheetTable {

public:
	// Property name -> list of values of one CSS declaration block.
	typedef std::map<std::string,std::vector<std::string> > AttributeMap;
	// Builds a style entry from a declaration block without storing it.
	static shared_ptr<ZLTextStyleEntry> createControl(const AttributeMap &map);

private:
	// Stores the style and page-break flags for the (tag, aClass) key.
	void addMap(const std::string &tag, const std::string &aClass, const AttributeMap &map);

	// Sets `featureId` on `entry` from the first value of `attributeName`,
	// if that attribute is present and parses as a length.
	static void setLength(ZLTextStyleEntry &entry, ZLTextStyleEntry::Feature featureId, const AttributeMap &map, const std::string &attributeName);
	// Returns the values of `name`, or a reference to an empty vector when
	// the property is absent.
	static const std::vector<std::string> &values(const AttributeMap &map, const std::string &name);

public:
	// True when no style and no page-break flag is stored.
	bool isEmpty() const;
	// Page-break queries; both return false when nothing matches.
	bool doBreakBefore(const std::string &tag, const std::string &aClass) const;
	bool doBreakAfter(const std::string &tag, const std::string &aClass) const;
	// Style entry stored for exactly (tag, aClass), or a null pointer.
	shared_ptr<ZLTextStyleEntry> control(const std::string &tag, const std::string &aClass) const;

	// Removes all styles and page-break flags.
	void clear();

private:
	// Map key: a tag name and a class name, ordered by tag name first and
	// class name second.
	struct Key {
		Key(const std::string &tag, const std::string &aClass);

		const std::string TagName;
		const std::string ClassName;

		bool operator < (const Key &key) const;
	};

	std::map<Key,shared_ptr<ZLTextStyleEntry> > myControlMap;
	std::map<Key,bool> myPageBreakBeforeMap;
	std::map<Key,bool> myPageBreakAfterMap;

friend class StyleSheetTableParser;
};
std::string &aClass) : TagName(tag), ClassName(aClass) { +} + +inline bool StyleSheetTable::Key::operator < (const StyleSheetTable::Key &key) const { + return (TagName < key.TagName) || ((TagName == key.TagName) && (ClassName < key.ClassName)); +} + +#endif /* __STYLESHEETTABLE_H__ */ diff --git a/reader/src/formats/doc/DocBookReader.cpp b/reader/src/formats/doc/DocBookReader.cpp new file mode 100644 index 0000000..99f471a --- /dev/null +++ b/reader/src/formats/doc/DocBookReader.cpp @@ -0,0 +1,377 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ */ + +#include <vector> +#include <string> + +#include <ZLInputStream.h> +#include <ZLLogger.h> +#include <ZLFile.h> +#include <ZLStringUtil.h> +#include <ZLFileImage.h> + +#include "DocBookReader.h" +#include "../../bookmodel/BookModel.h" +#include "../../library/Book.h" + +#include "OleStorage.h" +#include "OleMainStream.h" + +DocBookReader::DocBookReader(BookModel &model, const std::string &encoding) : + myModelReader(model), + myPictureCounter(0), + myEncoding(encoding) { + myReadState = READ_TEXT; +} + +bool DocBookReader::readBook() { + const ZLFile &file = myModelReader.model().book()->file(); + shared_ptr<ZLInputStream> stream = file.inputStream(); + if (stream.isNull() || !stream->open()) { + return false; + } + myModelReader.setMainTextModel(); + myModelReader.pushKind(REGULAR); + myModelReader.beginParagraph(); + + if (!readDocument(stream, true)) { + return false; + } + + myModelReader.insertEndOfTextParagraph(); + return true; +} + +void DocBookReader::handleChar(ZLUnicodeUtil::Ucs2Char ucs2char) { + if (myReadState == READ_FIELD && myReadFieldState == READ_FIELD_INFO) { + myFieldInfoBuffer.push_back(ucs2char); + return; + } + if (myReadState == READ_FIELD && myReadFieldState == DONT_READ_FIELD_TEXT) { + return; + } + if (myReadState == READ_FIELD && myReadFieldState == READ_FIELD_TEXT && ucs2char == WORD_HORIZONTAL_TAB) { + //to remove pagination from TOC (from doc saved in OpenOffice) + myReadFieldState = DONT_READ_FIELD_TEXT; + return; + } + std::string utf8String; + ZLUnicodeUtil::Ucs2String ucs2String; + ucs2String.push_back(ucs2char); + ZLUnicodeUtil::ucs2ToUtf8(utf8String, ucs2String); + if (!myModelReader.paragraphIsOpen()) { + myModelReader.beginParagraph(); + } + myModelReader.addData(utf8String); +} + +void DocBookReader::handleHardLinebreak() { + if (myModelReader.paragraphIsOpen()) { + myModelReader.endParagraph(); + } + myModelReader.beginParagraph(); + if (!myCurrentStyleEntry.isNull()) { + 
myModelReader.addStyleEntry(*myCurrentStyleEntry); + } + for (std::size_t i = 0; i < myKindStack.size(); ++i) { + myModelReader.addControl(myKindStack.at(i), true); + } +} + +void DocBookReader::handleParagraphEnd() { + if (myModelReader.paragraphIsOpen()) { + myModelReader.endParagraph(); + } + myModelReader.beginParagraph(); + myCurrentStyleEntry = 0; +} + +void DocBookReader::handlePageBreak() { + if (myModelReader.paragraphIsOpen()) { + myModelReader.endParagraph(); + } + myCurrentStyleEntry = 0; + myModelReader.insertEndOfSectionParagraph(); + myModelReader.beginParagraph(); +} + +void DocBookReader::handleTableSeparator() { + handleChar(SPACE); + handleChar(VERTICAL_LINE); + handleChar(SPACE); +} + +void DocBookReader::handleTableEndRow() { + handleParagraphEnd(); +} + +void DocBookReader::handleFootNoteMark() { + //TODO implement +} + +void DocBookReader::handleStartField() { + if (myReadState == READ_FIELD) { //for nested fields + handleEndField(); + } + myReadState = READ_FIELD; + myReadFieldState = READ_FIELD_INFO; + myHyperlinkTypeState = NO_HYPERLINK; +} + +void DocBookReader::handleSeparatorField() { + static const std::string HYPERLINK = "HYPERLINK"; + static const std::string SEQUENCE = "SEQ"; +// static const std::string PAGE = "PAGE"; +// static const std::string PAGEREF = "PAGEREF"; +// static const std::string SHAPE = "SHAPE"; + static const std::string SPACE_DELIMETER = " "; + static const std::string LOCAL_LINK = "\\l"; + static const std::string QUOTE = "\""; + myReadFieldState = READ_FIELD_TEXT; + myHyperlinkTypeState = NO_HYPERLINK; + ZLUnicodeUtil::Ucs2String buffer = myFieldInfoBuffer; + myFieldInfoBuffer.clear(); + std::string utf8String; + ZLUnicodeUtil::ucs2ToUtf8(utf8String, buffer); + ZLUnicodeUtil::utf8Trim(utf8String); + if (utf8String.empty()) { + return; + } + std::vector<std::string> result = ZLStringUtil::split(utf8String, SPACE_DELIMETER); + //TODO split function can returns empty string, maybe fix it + std::vector<std::string> 
splitted; + for (std::size_t i = 0; i < result.size(); ++i) { + if (!result.at(i).empty()) { + splitted.push_back(result.at(i)); + } + } + + if (!splitted.empty() && splitted.at(0) == SEQUENCE) { + myReadFieldState = READ_FIELD_TEXT; + myHyperlinkTypeState = NO_HYPERLINK; + return; + } + + if (splitted.size() < 2 || splitted.at(0) != HYPERLINK) { + myReadFieldState = DONT_READ_FIELD_TEXT; + //to remove pagination from TOC and not hyperlink fields + return; + } + + if (splitted.at(1) == LOCAL_LINK) { + std::string link = parseLink(buffer); + if (!link.empty()) { + myModelReader.addHyperlinkControl(INTERNAL_HYPERLINK, link); + myHyperlinkTypeState = INT_HYPERLINK_INSERTED; + } + } else { + std::string link = parseLink(buffer, true); + if (!link.empty()) { + myModelReader.addHyperlinkControl(EXTERNAL_HYPERLINK, link); + myHyperlinkTypeState = EXT_HYPERLINK_INSERTED; + } + } +} + +void DocBookReader::handleEndField() { + myFieldInfoBuffer.clear(); + if (myReadState == READ_TEXT) { + return; + } + if (myHyperlinkTypeState == EXT_HYPERLINK_INSERTED) { + myModelReader.addControl(EXTERNAL_HYPERLINK, false); + } else if (myHyperlinkTypeState == INT_HYPERLINK_INSERTED) { + myModelReader.addControl(INTERNAL_HYPERLINK, false); + } + myReadState = READ_TEXT; + myHyperlinkTypeState = NO_HYPERLINK; + +} + +void DocBookReader::handleImage(const ZLFileImage::Blocks &blocks) { + std::string number; + ZLStringUtil::appendNumber(number, myPictureCounter++); + myModelReader.addImageReference(number); + ZLFile file(myModelReader.model().book()->file().path(), ZLMimeType::IMAGE_AUTO); + myModelReader.addImage(number, new ZLFileImage(file, blocks, ZLFileImage::ENCODING_NONE)); +} + +void DocBookReader::handleOtherControlChar(ZLUnicodeUtil::Ucs2Char ucs2char) { + if (ucs2char == WORD_MINUS) { + handleChar(MINUS); + } else if (ucs2char == WORD_SOFT_HYPHEN) { + //skip + } else if (ucs2char == WORD_HORIZONTAL_TAB) { + handleChar(ucs2char); + } else { +// myTextBuffer.clear(); + } +} + +void 
DocBookReader::handleFontStyle(unsigned int fontStyle) { + if (myReadState == READ_FIELD && myReadFieldState == READ_FIELD_TEXT && myHyperlinkTypeState != NO_HYPERLINK) { + //to fix bug with hyperlink, that's only bold and doesn't looks like hyperlink + return; + } + while (!myKindStack.empty()) { + myModelReader.addControl(myKindStack.back(), false); + myKindStack.pop_back(); + } + if (fontStyle & OleMainStream::CharInfo::FONT_BOLD) { + myKindStack.push_back(BOLD); + } + if (fontStyle & OleMainStream::CharInfo::FONT_ITALIC) { + myKindStack.push_back(ITALIC); + } + for (std::size_t i = 0; i < myKindStack.size(); ++i) { + myModelReader.addControl(myKindStack.at(i), true); + } +} + +void DocBookReader::handleParagraphStyle(const OleMainStream::Style &styleInfo) { + if (styleInfo.HasPageBreakBefore) { + handlePageBreak(); + } + shared_ptr<ZLTextStyleEntry> entry = new ZLTextStyleEntry(ZLTextStyleEntry::STYLE_OTHER_ENTRY); + + switch (styleInfo.Alignment) { + default: // in that case, use default alignment type + break; + case OleMainStream::Style::ALIGNMENT_LEFT: + entry->setAlignmentType(ALIGN_LEFT); + break; + case OleMainStream::Style::ALIGNMENT_RIGHT: + entry->setAlignmentType(ALIGN_RIGHT); + break; + case OleMainStream::Style::ALIGNMENT_CENTER: + entry->setAlignmentType(ALIGN_CENTER); + break; + case OleMainStream::Style::ALIGNMENT_JUSTIFY: + entry->setAlignmentType(ALIGN_JUSTIFY); + break; + } + + //TODO in case, where style is heading, but size is small it works wrong + const ZLTextStyleEntry::SizeUnit unit = ZLTextStyleEntry::SIZE_UNIT_PERCENT; + switch (styleInfo.StyleIdCurrent) { + default: + break; + case OleMainStream::Style::STYLE_H1: + entry->setLength(ZLTextStyleEntry::LENGTH_FONT_SIZE, 140, unit); + break; + case OleMainStream::Style::STYLE_H2: + entry->setLength(ZLTextStyleEntry::LENGTH_FONT_SIZE, 120, unit); + break; + case OleMainStream::Style::STYLE_H3: + entry->setLength(ZLTextStyleEntry::LENGTH_FONT_SIZE, 110, unit); + break; + } + 
myCurrentStyleEntry = entry; + myModelReader.addStyleEntry(*myCurrentStyleEntry); + + // we should have the same font style, as for the previous paragraph, + // if it has the same StyleIdCurrent + if (myCurrentStyleInfo.StyleIdCurrent != OleMainStream::Style::STYLE_INVALID && + myCurrentStyleInfo.StyleIdCurrent == styleInfo.StyleIdCurrent) { + for (std::size_t i = 0; i < myKindStack.size(); ++i) { + myModelReader.addControl(myKindStack.at(i), true); + } + } else { + myKindStack.clear(); + // fill by the fontstyle, that was got from Stylesheet + handleFontStyle(styleInfo.CurrentCharInfo.FontStyle); + } + myCurrentStyleInfo = styleInfo; +} + +void DocBookReader::handleBookmark(const std::string &name) { + myModelReader.addHyperlinkLabel(name); +} + +std::string DocBookReader::parseLink(ZLUnicodeUtil::Ucs2String s, bool urlencode) { + //TODO add support for HYPERLINK like that: + // [0x13] HYPERLINK "http://site.ru/some text" \t "_blank" [0x14] text [0x15] + //Current implementation search for last QUOTE, so, it reads \t and _blank as part of link + //Last quote searching is need to handle link like that: + // [0x13] HYPERLINK "http://yandex.ru/yandsearch?text='some text' и "some text2"" [0x14] link text [0x15] + + static const ZLUnicodeUtil::Ucs2Char QUOTE = 0x22; + std::size_t i, first = 0; + //TODO maybe functions findFirstOf and findLastOf should be in ZLUnicodeUtil class + for (i = 0; i < s.size(); ++i) { + if (s.at(i) == QUOTE) { + first = i; + break; + } + } + if (i == s.size()) { + return std::string(); + } + std::size_t j, last = 0; + for (j = s.size(); j > 0 ; --j) { + if (s.at(j - 1) == QUOTE) { + last = j - 1; + break; + } + } + if (j == 0 || last == first) { + return std::string(); + } + + ZLUnicodeUtil::Ucs2String link; + for (std::size_t k = first + 1; k < last; ++k) { + ZLUnicodeUtil::Ucs2Char ch = s.at(k); + if (urlencode && ZLUnicodeUtil::isSpace(ch)) { + //TODO maybe implement function for encoding all signs in url, not only spaces and quotes + 
//TODO maybe add backslash support + link.push_back('%'); + link.push_back('2'); + link.push_back('0'); + } else if (urlencode && ch == QUOTE) { + link.push_back('%'); + link.push_back('2'); + link.push_back('2'); + } else { + link.push_back(ch); + } + } + std::string utf8String; + ZLUnicodeUtil::ucs2ToUtf8(utf8String, link); + return utf8String; +} + +void DocBookReader::footnotesStartHandler() { + handlePageBreak(); +} + +void DocBookReader::ansiDataHandler(const char *buffer, std::size_t len) { + if (myConverter.isNull()) { + // lazy converter initialization + ZLEncodingCollection &collection = ZLEncodingCollection::Instance(); + ZLEncodingConverterInfoPtr info = collection.info(myEncoding); + myConverter = info.isNull() ? collection.defaultConverter() : info->createConverter(); + } + std::string utf8String; + myConverter->convert(utf8String, buffer, buffer + len); + ZLUnicodeUtil::utf8ToUcs2(myBuffer, utf8String); +} + +void DocBookReader::ucs2SymbolHandler(ZLUnicodeUtil::Ucs2Char symbol) { + myBuffer.push_back(symbol); +} diff --git a/reader/src/formats/doc/DocBookReader.h b/reader/src/formats/doc/DocBookReader.h new file mode 100644 index 0000000..d80fb8e --- /dev/null +++ b/reader/src/formats/doc/DocBookReader.h @@ -0,0 +1,103 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
// Reads a Microsoft Word (.doc) document into a BookModel.
//
// The class derives from OleStreamParser and implements its handler
// callbacks; the handlers translate text, control characters, fields,
// images and style changes into calls on the BookReader held in
// myModelReader.
class DocBookReader : public OleStreamParser {

public:
	// `encoding` names the encoding used for 8-bit text; the converter for
	// it is created lazily by ansiDataHandler().
	DocBookReader(BookModel &model, const std::string &encoding);
	~DocBookReader();
	// Opens the book's file and reads the whole document; returns false
	// when the stream cannot be opened or the document cannot be read.
	bool readBook();

private:
	// Raw data callbacks.
	void ansiDataHandler(const char *buffer, std::size_t len);
	void ucs2SymbolHandler(ZLUnicodeUtil::Ucs2Char symbol);
	void footnotesStartHandler();

	// Text and control character callbacks.
	void handleChar(ZLUnicodeUtil::Ucs2Char ucs2char);
	void handleHardLinebreak();
	void handleParagraphEnd();
	void handlePageBreak();
	void handleTableSeparator();
	void handleTableEndRow();
	void handleFootNoteMark();
	void handleStartField();
	void handleSeparatorField();
	void handleEndField();
	void handleImage(const ZLFileImage::Blocks &blocks);
	void handleOtherControlChar(ZLUnicodeUtil::Ucs2Char ucs2char);

	//formatting:
	void handleFontStyle(unsigned int fontStyle);
	void handleParagraphStyle(const OleMainStream::Style &styleInfo);
	void handleBookmark(const std::string &name);

private:
	// Extracts the text between the first and the last double quote of `s`;
	// with `urlencode` set, spaces and inner quotes are percent-encoded.
	// Returns an empty string when no such quoted part exists.
	static std::string parseLink(ZLUnicodeUtil::Ucs2String s, bool urlencode = false);

private:
	BookReader myModelReader;

	// Field instruction characters collected while
	// myReadFieldState == READ_FIELD_INFO.
	ZLUnicodeUtil::Ucs2String myFieldInfoBuffer;

	// Whether the parser is currently inside a field. Set to READ_TEXT by
	// the constructor.
	enum {
		READ_FIELD,
		READ_TEXT
	} myReadState;

	// Sub-state used while inside a field. Not set by the constructor;
	// handleStartField() assigns it when a field starts.
	enum {
		READ_FIELD_TEXT,
		DONT_READ_FIELD_TEXT,
		READ_FIELD_INFO
	} myReadFieldState;

	// Which hyperlink control (if any) was opened for the current field and
	// must be closed by handleEndField(). Not set by the constructor;
	// handleStartField() assigns it when a field starts.
	//maybe it should be flag?
	enum {
		NO_HYPERLINK,
		EXT_HYPERLINK_INSERTED,
		INT_HYPERLINK_INSERTED
	} myHyperlinkTypeState;

	//formatting
	// Text kinds (BOLD, ITALIC) currently switched on, in opening order.
	std::vector<FBTextKind> myKindStack;
	// Style entry of the current paragraph; reset to null at paragraph end
	// and at page breaks.
	shared_ptr<ZLTextStyleEntry> myCurrentStyleEntry;
	// Paragraph style passed to the previous handleParagraphStyle() call.
	OleMainStream::Style myCurrentStyleInfo;
	// Number of images added so far; also used to build image ids.
	unsigned int myPictureCounter;

	const std::string myEncoding;
	shared_ptr<ZLEncodingConverter> myConverter;
};
+ */ + +#include <ZLLogger.h> + +#include "OleUtil.h" +#include "OleStream.h" +#include "OleMainStream.h" + +#include "DocFloatImageReader.h" + +DocFloatImageReader::DocFloatImageReader(unsigned int off, unsigned int len, shared_ptr<OleStream> tableStream, shared_ptr<OleStream> mainStream) : + myTableStream(tableStream), + myMainStream(mainStream), + myOffset(off), + myLength(len) { +} + +void DocFloatImageReader::readAll() { + //OfficeArtContent structure is described at p.405-406 [MS-DOC] + if (!myTableStream->seek(myOffset, true)) { + ZLLogger::Instance().println("DocPlugin", "problems with reading float images"); + return; + } + + unsigned int count = 0; + + RecordHeader header; + while (count < myLength) { + count += readRecordHeader(header, myTableStream); + switch (header.type) { + case 0xF000: + count += readDggContainer(myItem, header.length, myTableStream, myMainStream); + break; + case 0xF002: + count += readDgContainer(myItem, header.length, myTableStream); + break; + default: + return; + break; + } + } +} + +ZLFileImage::Blocks DocFloatImageReader::getBlocksForShapeId(unsigned int shapeId) const { + FSPContainer container; + bool found = false; + for (std::size_t i = 0; !found && i < myItem.FSPs.size(); ++i) { + if (myItem.FSPs.at(i).fsp.shapeId == shapeId) { + found = true; + container = myItem.FSPs.at(i); + } + } + + if (!found || container.fopte.empty()) { + return ZLFileImage::Blocks(); + } + + for (std::size_t i = 0; i < container.fopte.size(); ++i) { + const FOPTE &fopte = container.fopte.at(i); + if (fopte.pId == 0x0104 && !fopte.isComplex) { //0x0104 specifies the BLIP, see p.420 [MS-ODRAW] + if (fopte.value <= myItem.blips.size() && fopte.value > 0) { + Blip blip = myItem.blips.at(fopte.value - 1); + return blip.blocks; + } + } + } + return ZLFileImage::Blocks(); +} + +unsigned int DocFloatImageReader::readRecordHeader(RecordHeader &header, shared_ptr<OleStream> stream) { + //OfficeArtRecordHeader structure is described at p.26 [MS-ODRAW] + 
char buffer[8]; + stream->read(buffer, 8); + unsigned int temp = OleUtil::getU2Bytes(buffer, 0); + header.version = temp & 0x000F; + header.instance = temp >> 4; + header.type = OleUtil::getU2Bytes(buffer, 2); + header.length = OleUtil::getU4Bytes(buffer, 4); + return 8; +} + +unsigned int DocFloatImageReader::readDggContainer(OfficeArtContent &item, unsigned int length, shared_ptr<OleStream> stream, shared_ptr<OleStream> mainStream) { + //OfficeArtDggContainer structure is described at p.50 [MS-ODRAW] + RecordHeader header; + unsigned int count = 0; + + while (count < length) { + count += readRecordHeader(header, stream); + switch (header.type) { + case 0xF001: + count += readBStoreContainer(item, header.length, stream, mainStream); + break; + default: + count += skipRecord(header, stream); + break; + } + } + + stream->seek(1, false); //skipping dgglbl (see p.406 [MS-DOC]) + ++count; + + return count; +} + +unsigned int DocFloatImageReader::readBStoreContainer(OfficeArtContent &item, unsigned int length, shared_ptr<OleStream> stream, shared_ptr<OleStream> mainStream) { + //OfficeArtBStoreContainer structure is described at p.58 [MS-ODRAW] + RecordHeader header; + unsigned int count = 0; + while (count < length) { + count += readRecordHeader(header, stream); + switch (header.type) { + case 0xF007: + { + Blip blip; + count += readBStoreContainerFileBlock(blip, stream, mainStream); + item.blips.push_back(blip); + } + break; + default: + count += skipRecord(header, stream); + break; + } + } + return count; +} + +unsigned int DocFloatImageReader::skipRecord(const RecordHeader &header, shared_ptr<OleStream> stream) { + stream->seek(header.length, false); + return header.length; +} + +unsigned int DocFloatImageReader::readBStoreContainerFileBlock(Blip &blip, shared_ptr<OleStream> stream, shared_ptr<OleStream> mainStream) { + //OfficeArtBStoreContainerFileBlock structure is described at p.59 [MS-ODRAW] + unsigned int count = readFBSE(blip.storeEntry, stream); + if 
(blip.storeEntry.offsetInDelay != (unsigned int)-1) { + if (mainStream->seek(blip.storeEntry.offsetInDelay, true)) { //see p.70 [MS-ODRAW] + //TODO maybe we should stop reading float images here + ZLLogger::Instance().println("DocPlugin", "DocFloatImageReader: problems with seeking for offset"); + return count; + } + } + RecordHeader header; + unsigned int count2 = readRecordHeader(header, mainStream); + switch (header.type) { + case OleMainStream::IMAGE_WMF: + case OleMainStream::IMAGE_EMF: + case OleMainStream::IMAGE_PICT: + count2 += skipRecord(header, mainStream); + break; + case OleMainStream::IMAGE_JPEG: + case OleMainStream::IMAGE_JPEG2: + case OleMainStream::IMAGE_PNG: + case OleMainStream::IMAGE_DIB: + case OleMainStream::IMAGE_TIFF: + count2 += readBlip(blip, header, mainStream); + break; + } + blip.type = header.type; + return count; +} + +unsigned int DocFloatImageReader::readBlip(Blip &blip, const RecordHeader &header, shared_ptr<OleStream> stream) { + //OfficeArtBlip structure is described at p.60-66 [MS-ODRAW] + stream->seek(16, false); //skipping rgbUid1 + unsigned int count = 16; + + bool addField = false; + switch (header.type) { + case OleMainStream::IMAGE_PNG: + if (header.instance == 0x6E1) { + addField = true; + } + break; + case OleMainStream::IMAGE_JPEG: + case OleMainStream::IMAGE_JPEG2: + if (header.instance == 0x46B || header.instance == 0x6E3) { + addField = true; + } + break; + case OleMainStream::IMAGE_DIB: + if (header.instance == 0x7A9) { + addField = true; + } + case OleMainStream::IMAGE_TIFF: + if (header.instance == 0x6E5) { + addField = true; + } + break; + } + + if (addField) { + stream->seek(16, false); //skipping rgbUid2 + count += 16; + } + stream->seek(1, false); //skipping tag + count += 1; + + blip.blocks = stream->getBlockPieceInfoList(stream->offset(), header.length - count); + count += header.length; + return count; +} + +unsigned int DocFloatImageReader::readFBSE(BlipStoreEntry &fbse, shared_ptr<OleStream> stream) { + 
//OfficeArtFBSE structure is described at p.68 [MS-ODRAW] + stream->seek(2, false); //skipping btWin32 and btMacOS + stream->seek(16, false); //skipping rgbUid + stream->seek(2, false); //skipping tag + fbse.size = read4Bytes(stream); + fbse.referenceCount = read4Bytes(stream); + fbse.offsetInDelay = read4Bytes(stream); + stream->seek(1, false); //skipping unused value + unsigned int lengthName = read1Byte(stream); //if it should be multiplied on 2? + stream->seek(2, false); // skipping unused values + if (lengthName > 0) { + stream->seek(lengthName, false); //skipping nameData + } + return 36 + lengthName; +} + +unsigned int DocFloatImageReader::readDgContainer(OfficeArtContent &item, unsigned int length, shared_ptr<OleStream> stream) { + //OfficeArtDgContainer structure is described at p.52 [MS-ODRAW] + unsigned int count = 0; + + RecordHeader header; + while (count < length) { + count += readRecordHeader(header, stream); + switch (header.type) { + case 0xF008: //skip OfficeArtFDG record, p. 
82 [MS-ODRAW] + stream->seek(8, false); + count += 8; + break; + case 0xF003: + count += readSpgrContainer(item, header.length, stream); + break; + case 0xF004: + { + FSPContainer fspContainer; + count += readSpContainter(fspContainer, header.length, stream); + item.FSPs.push_back(fspContainer); + } + break; + default: + count += skipRecord(header, stream); + break; + } + } + return count; +} + +unsigned int DocFloatImageReader::readSpgrContainer(OfficeArtContent &item, unsigned int length, shared_ptr<OleStream> stream) { + //OfficeArtSpgrContainer structure is described at p.56 [MS-ODRAW] + unsigned count = 0; + RecordHeader header; + while (count < length) { + count += readRecordHeader(header, stream); + switch (header.type) { + case 0xF003: + count += readSpgrContainer(item, header.length, stream); + break; + case 0xF004: + { + FSPContainer fspContainer; + count += readSpContainter(fspContainer, header.length, stream); + item.FSPs.push_back(fspContainer); + } + break; + default: + count += skipRecord(header, stream); + break; + } + } + return count; +} + +unsigned int DocFloatImageReader::readSpContainter(FSPContainer &item, unsigned int length, shared_ptr<OleStream> stream) { + //OfficeArtSpContainter structure is described at p.53-55 [MS-ODRAW] + RecordHeader header; + unsigned int count = 0; + while (count < length) { + count += readRecordHeader(header, stream); + switch (header.type) { + case 0xF009: //skip OfficeArtFSPGR record, p.74 [MS-ODRAW] + stream->seek(16, false); + count += 16; + break; + case 0xF00A: + count += readFSP(item.fsp, stream); + break; + case 0xF00B: + count += readArrayFOPTE(item.fopte, header.length, stream); + break; + case 0xF00E: //OfficeArtAnchor + case 0xF00F: //OfficeArtChildAnchor, p.75 [MS-ODRAW] + case 0xF010: //OfficeArtClientAnchor + stream->seek(4, false); + count += 4; + break; + case 0xF00C: + case 0xF11F: + case 0xF11D: + break; + default: + count += skipRecord(header, stream); + break; + } + } + return count; +} + 
+unsigned int DocFloatImageReader::readFSP(FSP &fsp, shared_ptr<OleStream> stream) { + //OfficeArtFSP structure is described at p.76 [MS-ODRAW] + fsp.shapeId = read4Bytes(stream); + stream->seek(4, false); + return 8; +} + +unsigned int DocFloatImageReader::readArrayFOPTE(std::vector<FOPTE> &fopteArray,unsigned int length, shared_ptr<OleStream> stream) { + //OfficeArtRGFOPTE structure is described at p.98 [MS-ODRAW] + unsigned int count = 0; + while (count < length) { + FOPTE fopte; + count += readFOPTE(fopte, stream); + fopteArray.push_back(fopte); + } + for (std::size_t i = 0; i < fopteArray.size(); ++i) { + if (fopteArray.at(i).isComplex) { + stream->seek(fopteArray.at(i).value, false); + count += fopteArray.at(i).value; + } + } + return count; +} + +unsigned int DocFloatImageReader::readFOPTE(FOPTE &fopte, shared_ptr<OleStream> stream) { + //OfficeArtFOPTE structure is described at p.32 [MS-ODRAW] + unsigned int dtemp; + dtemp = read2Bytes(stream); + fopte.pId = (dtemp & 0x3fff); + fopte.isBlipId = ((dtemp & 0x4000) >> 14) == 0x1; + fopte.isComplex = ((dtemp & 0x8000) >> 15) == 0x1; + fopte.value = read4Bytes(stream); + return 6; +} + +unsigned int DocFloatImageReader::read1Byte(shared_ptr<OleStream> stream) { + char b[1]; + if (stream->read(b, 1) != 1) { + return 0; + } + return OleUtil::getU1Byte(b, 0); +} + +unsigned int DocFloatImageReader::read2Bytes(shared_ptr<OleStream> stream) { + char b[2]; + if (stream->read(b, 2) != 2) { + return 0; + } + return OleUtil::getU2Bytes(b, 0); +} + +unsigned int DocFloatImageReader::read4Bytes(shared_ptr<OleStream> stream) { + char b[4]; + if (stream->read(b, 4) != 4) { + return 0; + } + return OleUtil::getU4Bytes(b, 0); +} diff --git a/reader/src/formats/doc/DocFloatImageReader.h b/reader/src/formats/doc/DocFloatImageReader.h new file mode 100644 index 0000000..d2d6c2e --- /dev/null +++ b/reader/src/formats/doc/DocFloatImageReader.h @@ -0,0 +1,107 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + 
* This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA.
 */

#ifndef __DOCFLOATIMAGEREADER_H__
#define __DOCFLOATIMAGEREADER_H__

#include <ZLFileImage.h>

// Reader for the OfficeArt (floating image) records of a Word document.
// The parsed blips and shape containers are collected in an OfficeArtContent;
// getBlocksForShapeId() returns the image blocks for a given shape id.
class DocFloatImageReader {

public:
	struct BlipStoreEntry { // see p.68 [MS-ODRAW]
		unsigned int size; // size of blip in stream
		unsigned int referenceCount; // (cRef) reference count for the blip
		unsigned int offsetInDelay; // foDelay, file offset in the delay stream
	};

	struct Blip { //see p.59, p63-66 [MS-ODRAW]
		BlipStoreEntry storeEntry;
		unsigned int type; // record type of the blip (set from the record header)
		ZLFileImage::Blocks blocks; // stream pieces that hold the image data
	};

	struct FSP { //see p.76-77 [MS-ODRAW]
		unsigned int shapeId; //spid
	};

	struct FOPTE { //see p.98 and p.32 [MS-ODRAW]
		unsigned int pId; //pid
		bool isBlipId; //fBid
		bool isComplex; //fComplex
		unsigned int value; //op
	};

	struct FSPContainer { //see p.53-55 [MS-ODRAW]
		FSP fsp;
		std::vector<FOPTE> fopte;
	};

	struct OfficeArtContent { //see p.405-406 [MS-DOC]
		std::vector<Blip> blips; //retrieved from OfficeArtDggContainer
		std::vector<FSPContainer> FSPs; //retrieved from OfficeArtDgContainer
	};

	struct RecordHeader { //see p.26 [MS-ODRAW]
		unsigned int version;
		unsigned int instance;
		unsigned int type;
		unsigned int length;
	};

public:
	DocFloatImageReader(unsigned int off, unsigned int len, shared_ptr<OleStream> tableStream, shared_ptr<OleStream> mainStream);

public:
	void readAll();

	ZLFileImage::Blocks getBlocksForShapeId(unsigned int shapeId) const;

private:
	// Record-level parsers. Each returns an unsigned byte count that the calling
	// parser adds to its own running count.
	static unsigned int readRecordHeader(RecordHeader &header, shared_ptr<OleStream> stream);
	static unsigned int readDggContainer(OfficeArtContent &item, unsigned int length, shared_ptr<OleStream> stream, shared_ptr<OleStream> mainStream);

	static unsigned int readBStoreContainer(OfficeArtContent &item, unsigned int length, shared_ptr<OleStream> stream, shared_ptr<OleStream> mainStream);
	static unsigned int readBStoreContainerFileBlock(Blip &blip, shared_ptr<OleStream> stream, shared_ptr<OleStream> mainStream);
	static unsigned int readBlip(Blip &blip, const RecordHeader &header, shared_ptr<OleStream> stream);
	static unsigned int readFBSE(BlipStoreEntry &fbse, shared_ptr<OleStream> stream);

	static unsigned int readFOPTE(FOPTE &fopte, shared_ptr<OleStream> stream);
	static unsigned int readArrayFOPTE(std::vector<FOPTE> &fopte, unsigned int length, shared_ptr<OleStream> stream);
	static unsigned int readFSP(FSP &fsp, shared_ptr<OleStream> stream);
	static unsigned int readSpContainter(FSPContainer &item, unsigned int length, shared_ptr<OleStream> stream);
	static unsigned int readSpgrContainer(OfficeArtContent &item, unsigned int length, shared_ptr<OleStream> stream);
	static unsigned int readDgContainer(OfficeArtContent &item, unsigned int length, shared_ptr<OleStream> stream);

	static unsigned int skipRecord(const RecordHeader &header, shared_ptr<OleStream> stream);

	// Little helpers that read a 1/2/4-byte unsigned value; they return 0 on a short read.
	static unsigned int read1Byte(shared_ptr<OleStream> stream);
	static unsigned int read2Bytes(shared_ptr<OleStream> stream);
	static unsigned int read4Bytes(shared_ptr<OleStream> stream);

private:
	shared_ptr<OleStream> myTableStream;
	shared_ptr<OleStream> myMainStream;
	unsigned int myOffset;
	unsigned int myLength;

	OfficeArtContent myItem;
};

#endif /* 
__DOCFLOATIMAGEREADER_H__ */ diff --git a/reader/src/formats/doc/DocInlineImageReader.cpp b/reader/src/formats/doc/DocInlineImageReader.cpp new file mode 100644 index 0000000..69ce74f --- /dev/null +++ b/reader/src/formats/doc/DocInlineImageReader.cpp @@ -0,0 +1,148 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#include "OleUtil.h" +#include "OleMainStream.h" + +#include "DocInlineImageReader.h" + +DocInlineImageReader::DocInlineImageReader(shared_ptr<OleStream> dataStream) : + myDataStream(dataStream) { +} + +ZLFileImage::Blocks DocInlineImageReader::getImagePieceInfo(unsigned int dataPos) { + if (myDataStream.isNull()) { + return ZLFileImage::Blocks(); + } + if (!myDataStream->seek(dataPos, true)) { + return ZLFileImage::Blocks(); + } + + //reading PICF structure (see p. 
421 [MS-DOC]) + unsigned int picfHeaderSize = 4 + 2 + 8; //record length, headerLength and storage format + char headerBuffer[picfHeaderSize]; + if (myDataStream->read(headerBuffer, picfHeaderSize) != picfHeaderSize) { + return ZLFileImage::Blocks(); + } + unsigned int length = OleUtil::getU4Bytes(headerBuffer, 0); + unsigned int headerLength = OleUtil::getU2Bytes(headerBuffer, 4); + unsigned int formatType = OleUtil::getU2Bytes(headerBuffer, 6); + + if (formatType != 0x0064) { //external link to some file; see p.394 [MS-DOC] + //TODO implement + return ZLFileImage::Blocks(); + } + if (headerLength >= length) { + return ZLFileImage::Blocks(); + } + + //reading OfficeArtInlineSpContainer structure; see p.421 [MS-DOC] and p.56 [MS-ODRAW] + if (!myDataStream->seek(headerLength - picfHeaderSize, false)) { //skip header + return ZLFileImage::Blocks(); + } + + char buffer[8]; //for OfficeArtRecordHeader structure; see p.69 [MS-ODRAW] + bool found = false; + unsigned int curOffset = 0; + for (curOffset = headerLength; !found && curOffset + 8 <= length; curOffset += 8) { + if (myDataStream->read(buffer, 8) != 8) { + return ZLFileImage::Blocks(); + } + unsigned int recordInstance = OleUtil::getU2Bytes(buffer, 0) >> 4; + unsigned int recordType = OleUtil::getU2Bytes(buffer, 2); + unsigned int recordLen = OleUtil::getU4Bytes(buffer, 4); + + switch (recordType) { + case 0xF000: case 0xF001: case 0xF002: case 0xF003: case 0xF004: case 0xF005: + break; + case 0xF007: + { + myDataStream->seek(33, false); + char tmpBuf[1]; + myDataStream->read(tmpBuf, 1); + unsigned int nameLength = OleUtil::getU1Byte(tmpBuf, 0); + myDataStream->seek(nameLength * 2 + 2, false); + curOffset += 33 + 1 + nameLength * 2 + 2; + } + break; + case 0xF008: + myDataStream->seek(8, false); + curOffset += 8; + break; + case 0xF009: + myDataStream->seek(16, false); + curOffset += 16; + break; + case 0xF006: case 0xF00A: case 0xF00B: case 0xF00D: case 0xF00E: case 0xF00F: case 0xF010: case 0xF011: case 0xF122: 
+ myDataStream->seek(recordLen, false); + curOffset += recordLen; + break; + case OleMainStream::IMAGE_EMF: + case OleMainStream::IMAGE_WMF: + case OleMainStream::IMAGE_PICT: + //TODO implement + return ZLFileImage::Blocks(); + case OleMainStream::IMAGE_JPEG: + case OleMainStream::IMAGE_JPEG2: + myDataStream->seek(17, false); + curOffset += 17; + if (recordInstance == 0x46B || recordInstance == 0x6E3) { + myDataStream->seek(16, false); + curOffset += 16; + } + found = true; + break; + case OleMainStream::IMAGE_PNG: + myDataStream->seek(17, false); + curOffset += 17; + if (recordInstance == 0x6E1) { + myDataStream->seek(16, false); + curOffset += 16; + } + found = true; + break; + case OleMainStream::IMAGE_DIB: // DIB = BMP without 14-bytes header + myDataStream->seek(17, false); + curOffset += 17; + if (recordInstance == 0x7A9) { + myDataStream->seek(16, false); + curOffset += 16; + } + found = true; + break; + case OleMainStream::IMAGE_TIFF: + myDataStream->seek(17, false); + curOffset += 17; + if (recordInstance == 0x6E5) { + myDataStream->seek(16, false); + curOffset += 16; + } + found = true; + break; + case 0xF00C: + default: + return ZLFileImage::Blocks(); + } + } + + if (!found) { + return ZLFileImage::Blocks(); + } + return myDataStream->getBlockPieceInfoList(dataPos + curOffset, length - curOffset); +} diff --git a/reader/src/formats/doc/DocInlineImageReader.h b/reader/src/formats/doc/DocInlineImageReader.h new file mode 100644 index 0000000..9dab9ae --- /dev/null +++ b/reader/src/formats/doc/DocInlineImageReader.h @@ -0,0 +1,37 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA.
 */

#ifndef __DOCINLINEIMAGEREADER_H__
#define __DOCINLINEIMAGEREADER_H__

#include <vector>

#include "OleStream.h"

// Finds the image data of inline pictures stored in the data stream of a Word document.
class DocInlineImageReader {

public:
	// dataStream may be a null pointer; getImagePieceInfo() then returns an empty list.
	DocInlineImageReader(shared_ptr<OleStream> dataStream);
	// Returns the stream pieces holding the image that starts at dataPos,
	// or an empty list when the image cannot be located.
	ZLFileImage::Blocks getImagePieceInfo(unsigned int dataPos);

private:
	shared_ptr<OleStream> myDataStream;
};

#endif /* __DOCINLINEIMAGEREADER_H__ */
diff --git a/reader/src/formats/doc/DocMetaInfoReader.cpp b/reader/src/formats/doc/DocMetaInfoReader.cpp new file mode 100644 index 0000000..37b39c2 --- /dev/null +++ b/reader/src/formats/doc/DocMetaInfoReader.cpp @@ -0,0 +1,38 @@
/*
 * Copyright (C) 2004-2012 Geometer Plus <[email protected]>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA.
 */

#include <ZLInputStream.h>

#include "../../library/Book.h"

#include "DocMetaInfoReader.h"

// Binds the reader to `book` and clears the book's authors, title, language and tags.
DocMetaInfoReader::DocMetaInfoReader(Book &book) : myBook(book) {
	myBook.removeAllAuthors();
	myBook.setTitle(std::string());
	myBook.setLanguage(std::string());
	myBook.removeAllTags();
}

// Fills in the meta information of the book: authors and tags are cleared and the
// title is taken from the name of the book file. Nothing is read from the document
// contents here. Always returns true.
bool DocMetaInfoReader::readMetaInfo() {
	myBook.removeAllAuthors();
	myBook.setTitle(myBook.file().name(true));
	myBook.removeAllTags();
	return true;
}
diff --git a/reader/src/formats/doc/DocMetaInfoReader.h b/reader/src/formats/doc/DocMetaInfoReader.h new file mode 100644 index 0000000..db26d29 --- /dev/null +++ b/reader/src/formats/doc/DocMetaInfoReader.h @@ -0,0 +1,46 @@
/*
 * Copyright (C) 2004-2012 Geometer Plus <[email protected]>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA.
 */

#ifndef __DOCMETAINFOREADER_H__
#define __DOCMETAINFOREADER_H__

#include <string>

class Book;

// Sets the meta information (authors, title, language, tags) of a Book
// that was loaded from a Word document.
class DocMetaInfoReader {

public:
	DocMetaInfoReader(Book &book);
	~DocMetaInfoReader();
	bool readMetaInfo();

	/*
	void startElementHandler(int tag, const char **attributes);
	void endElementHandler(int tag);
	void characterDataHandler(const char *text, std::size_t len);
	*/

private:
	// The book being filled in; held by reference, so it must outlive this reader.
	Book &myBook;
};

inline DocMetaInfoReader::~DocMetaInfoReader() {}

#endif /* __DOCMETAINFOREADER_H__ */
diff --git a/reader/src/formats/doc/DocPlugin.cpp b/reader/src/formats/doc/DocPlugin.cpp new file mode 100644 index 0000000..ef6f511 --- /dev/null +++ b/reader/src/formats/doc/DocPlugin.cpp @@ -0,0 +1,71 @@
/*
 * Copyright (C) 2004-2012 Geometer Plus <[email protected]>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA.
+ */ + +#include <ZLFile.h> +#include <ZLInputStream.h> +#include <ZLLogger.h> +#include <ZLImage.h> +#include <ZLEncodingConverter.h> + +#include "DocPlugin.h" +#include "DocMetaInfoReader.h" +#include "DocBookReader.h" +#include "DocStreams.h" +#include "../../bookmodel/BookModel.h" +#include "../../library/Book.h" + +DocPlugin::DocPlugin() { +} + +DocPlugin::~DocPlugin() { +} + +bool DocPlugin::providesMetaInfo() const { + return true; +} + +const std::string DocPlugin::supportedFileType() const { + return "doc"; +} + +bool DocPlugin::acceptsFile(const ZLFile &file) const { + return file.extension() == "doc"; +} + +bool DocPlugin::readMetaInfo(Book &book) const { + if (!DocMetaInfoReader(book).readMetaInfo()) { + return false; + } + + shared_ptr<ZLInputStream> stream = new DocAnsiStream(book.file(), 50000); + if (!detectEncodingAndLanguage(book, *stream)) { + stream = new DocUcs2Stream(book.file(), 50000); + detectLanguage(book, *stream, ZLEncodingConverter::UTF8, true); + } + + return true; +} + +bool DocPlugin::readLanguageAndEncoding(Book &/*book*/) const { + return true; +} + +bool DocPlugin::readModel(BookModel &model) const { + return DocBookReader(model, model.book()->encoding()).readBook(); +} diff --git a/reader/src/formats/doc/DocPlugin.h b/reader/src/formats/doc/DocPlugin.h new file mode 100644 index 0000000..93b1803 --- /dev/null +++ b/reader/src/formats/doc/DocPlugin.h @@ -0,0 +1,39 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#ifndef __DOCPLUGIN_H__ +#define __DOCPLUGIN_H__ + +#include "../FormatPlugin.h" + +class DocPlugin : public FormatPlugin { + +public: + DocPlugin(); + ~DocPlugin(); + bool providesMetaInfo() const; + + const std::string supportedFileType() const; + bool acceptsFile(const ZLFile &file) const; + bool readMetaInfo(Book &book) const; + bool readLanguageAndEncoding(Book &book) const; + bool readModel(BookModel &model) const; +}; + +#endif /* __DOCPLUGIN_H__ */ diff --git a/reader/src/formats/doc/DocStreams.cpp b/reader/src/formats/doc/DocStreams.cpp new file mode 100644 index 0000000..b21e15a --- /dev/null +++ b/reader/src/formats/doc/DocStreams.cpp @@ -0,0 +1,202 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ */ + +#include <cstring> +#include <cstdlib> +#include <string> + +#include "DocStreams.h" +#include "OleStreamReader.h" + +class DocReader : public OleStreamReader { + +public: + DocReader(char *buffer, std::size_t maxSize); + ~DocReader(); + std::size_t readSize() const; + +private: + bool readStream(OleMainStream &stream); + void ansiDataHandler(const char *buffer, std::size_t len); + void ucs2SymbolHandler(ZLUnicodeUtil::Ucs2Char symbol); + void footnotesStartHandler(); + +protected: + char *myBuffer; + const std::size_t myMaxSize; + std::size_t myActualSize; +}; + +class DocAnsiReader : public DocReader { + +public: + DocAnsiReader(char *buffer, std::size_t maxSize); + ~DocAnsiReader(); + +private: + void ansiDataHandler(const char *buffer, std::size_t len); +}; + +class DocUcs2Reader : public DocReader { + +public: + DocUcs2Reader(char *buffer, std::size_t maxSize); + ~DocUcs2Reader(); + +private: + void ucs2SymbolHandler(ZLUnicodeUtil::Ucs2Char symbol); +}; + +DocReader::DocReader(char *buffer, std::size_t maxSize) : myBuffer(buffer), myMaxSize(maxSize), myActualSize(0) { +} + +DocReader::~DocReader() { +} + +bool DocReader::readStream(OleMainStream &stream) { + // TODO make 2 optmizations: + // 1) If another piece is too big, reading of next piece can be stopped if some size parameter will be specified + // (it can be transfered as a parameter (with default 0 value, that means no need to use it) to readNextPiece method) + // 2) We can specify as a parameter for readNextPiece, what kind of piece should be read next (ANSI or not ANSI). + // As type of piece is known already, there's no necessary to read other pieces. 
+ while (myActualSize < myMaxSize) { + if (!readNextPiece(stream)) { + break; + } + } + return true; +} + +void DocReader::ansiDataHandler(const char*, std::size_t) { +} + +void DocReader::ucs2SymbolHandler(ZLUnicodeUtil::Ucs2Char) { +} + +void DocReader::footnotesStartHandler() { +} + +std::size_t DocReader::readSize() const { + return myActualSize; +} + +DocAnsiReader::DocAnsiReader(char *buffer, std::size_t maxSize) : DocReader(buffer, maxSize) { +} + +DocAnsiReader::~DocAnsiReader() { +} + +void DocAnsiReader::ansiDataHandler(const char *buffer, std::size_t dataLength) { + if (myActualSize < myMaxSize) { + const std::size_t len = std::min(dataLength, myMaxSize - myActualSize); + std::strncpy(myBuffer + myActualSize, buffer, len); + myActualSize += len; + } +} + +DocUcs2Reader::DocUcs2Reader(char *buffer, std::size_t maxSize) : DocReader(buffer, maxSize) { +} + +DocUcs2Reader::~DocUcs2Reader() { +} + +void DocUcs2Reader::ucs2SymbolHandler(ZLUnicodeUtil::Ucs2Char symbol) { + if (myActualSize < myMaxSize) { + char buffer[4]; + const std::size_t dataLength = ZLUnicodeUtil::ucs2ToUtf8(buffer, symbol); + const std::size_t len = std::min(dataLength, myMaxSize - myActualSize); + std::strncpy(myBuffer + myActualSize, buffer, len); + myActualSize += len; + } +} + +DocStream::DocStream(const ZLFile& file, std::size_t maxSize) : myFile(file), myBuffer(0), mySize(maxSize) { +} + +DocStream::~DocStream() { + close(); +} + +bool DocStream::open() { + if (mySize != 0) { + myBuffer = new char[mySize]; + } + shared_ptr<DocReader> reader = createReader(myBuffer, mySize); + shared_ptr<ZLInputStream> stream = myFile.inputStream(); + if (stream.isNull() || !stream->open()) { + return false; + } + if (!reader->readDocument(stream, false)) { + return false; + } + mySize = reader->readSize(); + myOffset = 0; + return true; +} + +std::size_t DocStream::read(char *buffer, std::size_t maxSize) { + maxSize = std::min(maxSize, mySize - myOffset); + if (buffer != 0 && myBuffer != 0) { + 
std::memcpy(buffer, myBuffer + myOffset, maxSize); + } + myOffset += maxSize; + return maxSize; +} + +void DocStream::close() { + if (myBuffer != 0) { + delete[] myBuffer; + myBuffer = 0; + } +} + +void DocStream::seek(int offset, bool absoluteOffset) { + if (!absoluteOffset) { + offset += myOffset; + } + myOffset = std::min(mySize, (std::size_t)std::max(0, offset)); +} + +std::size_t DocStream::offset() const { + return myOffset; +} + +std::size_t DocStream::sizeOfOpened() { + return mySize; +} + +DocAnsiStream::DocAnsiStream(const ZLFile& file, std::size_t maxSize) : DocStream(file, maxSize) { +} + +DocAnsiStream::~DocAnsiStream() { +} + +shared_ptr<DocReader> DocAnsiStream::createReader(char *buffer, std::size_t maxSize) { + return new DocAnsiReader(buffer, maxSize); +} + +DocUcs2Stream::DocUcs2Stream(const ZLFile& file, std::size_t maxSize) : DocStream(file, maxSize) { +} + +DocUcs2Stream::~DocUcs2Stream() { +} + +shared_ptr<DocReader> DocUcs2Stream::createReader(char *buffer, std::size_t maxSize) { + return new DocUcs2Reader(buffer, maxSize); +} diff --git a/reader/src/formats/doc/DocStreams.h b/reader/src/formats/doc/DocStreams.h new file mode 100644 index 0000000..4b1538a --- /dev/null +++ b/reader/src/formats/doc/DocStreams.h @@ -0,0 +1,73 @@ +/* + * Copyright (C) 2008-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA.
 */

#ifndef __DOCSTREAMS_H__
#define __DOCSTREAMS_H__

#include <ZLFile.h>
#include <ZLInputStream.h>

class DocReader;

// Input stream that exposes the text of a Word document. The text is produced by a
// DocReader created through createReader(); at most maxSize bytes are kept.
class DocStream : public ZLInputStream {

public:
	DocStream(const ZLFile& file, std::size_t maxSize);
	~DocStream();

private:
	bool open();
	std::size_t read(char *buffer, std::size_t maxSize);
	void close();

	void seek(int offset, bool absoluteOffset);
	std::size_t offset() const;
	std::size_t sizeOfOpened();

protected:
	// Creates the reader that fills `buffer` (capacity maxSize) with document text.
	virtual shared_ptr<DocReader> createReader(char *buffer, std::size_t maxSize) = 0;

private:
	const ZLFile myFile;
	char *myBuffer;       // text buffer owned by the stream
	std::size_t mySize;   // requested maximum size; set to the number of bytes read by open()
	std::size_t myOffset; // current read position inside myBuffer
};

// DocStream whose reader is a DocAnsiReader.
class DocAnsiStream : public DocStream {

public:
	DocAnsiStream(const ZLFile& file, std::size_t maxSize);
	~DocAnsiStream();

private:
	shared_ptr<DocReader> createReader(char *buffer, std::size_t maxSize);
};

// DocStream whose reader is a DocUcs2Reader.
class DocUcs2Stream : public DocStream {

public:
	DocUcs2Stream(const ZLFile& file, std::size_t maxSize);
	~DocUcs2Stream();

private:
	shared_ptr<DocReader> createReader(char *buffer, std::size_t maxSize);
};

#endif /* __DOCSTREAMS_H__ */
diff --git a/reader/src/formats/doc/OleMainStream.cpp b/reader/src/formats/doc/OleMainStream.cpp new file mode 100644 index 0000000..fe829e6 --- /dev/null +++ b/reader/src/formats/doc/OleMainStream.cpp @@ -0,0 +1,1085 @@
/*
 * Copyright (C) 2004-2012 Geometer Plus <[email protected]>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+
+#include <sstream>
+#include <string>
+
+#include <ZLLogger.h>
+#include <ZLUnicodeUtil.h>
+
+#include "OleUtil.h"
+#include "OleStorage.h"
+
+#include "DocInlineImageReader.h"
+
+#include "OleMainStream.h"
+
+// Default paragraph style: invalid style ids, no page break, zero indents, default alignment.
+OleMainStream::Style::Style() :
+	StyleIdCurrent(STYLE_INVALID),
+	StyleIdNext(STYLE_INVALID),
+	HasPageBreakBefore(false),
+	BeforeParagraphIndent(0),
+	AfterParagraphIndent(0),
+	LeftIndent(0),
+	FirstLineIndent(0),
+	RightIndent(0),
+	Alignment(ALIGNMENT_DEFAULT) {
+}
+
+// Default character formatting: regular font, FontSize 20.
+OleMainStream::CharInfo::CharInfo() : FontStyle(FONT_REGULAR), FontSize(20) {
+}
+
+OleMainStream::SectionInfo::SectionInfo() : CharPosition(0), IsNewPage(true) {
+}
+
+OleMainStream::InlineImageInfo::InlineImageInfo() : DataPosition(0) {
+}
+
+OleMainStream::FloatImageInfo::FloatImageInfo() : ShapeId(0) {
+}
+
+OleMainStream::OleMainStream(shared_ptr<OleStorage> storage, OleEntry oleEntry, shared_ptr<ZLInputStream> stream) : OleStream(storage, oleEntry, stream) {
+}
+
+// Opens the main stream and parses the structures that describe its text.
+//
+// Reads the 768-byte header, validates it with readFIB(), then loads the
+// piece table from the "0Table"/"1Table" stream. If no table stream exists,
+// a single ANSI piece covering [myStartOfText, myEndOfText) is synthesized.
+// When doReadFormattingData is true, bookmarks, stylesheet, paragraph styles,
+// character info and floating images are read as well; failures there are
+// deliberately ignored.
+//
+// Returns false if the stream cannot be opened, the header is short, the
+// FIB is rejected, or the piece table cannot be read.
+bool OleMainStream::open(bool doReadFormattingData) {
+	if (OleStream::open() == false) {
+		return false;
+	}
+
+	static const std::size_t HEADER_SIZE = 768; //size of data in header of main stream
+	char headerBuffer[HEADER_SIZE];
+	seek(0, true);
+
+	if (read(headerBuffer, HEADER_SIZE) != HEADER_SIZE) {
+		return false;
+	}
+
+	bool result = readFIB(headerBuffer);
+	if (!result) {
+		return false;
+	}
+
+	// determining table stream number
+	unsigned int tableNumber = (OleUtil::getU2Bytes(headerBuffer, 0xA) & 0x0200) ? 1 : 0;
+	std::string tableName = tableNumber == 0 ? "0" : "1";
+	tableName += "Table";
+	OleEntry tableEntry;
+	result = myStorage->getEntryByName(tableName, tableEntry);
+
+	if (!result) {
+		// can't find the table stream (that can happen only if the file format is below Word 7/8), so build a simple piece table
+		// TODO: CHECK may be not all old documents have ANSI
+		ZLLogger::Instance().println("DocPlugin", "cant't find table stream, building own simple piece table, that includes all charachters");
+		Piece piece = {myStartOfText, myEndOfText - myStartOfText, true, Piece::PIECE_TEXT, 0};
+		myPieces.push_back(piece);
+		return true;
+	}
+
+	result = readPieceTable(headerBuffer, tableEntry);
+
+	if (!result) {
+		ZLLogger::Instance().println("DocPlugin", "error during reading piece table");
+		return false;
+	}
+
+	if (!doReadFormattingData) {
+		return true;
+	}
+
+	OleEntry dataEntry;
+	if (myStorage->getEntryByName("Data", dataEntry)) {
+		myDataStream = new OleStream(myStorage, dataEntry, myBaseStream);
+	}
+
+	//results of reading the following structures are not checked, because all these
+	//problems can be ignored, and the document can be shown anyway, maybe with wrong formatting
+	readBookmarks(headerBuffer, tableEntry);
+	readStylesheet(headerBuffer, tableEntry);
+	//readSectionsInfoTable(headerBuffer, tableEntry); //it isn't used now
+	readParagraphStyleTable(headerBuffer, tableEntry);
+	readCharInfoTable(headerBuffer, tableEntry);
+	readFloatingImages(headerBuffer, tableEntry);
+	return true;
+}
+
+const OleMainStream::Pieces &OleMainStream::getPieces() const {
+	return myPieces;
+}
+
+const OleMainStream::CharInfoList &OleMainStream::getCharInfoList() const {
+	return myCharInfoList;
+}
+
+const OleMainStream::StyleInfoList &OleMainStream::getStyleInfoList() const {
+	return myStyleInfoList;
+}
+
+const OleMainStream::BookmarksList &OleMainStream::getBookmarks() const {
+	return myBookmarks;
+}
+
+const OleMainStream::InlineImageInfoList &OleMainStream::getInlineImageInfoList() const {
+	return myInlineImageInfoList;
+}
+
+const OleMainStream::FloatImageInfoList &OleMainStream::getFloatImageInfoList() const {
+	return myFloatImageInfoList;
+}
+
+// Returns the image blocks for the given shape id, or an empty Blocks object
+// when no float image reader was created by readFloatingImages().
+ZLFileImage::Blocks OleMainStream::getFloatImage(unsigned int shapeId) const {
+	if (myFLoatImageReader.isNull()) {
+		return ZLFileImage::Blocks();
+	}
+	return myFLoatImageReader->getBlocksForShapeId(shapeId);
+}
+
+// Returns the image blocks found at dataPosition in the "Data" stream, or an
+// empty Blocks object when the document has no "Data" stream.
+ZLFileImage::Blocks OleMainStream::getInlineImage(unsigned int dataPosition) const {
+	if (myDataStream.isNull()) {
+		return ZLFileImage::Blocks();
+	}
+	DocInlineImageReader imageReader(myDataStream);
+	return imageReader.getImagePieceInfo(dataPosition);
+}
+
+// Inspects the FIB flags in the header, logs what it finds and stores the
+// text boundaries in myStartOfText / myEndOfText.
+// Returns false for encrypted files (flag 0x100), true otherwise.
+bool OleMainStream::readFIB(const char *headerBuffer) {
+	int flags = OleUtil::getU2Bytes(headerBuffer, 0xA); //offset for flags
+
+	if (flags & 0x0004) { //flag for complex format
+		ZLLogger::Instance().println("DocPlugin", "This was fast-saved. Some information is lost");
+		//lostInfo = (flags & 0xF0) >> 4);
+	}
+
+	if (flags & 0x1000) { //flag for using extending charset
+		ZLLogger::Instance().println("DocPlugin", "File uses extended character set (get_word8_char)");
+	} else {
+		ZLLogger::Instance().println("DocPlugin", "File uses get_8bit_char character set");
+	}
+
+	if (flags & 0x100) { //flag for encrypted files
+		ZLLogger::Instance().println("DocPlugin", "File is encrypted");
+		// Encryption key = %08lx ; NumUtil::get4Bytes(header, 14)
+		return false;
+	}
+
+	unsigned int charset = OleUtil::getU2Bytes(headerBuffer, 0x14); //offset for charset number
+	if (charset && charset != 0x100) { //0x100 = default charset
+		// println() is given a finished message, so the number has to be
+		// formatted here; a "%d" placeholder would be logged literally.
+		std::ostringstream message;
+		message << "Using not default character set " << charset;
+		ZLLogger::Instance().println("DocPlugin", message.str());
+	} else {
+		ZLLogger::Instance().println("DocPlugin", "Using default character set");
+	}
+
+	myStartOfText = OleUtil::get4Bytes(headerBuffer, 0x18); //offset for start of text value
+	myEndOfText = OleUtil::get4Bytes(headerBuffer, 0x1c); //offset for end of text value
+	return true;
+}
+
+// Splits the piece list `s` at character position `boundary`.
+// Pieces (or the part of a piece) before the boundary go to dest1 with type1,
+// the rest goes to dest2 with type2.
+void OleMainStream::splitPieces(const Pieces &s, Pieces &dest1, Pieces &dest2, Piece::PieceType type1, Piece::PieceType type2, int boundary) {
+	// Work on a copy: readPieceTable() passes the same vector as `s` and `dest1`.
+	Pieces source = s;
+	dest1.clear();
+	dest2.clear();
+
+	int sumLength = 0;
+	std::size_t i = 0;
+	for (i = 0; i < source.size(); ++i) {
+		Piece piece = source.at(i);
+		if (piece.Length + sumLength >= boundary) {
+			Piece piece2 = piece;
+
+			piece.Length = boundary - sumLength;
+			piece.Type = type1;
+
+			piece2.Type = type2;
+			piece2.Offset += piece.Length * 2;
+			piece2.Length -= piece.Length;
+
+			if (piece.Length > 0) {
+				dest1.push_back(piece);
+			}
+			if (piece2.Length > 0) {
+				dest2.push_back(piece2);
+			}
+			++i;
+			break;
+		}
+		sumLength += piece.Length;
+		piece.Type = type1;
+		dest1.push_back(piece);
+	}
+	for (; i < source.size(); ++i) {
+		Piece piece = source.at(i);
+		piece.Type = type2;
+		dest2.push_back(piece);
+	}
+}
+
+// Loads the CLX structure from the table stream and returns the piece table
+// stored inside it. Returns an empty string if the CLX cannot be read or is
+// malformed.
+std::string OleMainStream::getPiecesTableBuffer(const char *headerBuffer, OleStream &tableStream) {
+	unsigned int clxOffset = OleUtil::getU4Bytes(headerBuffer, 0x01A2); //offset for CLX structure
+	unsigned int clxLength = OleUtil::getU4Bytes(headerBuffer, 0x01A6); //offset for value of CLX structure length
+
+	//1 step : loading CLX table from table stream
+	if (!tableStream.seek(clxOffset, true)) {
+		ZLLogger::Instance().println("DocPlugin", "getPiecesTableBuffer -- error for seeking to CLX structure");
+		return std::string();
+	}
+	// std::vector releases the memory on every return path
+	std::vector<char> clxBuffer(clxLength);
+	if (clxLength != 0 && tableStream.read(&clxBuffer[0], clxLength) != clxLength) {
+		ZLLogger::Instance().println("DocPlugin", "getPiecesTableBuffer -- CLX structure length is invalid");
+		return std::string();
+	}
+	std::string clx(clxBuffer.begin(), clxBuffer.end());
+
+	//2 step: searching for pieces table buffer at CLX
+	//(determines it by 0x02 as start symbol)
+	std::size_t from = 0;
+	std::size_t i;
+	std::string pieceTableBuffer;
+	while ((i = clx.find_first_of(0x02, from)) != std::string::npos) {
+		if (clx.size() < i + 1 + 4) {
+			ZLLogger::Instance().println("DocPlugin", "getPiecesTableBuffer -- CLX structure has invalid format");
+			return std::string();
+		}
+		unsigned int pieceTableLength = OleUtil::getU4Bytes(clx.c_str(), i + 1);
+		pieceTableBuffer = std::string(clx, i + 1 + 4);
+		if (pieceTableBuffer.length() != pieceTableLength) {
+			// the 0x02 byte was not the real start marker, keep searching
+			from = i + 1;
+			continue;
+		}
+		break;
+	}
+	return pieceTableBuffer;
+}
+
+
+// Reads the piece table and fills myPieces: every piece gets its stream
+// offset, its length in bytes, its ANSI flag, its type (text / footnote /
+// other) and its starting character position.
+// Returns false if the table is missing or contains no usable piece.
+bool OleMainStream::readPieceTable(const char *headerBuffer, const OleEntry &tableEntry) {
+	OleStream tableStream(myStorage, tableEntry, myBaseStream);
+	std::string piecesTableBuffer = getPiecesTableBuffer(headerBuffer, tableStream);
+
+	if (piecesTableBuffer.empty()) {
+		return false;
+	}
+
+	//getting count of Character Positions for different types of subdocuments in Main Stream
+	int ccpText = OleUtil::get4Bytes(headerBuffer, 0x004C); //text
+	int ccpFtn = OleUtil::get4Bytes(headerBuffer, 0x0050); //footnote subdocument
+	int ccpHdd = OleUtil::get4Bytes(headerBuffer, 0x0054); //header subdocument
+	int ccpMcr = OleUtil::get4Bytes(headerBuffer, 0x0058); //macro subdocument
+	int ccpAtn = OleUtil::get4Bytes(headerBuffer, 0x005C); //comment subdocument
+	int ccpEdn = OleUtil::get4Bytes(headerBuffer, 0x0060); //endnote subdocument
+	int ccpTxbx = OleUtil::get4Bytes(headerBuffer, 0x0064); //textbox subdocument
+	int ccpHdrTxbx = OleUtil::get4Bytes(headerBuffer, 0x0068); //textbox subdocument of the header
+	int lastCP = ccpFtn + ccpHdd + ccpMcr + ccpAtn + ccpEdn + ccpTxbx + ccpHdrTxbx;
+	if (lastCP != 0) {
+		++lastCP;
+	}
+	lastCP += ccpText;
+
+	//getting the CP (character positions) and CP descriptors
+	std::vector<int> cp; //array of character positions for pieces
+	unsigned int j = 0;
+	for (j = 0; ; j += 4) {
+		if (piecesTableBuffer.size() < j + 4) {
+			ZLLogger::Instance().println("DocPlugin", "invalid piece table, cp ends not with a lastcp");
+			break;
+		}
+		int curCP = OleUtil::get4Bytes(piecesTableBuffer.c_str(), j);
+		cp.push_back(curCP);
+		if (curCP == lastCP) {
+			break;
+		}
+	}
+
+	if (cp.size() < 2) {
+		ZLLogger::Instance().println("DocPlugin", "invalid piece table, < 2 pieces");
+		return false;
+	}
+
+	std::vector<std::string> descriptors;
+	for (std::size_t k = 0; k < cp.size() - 1; ++k) {
+		//j + 4, because it should be taken after CP in PiecesTable Buffer
+		//k * 8, because it should be taken 8 byte for each descriptor
+		std::size_t substrFrom = j + 4 + k * 8;
+		if (piecesTableBuffer.size() < substrFrom + 8) {
+			ZLLogger::Instance().println("DocPlugin", "invalid piece table, problems with descriptors reading");
+			break;
+		}
+		descriptors.push_back(piecesTableBuffer.substr(substrFrom, 8));
+	}
+
+	//filling the Pieces vector
+	std::size_t minValidSize = std::min(cp.size() - 1, descriptors.size());
+	if (minValidSize == 0) {
+		ZLLogger::Instance().println("DocPlugin", "invalid piece table, there are no pieces");
+		return false;
+	}
+
+	for (std::size_t i = 0; i < minValidSize; ++i) {
+		//4byte integer with offset and ANSI flag
+		int fcValue = OleUtil::get4Bytes(descriptors.at(i).c_str(), 0x2); //offset for piece structure
+		Piece piece;
+		piece.IsANSI = (fcValue & 0x40000000) == 0x40000000; //ansi flag
+		piece.Offset = fcValue & 0x3FFFFFFF; //getting offset for current piece
+		piece.Length = cp.at(i + 1) - cp.at(i);
+		// Type and startCP get their real values below; initialize them so
+		// that no indeterminate value is copied into the vector.
+		piece.Type = Piece::PIECE_TEXT;
+		piece.startCP = 0;
+		myPieces.push_back(piece);
+	}
+
+	//split pieces into different types
+	Pieces piecesText, piecesFootnote, piecesOther;
+	splitPieces(myPieces, piecesText, piecesFootnote, Piece::PIECE_TEXT, Piece::PIECE_FOOTNOTE, ccpText);
+	splitPieces(piecesFootnote, piecesFootnote, piecesOther, Piece::PIECE_FOOTNOTE, Piece::PIECE_OTHER, ccpFtn);
+
+	myPieces.clear();
+	for (std::size_t i = 0; i < piecesText.size(); ++i) {
+		myPieces.push_back(piecesText.at(i));
+	}
+	for (std::size_t i = 0; i < piecesFootnote.size(); ++i) {
+		myPieces.push_back(piecesFootnote.at(i));
+	}
+	for (std::size_t i = 0; i < piecesOther.size(); ++i) {
+		myPieces.push_back(piecesOther.at(i));
+	}
+
+	//converting length and offset depending on isANSI
+	for (std::size_t i = 0; i < myPieces.size(); ++i) {
+		Piece &piece = myPieces.at(i);
+		if (!piece.IsANSI) {
+			piece.Length *= 2;
+		} else {
+			piece.Offset /= 2;
+		}
+	}
+
+	//filling startCP field
+	unsigned int curStartCP = 0;
+	for (std::size_t i = 0; i < myPieces.size(); ++i) {
+		Piece &piece = myPieces.at(i);
+		piece.startCP = curStartCP;
+		if (piece.IsANSI) {
+			curStartCP += piece.Length;
+		} else {
+			curStartCP += piece.Length / 2;
+		}
+	}
+	return true;
+}
+
+// Reads bookmark names (SttbfBkmk) and their start positions (plcfBkmkf)
+// and appends one Bookmark per name/position pair to myBookmarks.
+// Returns true when there are no bookmarks or reading succeeded, false when
+// one of the two tables cannot be loaded.
+bool OleMainStream::readBookmarks(const char *headerBuffer, const OleEntry &tableEntry) {
+	//SttbfBkmk structure is a table of bookmark name strings
+	unsigned int beginNamesInfo = OleUtil::getU4Bytes(headerBuffer, 0x142); // address of SttbfBkmk structure
+	std::size_t namesInfoLength = (std::size_t)OleUtil::getU4Bytes(headerBuffer, 0x146); // length of SttbfBkmk structure
+
+	if (namesInfoLength == 0) {
+		return true; //there's no bookmarks
+	}
+
+	OleStream tableStream(myStorage, tableEntry, myBaseStream);
+	std::string buffer;
+	if (!readToBuffer(buffer, beginNamesInfo, namesInfoLength, tableStream)) {
+		return false;
+	}
+
+	// the record count occupies bytes 2..3; a shorter table holds no names
+	const unsigned int recordsNumber = buffer.size() >= 4 ? OleUtil::getU2Bytes(buffer.c_str(), 0x2) : 0; //count of records
+
+	std::vector<std::string> names;
+	unsigned int offset = 0x6; //initial offset
+	for (unsigned int i = 0; i < recordsNumber; ++i) {
+		if (buffer.size() < offset + 2) {
+			ZLLogger::Instance().println("DocPlugin", "problmes with reading bookmarks names");
+			break;
+		}
+		unsigned int length = OleUtil::getU2Bytes(buffer.c_str(), offset) * 2; //length of string in bytes
+		if (buffer.size() < offset + 2 + length) {
+			// truncated name: stop here and keep the names read so far
+			ZLLogger::Instance().println("DocPlugin", "problmes with reading bookmarks names");
+			break;
+		}
+		ZLUnicodeUtil::Ucs2String name;
+		for (unsigned int j = 0; j < length; j += 2) {
+			// go through unsigned char: a plain (signed) char >= 0x80 would be
+			// sign-extended and overwrite the high byte of the character
+			const unsigned char lowByte = (unsigned char)buffer[offset + 2 + j];
+			const unsigned char highByte = (unsigned char)buffer[offset + 2 + j + 1];
+			ZLUnicodeUtil::Ucs2Char ucs2Char = (unsigned int)lowByte | ((unsigned int)highByte << 8);
+			name.push_back(ucs2Char);
+		}
+		std::string utf8Name;
+		ZLUnicodeUtil::ucs2ToUtf8(utf8Name, name);
+		names.push_back(utf8Name);
+		offset += length + 2;
+	}
+
+	//plcfBkmkf structure is table recording beginning CPs of bookmarks
+	unsigned int beginCharPosInfo = OleUtil::getU4Bytes(headerBuffer, 0x14A); // address of plcfBkmkf structure
+	std::size_t charPosInfoLen = (std::size_t)OleUtil::getU4Bytes(headerBuffer, 0x14E); // length of plcfBkmkf structure
+
+	if (charPosInfoLen == 0) {
+		return true; //there's no bookmarks
+	}
+
+	if (!readToBuffer(buffer, beginCharPosInfo, charPosInfoLen, tableStream)) {
+		return false;
+	}
+
+	static const unsigned int BKF_SIZE = 4;
+	std::size_t size = calcCountOfPLC(charPosInfoLen, BKF_SIZE);
+	std::vector<unsigned int> charPage;
+	for (std::size_t index = 0, offset = 0; index < size; ++index, offset += 4) {
+		charPage.push_back(OleUtil::getU4Bytes(buffer.c_str(), offset));
+	}
+
+	for (std::size_t i = 0; i < names.size(); ++i) {
+		if (i >= charPage.size()) {
+			break; //in case something in these structures went wrong, do not lose all bookmarks
+		}
+		Bookmark bookmark;
+		bookmark.CharPosition = charPage.at(i);
+		bookmark.Name = names.at(i);
+		myBookmarks.push_back(bookmark);
+	}
+
+	return true;
+}
+
+// Reads the STSH (stylesheet) structure into myStyleSheet.
+// Styles may be based on styles stored later in the table, so the table is
+// walked repeatedly until a pass fills no new entry.
+// Returns false if the structure cannot be read or is shorter than its
+// 6-byte header.
+bool OleMainStream::readStylesheet(const char *headerBuffer, const OleEntry &tableEntry) {
+	//STSH structure is a stylesheet
+	unsigned int beginStshInfo = OleUtil::getU4Bytes(headerBuffer, 0xa2); // address of STSH structure
+	std::size_t stshInfoLength = (std::size_t)OleUtil::getU4Bytes(headerBuffer, 0xa6); // length of STSH structure
+
+	OleStream tableStream(myStorage, tableEntry, myBaseStream);
+	if (!tableStream.seek(beginStshInfo, true)) {
+		ZLLogger::Instance().println("DocPlugin", "problems with reading STSH structure");
+		return false;
+	}
+	// the fields at offsets 0, 2 and 4 are read unconditionally below
+	if (stshInfoLength < 6) {
+		ZLLogger::Instance().println("DocPlugin", "problems with reading STSH structure, invalid length");
+		return false;
+	}
+	// std::vector releases the memory on every return path
+	std::vector<char> stshBuffer(stshInfoLength);
+	char *buffer = &stshBuffer[0];
+	if (tableStream.read(buffer, stshInfoLength) != stshInfoLength) {
+		ZLLogger::Instance().println("DocPlugin", "problems with reading STSH structure, invalid length");
+		return false;
+	}
+
+	std::size_t stdCount = (std::size_t)OleUtil::getU2Bytes(buffer, 2);
+	std::size_t stdBaseInFile = (std::size_t)OleUtil::getU2Bytes(buffer, 4);
+	myStyleSheet.resize(stdCount);
+
+	std::vector<bool> isFilled;
+	isFilled.resize(stdCount, false);
+
+	std::size_t stdLen = 0;
+	bool styleSheetWasChanged = false;
+	do { //done in a loop, because some base styles can be stored after their successors
+		styleSheetWasChanged = false;
+		for (std::size_t index = 0, offset = 2 + (std::size_t)OleUtil::getU2Bytes(buffer, 0); index < stdCount; index++, offset += 2 + stdLen) {
+			if (offset + 2 > stshInfoLength) {
+				//the table claims more records than the buffer holds
+				break;
+			}
+			stdLen = (std::size_t)OleUtil::getU2Bytes(buffer, offset);
+			if (offset + 2 + stdLen > stshInfoLength) {
+				//record runs past the end of the buffer
+				break;
+			}
+			if (isFilled.at(index)) {
+				continue;
+			}
+
+			if (stdLen == 0) {
+				//if record is empty, leave it default
+				isFilled[index] = true;
+				continue;
+			}
+
+			Style styleInfo = myStyleSheet.at(index);
+
+			const unsigned int styleAndBaseType = OleUtil::getU2Bytes(buffer, offset + 4);
+			const unsigned int styleType = styleAndBaseType % 16;
+			const unsigned int baseStyleId = styleAndBaseType / 16;
+			if (baseStyleId == Style::STYLE_NIL || baseStyleId == Style::STYLE_USER) {
+				//if based on nil or user style, leave default
+			} else {
+				int baseStyleIndex = getStyleIndex(baseStyleId, isFilled, myStyleSheet);
+				if (baseStyleIndex < 0) {
+					//this base style is not filled yet, so come back to it in a later pass
+					continue;
+				}
+				styleInfo = myStyleSheet.at(baseStyleIndex);
+				styleInfo.StyleIdCurrent = Style::STYLE_INVALID;
+			}
+
+			// parse STD structure
+			unsigned int tmp = OleUtil::getU2Bytes(buffer, offset + 6);
+			unsigned int upxCount = tmp % 16;
+			styleInfo.StyleIdNext = tmp / 16;
+
+			//adding current style
+			myStyleSheet[index] = styleInfo;
+			isFilled[index] = true;
+			styleSheetWasChanged = true;
+
+			std::size_t pos = 2 + stdBaseInFile;
+			std::size_t nameLen = (std::size_t)OleUtil::getU2Bytes(buffer, offset + pos);
+			nameLen = nameLen * 2 + 2; //from Unicode characters to bytes + Unicode null character length
+			pos += 2 + nameLen;
+			if (pos % 2 != 0) {
+				++pos;
+			}
+			if (pos >= stdLen) {
+				continue;
+			}
+			std::size_t upxLen = (std::size_t)OleUtil::getU2Bytes(buffer, offset + pos);
+			if (pos + upxLen > stdLen) {
+				//UPX length too large
+				continue;
+			}
+			//for style info styleType must be equal 1
+			if (styleType == 1 && upxCount >= 1) {
+				if (upxLen >= 2) {
+					styleInfo.StyleIdCurrent = OleUtil::getU2Bytes(buffer, offset + pos + 2);
+					getStyleInfo(0, buffer + offset + pos + 4, upxLen - 2, styleInfo);
+					myStyleSheet[index] = styleInfo;
+				}
+				pos += 2 + upxLen;
+				if (pos % 2 != 0) {
+					++pos;
+				}
+				upxLen = (std::size_t)OleUtil::getU2Bytes(buffer, offset + pos);
+			}
+			if (upxLen == 0 || pos + upxLen > stdLen) {
+				//too small/too large
+				continue;
+			}
+			//for char info styleType can be equal 1 or 2
+			if ((styleType == 1 && upxCount >= 2) || (styleType == 2 && upxCount >= 1)) {
+				CharInfo charInfo;
+				getCharInfo(0, Style::STYLE_INVALID, buffer + offset + pos + 2, upxLen, charInfo);
+				styleInfo.CurrentCharInfo = charInfo;
+				myStyleSheet[index] = styleInfo;
+			}
+		}
+	} while (styleSheetWasChanged);
+	return true;
+}
+
+// Reads the PlcfbteChpx table and, for every run of text in every formatting
+// page it references, appends the run's CharInfo to myCharInfoList and, if
+// the run carries one, its inline image to myInlineImageInfoList.
+// Returns false if the table or one of the formatting pages cannot be read.
+bool OleMainStream::readCharInfoTable(const char *headerBuffer, const OleEntry &tableEntry) {
+	//PlcfbteChpx structure is table with formatting for particular run of text
+	unsigned int beginCharInfo = OleUtil::getU4Bytes(headerBuffer, 0xfa); // address of PlcfbteChpx structure
+	std::size_t charInfoLength = (std::size_t)OleUtil::getU4Bytes(headerBuffer, 0xfe); // length of PlcfbteChpx structure
+	if (charInfoLength < 4) {
+		return false;
+	}
+
+	OleStream tableStream(myStorage, tableEntry, myBaseStream);
+	std::string buffer;
+	if (!readToBuffer(buffer, beginCharInfo, charInfoLength, tableStream)) {
+		return false;
+	}
+
+	static const unsigned int CHPX_SIZE = 4;
+	std::size_t size = calcCountOfPLC(charInfoLength, CHPX_SIZE);
+	std::vector<unsigned int> charBlocks;
+	for (std::size_t index = 0, offset = (size + 1) * 4; index < size; ++index, offset += CHPX_SIZE) {
+		charBlocks.push_back(OleUtil::getU4Bytes(buffer.c_str(), offset));
+	}
+
+	const std::size_t blockSize = OleStorage::BBD_BLOCK_SIZE;
+	// std::vector releases the page buffer on every return path
+	std::vector<char> formatPage(blockSize);
+	char *formatPageBuffer = &formatPage[0];
+	for (std::size_t index = 0; index < charBlocks.size(); ++index) {
+		seek(charBlocks.at(index) * blockSize, true);
+		if (read(formatPageBuffer, blockSize) != blockSize) {
+			return false;
+		}
+		unsigned int crun = OleUtil::getU1Byte(formatPageBuffer, 0x1ff); //offset with crun (count of 'run of text')
+		for (unsigned int index2 = 0; index2 < crun; ++index2) {
+			unsigned int offset = OleUtil::getU4Bytes(formatPageBuffer, index2 * 4);
+			unsigned int chpxOffset = 2 * OleUtil::getU1Byte(formatPageBuffer, (crun + 1) * 4 + index2);
+			unsigned int len = OleUtil::getU1Byte(formatPageBuffer, chpxOffset);
+			unsigned int charPos = 0;
+			if (!offsetToCharPos(offset, charPos, myPieces)) {
+				continue;
+			}
+			unsigned int styleId = getStyleIdByCharPos(charPos, myStyleInfoList);
+
+			// a zero length must not reach the parsers: `len - 1` would wrap
+			// around to the largest unsigned value
+			const bool hasChpx = chpxOffset != 0 && len != 0;
+
+			CharInfo charInfo = getStyleFromStylesheet(styleId, myStyleSheet).CurrentCharInfo;
+			if (hasChpx) {
+				getCharInfo(chpxOffset, styleId, formatPageBuffer + 1, len - 1, charInfo);
+			}
+			myCharInfoList.push_back(CharPosToCharInfo(charPos, charInfo));
+
+			if (hasChpx) {
+				InlineImageInfo pictureInfo;
+				if (getInlineImageInfo(chpxOffset, formatPageBuffer + 1, len - 1, pictureInfo)) {
+					myInlineImageInfoList.push_back(CharPosToInlineImageInfo(charPos, pictureInfo));
+				}
+			}
+		}
+	}
+	return true;
+}
+
+// Reads the Plcspa table (character position and shape id of every floating
+// image) into myFloatImageInfoList and, if the document has a DggInfo
+// structure, creates myFLoatImageReader and lets it read the art data.
+// Returns false only if a table is present but too short or unreadable.
+bool OleMainStream::readFloatingImages(const char *headerBuffer, const OleEntry &tableEntry) {
+	//Plcspa structure is a table with information for FSPA (File Shape Address)
+	unsigned int beginPicturesInfo = OleUtil::getU4Bytes(headerBuffer, 0x01DA); // address of Plcspa structure
+	if (beginPicturesInfo == 0) {
+		return true; //there's no office art objects
+	}
+	unsigned int picturesInfoLength = OleUtil::getU4Bytes(headerBuffer, 0x01DE); // length of Plcspa structure
+	if (picturesInfoLength < 4) {
+		return false;
+	}
+
+	OleStream tableStream(myStorage, tableEntry, myBaseStream);
+	std::string buffer;
+	if (!readToBuffer(buffer, beginPicturesInfo, picturesInfoLength, tableStream)) {
+		return false;
+	}
+
+	static const unsigned int SPA_SIZE = 26;
+	std::size_t size = calcCountOfPLC(picturesInfoLength, SPA_SIZE);
+
+	std::vector<unsigned int> picturesBlocks;
+	for (std::size_t index = 0, tOffset = 0; index < size; ++index, tOffset += 4) {
+		picturesBlocks.push_back(OleUtil::getU4Bytes(buffer.c_str(), tOffset));
+	}
+
+	for (std::size_t index = 0, tOffset = (size + 1) * 4; index < size; ++index, tOffset += SPA_SIZE) {
+		unsigned int spid = OleUtil::getU4Bytes(buffer.c_str(), tOffset);
+		FloatImageInfo info;
+		unsigned int charPos = picturesBlocks.at(index);
+		info.ShapeId = spid;
+		myFloatImageInfoList.push_back(CharPosToFloatImageInfo(charPos, info));
+	}
+
+	//DggInfo structure is office art object table data
+	unsigned int beginOfficeArtContent = OleUtil::getU4Bytes(headerBuffer, 0x22A); // address of DggInfo structure
+	if (beginOfficeArtContent == 0) {
+		return true; //there's no office art objects
+	}
+	unsigned int officeArtContentLength = OleUtil::getU4Bytes(headerBuffer, 0x022E); // length of DggInfo structure
+	if (officeArtContentLength < 4) {
+		return false;
+	}
+
+	shared_ptr<OleStream> newTableStream = new OleStream(myStorage, tableEntry, myBaseStream);
+	shared_ptr<OleStream> newMainStream = new OleStream(myStorage, myOleEntry, myBaseStream);
+	if (newTableStream->open() && newMainStream->open()) {
+		myFLoatImageReader = new DocFloatImageReader(beginOfficeArtContent, officeArtContentLength, newTableStream, newMainStream);
+		myFLoatImageReader->readAll();
+	}
+	return true;
+}
+
+// Reads the PlcBtePapx table and, for every paragraph in every formatting
+// page it references, appends the paragraph's Style to myStyleInfoList.
+// Returns false if the table or one of the formatting pages cannot be read.
+bool OleMainStream::readParagraphStyleTable(const char *headerBuffer, const OleEntry &tableEntry) {
+	//PlcBtePapx structure is table with formatting for all paragraphs
+	unsigned int beginParagraphInfo = OleUtil::getU4Bytes(headerBuffer, 0x102); // address of PlcBtePapx structure
+	std::size_t paragraphInfoLength = (std::size_t)OleUtil::getU4Bytes(headerBuffer, 0x106); // length of PlcBtePapx structure
+	if (paragraphInfoLength < 4) {
+		return false;
+	}
+
+	OleStream tableStream(myStorage, tableEntry, myBaseStream);
+	std::string buffer;
+	if (!readToBuffer(buffer, beginParagraphInfo, paragraphInfoLength, tableStream)) {
+		return false;
+	}
+
+	static const unsigned int PAPX_SIZE = 4;
+	std::size_t size = calcCountOfPLC(paragraphInfoLength, PAPX_SIZE);
+
+	std::vector<unsigned int> paragraphBlocks;
+	for (std::size_t index = 0, tOffset = (size + 1) * 4; index < size; ++index, tOffset += PAPX_SIZE) {
+		paragraphBlocks.push_back(OleUtil::getU4Bytes(buffer.c_str(), tOffset));
+	}
+
+	const std::size_t blockSize = OleStorage::BBD_BLOCK_SIZE;
+	// std::vector releases the page buffer on every return path
+	std::vector<char> formatPage(blockSize);
+	char *formatPageBuffer = &formatPage[0];
+	for (std::size_t index = 0; index < paragraphBlocks.size(); ++index) {
+		seek(paragraphBlocks.at(index) * blockSize, true);
+		if (read(formatPageBuffer, blockSize) != blockSize) {
+			return false;
+		}
+		const unsigned int paragraphsCount = OleUtil::getU1Byte(formatPageBuffer, 0x1ff); //offset with 'cpara' value (count of paragraphs)
+		for (unsigned int index2 = 0; index2 < paragraphsCount; ++index2) {
+			const unsigned int offset = OleUtil::getU4Bytes(formatPageBuffer, index2 * 4);
+			unsigned int papxOffset = OleUtil::getU1Byte(formatPageBuffer, (paragraphsCount + 1) * 4 + index2 * 13) * 2;
+			if (papxOffset <= 0) {
+				continue;
+			}
+			unsigned int len = OleUtil::getU1Byte(formatPageBuffer, papxOffset) * 2;
+			if (len == 0) {
+				++papxOffset;
+				len = OleUtil::getU1Byte(formatPageBuffer, papxOffset) * 2;
+			}
+
+			const unsigned int styleId = OleUtil::getU2Bytes(formatPageBuffer, papxOffset + 1);
+			Style styleInfo = getStyleFromStylesheet(styleId, myStyleSheet);
+
+			if (len >= 3) {
+				getStyleInfo(papxOffset, formatPageBuffer + 3, len - 3, styleInfo);
+			}
+
+			unsigned int charPos = 0;
+			if (!offsetToCharPos(offset, charPos, myPieces)) {
+				continue;
+			}
+			myStyleInfoList.push_back(CharPosToStyle(charPos, styleInfo));
+		}
+	}
+	return true;
+}
+
+// Reads the PlcfSed (section) table and appends one SectionInfo per section
+// to mySectionInfoList. Sections whose properties cannot be read are skipped.
+// Returns false if the table itself is too short or unreadable.
+bool OleMainStream::readSectionsInfoTable(const char *headerBuffer, const OleEntry &tableEntry) {
+	//PlcfSed structure is a section table
+	unsigned int beginOfText = OleUtil::getU4Bytes(headerBuffer, 0x18); //address of text's begin in main stream
+	unsigned int beginSectInfo = OleUtil::getU4Bytes(headerBuffer, 0xca); //address of PlcfSed structure
+
+	std::size_t sectInfoLen = (std::size_t)OleUtil::getU4Bytes(headerBuffer, 0xce); //length of PlcfSed structure
+	if (sectInfoLen < 4) {
+		return false;
+	}
+
+	OleStream tableStream(myStorage, tableEntry, myBaseStream);
+	std::string buffer;
+	if (!readToBuffer(buffer, beginSectInfo, sectInfoLen, tableStream)) {
+		return false;
+	}
+
+	static const unsigned int SED_SIZE = 12;
+	std::size_t decriptorsCount = calcCountOfPLC(sectInfoLen, SED_SIZE);
+
+	//saving the section offsets (in character positions)
+	std::vector<unsigned int> charPos;
+	for (std::size_t index = 0, tOffset = 0; index < decriptorsCount; ++index, tOffset += 4) {
+		unsigned int ulTextOffset = OleUtil::getU4Bytes(buffer.c_str(), tOffset);
+		charPos.push_back(beginOfText + ulTextOffset);
+	}
+
+	//saving sepx offsets
+	std::vector<unsigned int> sectPage;
+	for (std::size_t index = 0, tOffset = (decriptorsCount + 1) * 4; index < decriptorsCount; ++index, tOffset += SED_SIZE) {
+		sectPage.push_back(OleUtil::getU4Bytes(buffer.c_str(), tOffset + 2));
+	}
+
+	//reading the section properties
+	char tmpBuffer[2];
+	for (std::size_t index = 0; index < sectPage.size(); ++index) {
+		if (sectPage.at(index) == 0xffffffffUL) { //check for invalid record, to make default section info
+			SectionInfo sectionInfo;
+			sectionInfo.CharPosition = charPos.at(index);
+			mySectionInfoList.push_back(sectionInfo);
+			continue;
+		}
+		//getting number of bytes to read
+		if (!seek(sectPage.at(index), true)) {
+			continue;
+		}
+		if (read(tmpBuffer, 2) != 2) {
+			continue;
+		}
+		std::size_t bytes = 2 + (std::size_t)OleUtil::getU2Bytes(tmpBuffer, 0);
+
+		if (!seek(sectPage.at(index), true)) {
+			continue;
+		}
+		// bytes >= 2, so the vector is never empty
+		std::vector<char> formatPage(bytes);
+		char *formatPageBuffer = &formatPage[0];
+		if (read(formatPageBuffer, bytes) != bytes) {
+			continue;
+		}
+		SectionInfo sectionInfo;
+		sectionInfo.CharPosition = charPos.at(index);
+		getSectionInfo(formatPageBuffer + 2, bytes - 2, sectionInfo);
+		mySectionInfoList.push_back(sectionInfo);
+	}
+	return true;
+}
+
+// Applies the paragraph property modifiers found in `bytes` bytes of
+// grpprlBuffer, starting at papxOffset, to styleInfo.
+void OleMainStream::getStyleInfo(unsigned int papxOffset, const char *grpprlBuffer, unsigned int bytes, Style &styleInfo) {
+	int tmp, toDelete, toAdd;
+	unsigned int offset = 0;
+	while (bytes >= offset + 2) {
+		unsigned int curPrlLength = 0;
+		switch (OleUtil::getU2Bytes(grpprlBuffer, papxOffset + offset)) {
+			case 0x2403:
+				styleInfo.Alignment = (Style::AlignmentType)OleUtil::getU1Byte(grpprlBuffer, papxOffset + offset + 2);
+				break;
+			case 0x4610:
+				styleInfo.LeftIndent += OleUtil::getU2Bytes(grpprlBuffer, papxOffset + offset + 2);
+				if (styleInfo.LeftIndent < 0) {
+					styleInfo.LeftIndent = 0;
+				}
+				break;
+			case 0xc60d: // ChgTabsPapx
+			case 0xc615: // ChgTabs
+				// inconsistent tab data: step over a single byte instead of
+				// trusting the length that getPrlLength() would compute
+				tmp = OleUtil::get1Byte(grpprlBuffer, papxOffset + offset + 2);
+				if (tmp < 2) {
+					curPrlLength = 1;
+					break;
+				}
+				toDelete = OleUtil::getU1Byte(grpprlBuffer, papxOffset + offset + 3);
+				if (tmp < 2 + 2 * toDelete) {
+					curPrlLength = 1;
+					break;
+				}
+				toAdd = OleUtil::getU1Byte(grpprlBuffer, papxOffset + offset + 4 + 2 * toDelete);
+				if (tmp < 2 + 2 * toDelete + 2 * toAdd) {
+					curPrlLength = 1;
+					break;
+				}
+				break;
+			case 0x840e:
+				styleInfo.RightIndent = (int)OleUtil::getU2Bytes(grpprlBuffer, papxOffset + offset + 2);
+				break;
+			case 0x840f:
+				styleInfo.LeftIndent = (int)OleUtil::getU2Bytes(grpprlBuffer, papxOffset + offset + 2);
+				break;
+			case 0x8411:
+				styleInfo.FirstLineIndent = (int)OleUtil::getU2Bytes(grpprlBuffer, papxOffset + offset + 2);
+				break;
+			case 0xa413:
+				styleInfo.BeforeParagraphIndent = OleUtil::getU2Bytes(grpprlBuffer, papxOffset + offset + 2);
+				break;
+			case 0xa414:
+				styleInfo.AfterParagraphIndent = OleUtil::getU2Bytes(grpprlBuffer, papxOffset + offset + 2);
+				break;
+			case 0x2407:
+				styleInfo.HasPageBreakBefore = OleUtil::getU1Byte(grpprlBuffer, papxOffset + offset + 2) == 0x01;
+				break;
+			default:
+				break;
+		}
+		if (curPrlLength == 0) {
+			curPrlLength = getPrlLength(grpprlBuffer, papxOffset + offset);
+		}
+		offset += curPrlLength;
+	}
+}
+
+// Applies the character property modifiers (bold, italic, font size) found
+// in `bytes` bytes of grpprlBuffer, starting at chpxOffset, to charInfo.
+void OleMainStream::getCharInfo(unsigned int chpxOffset, unsigned int /*styleId*/, const char *grpprlBuffer, unsigned int bytes, CharInfo &charInfo) {
+	unsigned int sprm = 0; //single property modifier
+	unsigned int offset = 0;
+	while (bytes >= offset + 2) {
+		switch (OleUtil::getU2Bytes(grpprlBuffer, chpxOffset + offset)) {
+			case 0x0835: //bold
+				sprm = OleUtil::getU1Byte(grpprlBuffer, chpxOffset + offset + 2);
+				switch (sprm) {
+					case UNSET:
+						charInfo.FontStyle &= ~CharInfo::FONT_BOLD;
+						break;
+					case SET:
+						charInfo.FontStyle |= CharInfo::FONT_BOLD;
+						break;
+					case UNCHANGED:
+						break;
+					case NEGATION:
+						charInfo.FontStyle ^= CharInfo::FONT_BOLD;
+						break;
+					default:
+						break;
+				}
+				break;
+			case 0x0836: //italic
+				sprm = OleUtil::getU1Byte(grpprlBuffer, chpxOffset + offset + 2);
+				switch (sprm) {
+					case UNSET:
+						charInfo.FontStyle &= ~CharInfo::FONT_ITALIC;
+						break;
+					case SET:
+						charInfo.FontStyle |= CharInfo::FONT_ITALIC;
+						break;
+					case UNCHANGED:
+						break;
+					case NEGATION:
+						charInfo.FontStyle ^= CharInfo::FONT_ITALIC;
+						break;
+					default:
+						break;
+				}
+				break;
+			case 0x4a43: //size of font
+				charInfo.FontSize = OleUtil::getU2Bytes(grpprlBuffer, chpxOffset + offset + 2);
+				break;
+			default:
+				break;
+		}
+		offset += getPrlLength(grpprlBuffer, chpxOffset + offset);
+	}
+}
+
+// Applies the section property modifiers found in the first `bytes` bytes of
+// grpprlBuffer to sectionInfo. Only 0x3009 (new page) is interpreted.
+void OleMainStream::getSectionInfo(const char *grpprlBuffer, std::size_t bytes, SectionInfo &sectionInfo) {
+	unsigned int tmp;
+	std::size_t offset = 0;
+	while (bytes >= offset + 2) {
+		switch (OleUtil::getU2Bytes(grpprlBuffer, offset)) {
+			case 0x3009: //new page
+				tmp = OleUtil::getU1Byte(grpprlBuffer, offset + 2);
+				sectionInfo.IsNewPage = (tmp != 0 && tmp != 1);
+				break;
+			default:
+				break;
+		}
+		offset += getPrlLength(grpprlBuffer, offset);
+	}
+}
+
+// Scans the character property modifiers for an inline picture location.
+// Returns true and sets pictureInfo.DataPosition if a 0x6a03 modifier is
+// found; returns false if none is found or the run is marked as an OLE
+// object (0x080a) or as binary data (0x0806).
+bool OleMainStream::getInlineImageInfo(unsigned int chpxOffset, const char *grpprlBuffer, unsigned int bytes, InlineImageInfo &pictureInfo) {
+	//p. 105 of [MS-DOC] documentation
+	unsigned int offset = 0;
+	bool isFound = false;
+	while (bytes >= offset + 2) {
+		switch (OleUtil::getU2Bytes(grpprlBuffer, chpxOffset + offset)) {
+			case 0x080a: // ole object, p.107 [MS-DOC]
+				if (OleUtil::getU1Byte(grpprlBuffer, chpxOffset + offset + 2) == 0x01) {
+					return false;
+				}
+				break;
+			case 0x0806: // is not a picture, but a binary data? (sprmCFData, p.106 [MS-DOC])
+				if (OleUtil::getU4Bytes(grpprlBuffer, chpxOffset + offset + 2) == 0x01) {
+					return false;
+				}
+				break;
+//			case 0x0855: // sprmCFSpec, p.117 [MS-DOC], MUST BE applied with a value of 1 (see p.105 [MS-DOC])
+//				if (OleUtil::getU1Byte(grpprlBuffer, chpxOffset + offset + 2) != 0x01) {
+//					return false;
+//				}
+//				break;
+			case 0x6a03: // location p.105 [MS-DOC]
+				pictureInfo.DataPosition = OleUtil::getU4Bytes(grpprlBuffer, chpxOffset + offset + 2);
+				isFound = true;
+				break;
+			default:
+				break;
+		}
+		offset += getPrlLength(grpprlBuffer, chpxOffset + offset);
+	}
+	return isFound;
+}
+
+// Returns the stylesheet entry whose StyleIdCurrent equals styleId. If there
+// is none, or styleId is STYLE_INVALID / STYLE_NIL / STYLE_USER, returns a
+// default Style whose StyleIdCurrent is set to styleId.
+OleMainStream::Style OleMainStream::getStyleFromStylesheet(unsigned int styleId, const StyleSheet &stylesheet) {
+	//TODO optimize it: StyleSheet can be map structure with styleId key
+	Style style;
+	if (styleId != Style::STYLE_INVALID && styleId != Style::STYLE_NIL && styleId != Style::STYLE_USER) {
+		for (std::size_t index = 0; index < stylesheet.size(); ++index) {
+			if (stylesheet.at(index).StyleIdCurrent == styleId) {
+				return stylesheet.at(index);
+			}
+		}
+	}
+	style.StyleIdCurrent = styleId;
+	return style;
+}
+
+// Returns the index of the already filled stylesheet entry with the given
+// style id, or -1 if there is none (or styleId is STYLE_INVALID).
+int OleMainStream::getStyleIndex(unsigned int styleId, const std::vector<bool> &isFilled, const StyleSheet &stylesheet) {
+	//TODO optimize it: StyleSheet can be map structure with styleId key
+	//in that case, this method will be excess
+	if (styleId == Style::STYLE_INVALID) {
+		return -1;
+	}
+	for (int index = 0; index < (int)stylesheet.size(); ++index) {
+		if (isFilled.at(index) && stylesheet.at(index).StyleIdCurrent == styleId) {
+			return index;
+		}
+	}
+	return -1;
+}
+
+// Returns the style id of the entry whose range [first, next.first) contains
+// charPos. The last entry is returned whenever the search reaches it;
+// STYLE_INVALID is returned for an empty list.
+unsigned int OleMainStream::getStyleIdByCharPos(unsigned int charPos, const StyleInfoList &styleInfoList) {
+	unsigned int styleId = Style::STYLE_INVALID;
+	for (std::size_t i = 0; i < styleInfoList.size(); ++i) {
+		const Style &info = styleInfoList.at(i).second;
+		if (i == styleInfoList.size() - 1) { //if last
+			styleId = info.StyleIdCurrent;
+			break;
+		}
+		unsigned int curOffset = styleInfoList.at(i).first;
+		unsigned int nextOffset = styleInfoList.at(i + 1).first;
+		if (charPos >= curOffset && charPos < nextOffset) {
+			styleId = info.StyleIdCurrent;
+			break;
+		}
+	}
+	return styleId;
+}
+
+// Converts a byte offset in the main stream into a character position using
+// the piece table. Offsets before the first piece map to position 0.
+// Returns false if `pieces` is empty or the offset lies behind the last piece.
+bool OleMainStream::offsetToCharPos(unsigned int offset, unsigned int &charPos, const Pieces &pieces) {
+	if (pieces.empty()) {
+		return false;
+	}
+	if ((unsigned int)pieces.front().Offset > offset) {
+		charPos = 0;
+		return true;
+	}
+	if ((unsigned int)(pieces.back().Offset + pieces.back().Length) <= offset) {
+		return false;
+	}
+
+	std::size_t pieceNumber = 0;
+	for (std::size_t i = 0; i < pieces.size(); ++i) {
+		if (i == pieces.size() - 1) { //if last
+			pieceNumber = i;
+			break;
+		}
+		unsigned int curOffset = pieces.at(i).Offset;
+		unsigned int nextOffset = pieces.at(i + 1).Offset;
+		if (offset >= curOffset && offset < nextOffset) {
+			pieceNumber = i;
+			break;
+		}
+	}
+
+	const Piece &piece = pieces.at(pieceNumber);
+	unsigned int diffOffset = offset - piece.Offset;
+	if (!piece.IsANSI) {
+		diffOffset /= 2;
+	}
+	charPos = piece.startCP + diffOffset;
+	return true;
+}
+
+// Reads `length` bytes at absolute position `offset` of `stream` into
+// `result`. Returns false, leaving `result` untouched, if seeking fails or
+// fewer than `length` bytes are available.
+bool OleMainStream::readToBuffer(std::string &result, unsigned int offset, std::size_t length, OleStream &stream) {
+	if (!stream.seek(offset, true)) {
+		return false;
+	}
+	// std::vector releases the memory on every return path
+	std::vector<char> buffer(length);
+	if (length != 0 && stream.read(&buffer[0], length) != length) {
+		return false;
+	}
+	result.assign(buffer.begin(), buffer.end());
+	return true;
+}
+
+// Returns the number of elements in a PLC structure of totalSize bytes whose
+// data elements are elementSize bytes each; 0 if totalSize is below 4.
+unsigned int OleMainStream::calcCountOfPLC(unsigned int totalSize, unsigned int elementSize) {
+	//calculates count of elements in PLC structure, formula from p.30 [MS-DOC]
+	if (totalSize < 4) {
+		//without this check the unsigned subtraction below would wrap around
+		return 0;
+	}
+	return (totalSize - 4) / (4 + elementSize);
+}
+
+// Returns the total length in bytes (opcode included) of the property
+// modifier that starts at byteNumber in grpprlBuffer. The length is derived
+// from the top three bits of the opcode.
+unsigned int OleMainStream::getPrlLength(const char *grpprlBuffer, unsigned int byteNumber) {
+	unsigned int tmp;
+	unsigned int opCode = OleUtil::getU2Bytes(grpprlBuffer, byteNumber);
+	switch (opCode & 0xe000) {
+		case 0x0000:
+		case 0x2000:
+			return 3;
+		case 0x4000:
+		case 0x8000:
+		case 0xA000:
+			return 4;
+		case 0xE000:
+			return 5;
+		case 0x6000:
+			return 6;
+		case 0xC000:
+			//counting of info length
+			tmp = OleUtil::getU1Byte(grpprlBuffer, byteNumber + 2);
+			if (opCode == 0xc615 && tmp == 255) {
+				unsigned int del = OleUtil::getU1Byte(grpprlBuffer, byteNumber + 3);
+				unsigned int add = OleUtil::getU1Byte(grpprlBuffer, byteNumber + 4 + del * 4);
+				tmp = 2 + del * 4 + add * 3;
+			}
+			return 3 + tmp;
+		default:
+			return 1;
+	}
+}
diff --git a/reader/src/formats/doc/OleMainStream.h b/reader/src/formats/doc/OleMainStream.h
new file mode 100644
index 0000000..378f037
--- /dev/null
+++ b/reader/src/formats/doc/OleMainStream.h
@@ -0,0 +1,223 @@
+/*
+ * Copyright (C) 2004-2012 Geometer Plus <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#ifndef __OLEMAINSTREAM_H__ +#define __OLEMAINSTREAM_H__ + +#include <vector> +#include <string> + +#include "OleStream.h" +#include "DocFloatImageReader.h" + +class OleMainStream : public OleStream { + +public: + struct Piece { + enum PieceType { + PIECE_TEXT, + PIECE_FOOTNOTE, + PIECE_OTHER + }; + + int Offset; // TODO: maybe make it unsigned int + int Length; // TODO: maybe make it unsigned int + bool IsANSI; + PieceType Type; + unsigned int startCP; + }; + typedef std::vector<Piece> Pieces; + + struct CharInfo { + enum Font { + FONT_REGULAR = 0, + FONT_BOLD = 1 << 0, + FONT_ITALIC = 1 << 1, + FONT_UNDERLINE = 1 << 2, + FONT_CAPITALS = 1 << 3, + FONT_SMALL_CAPS = 1 << 4, + FONT_STRIKE = 1 << 5, + FONT_HIDDEN = 1 << 6, + FONT_MARKDEL = 1 << 7, + FONT_SUPERSCRIPT = 1 << 8, + FONT_SUBSCRIPT = 1 << 9 + }; + + unsigned int FontStyle; + unsigned int FontSize; + + CharInfo(); + }; + typedef std::pair<unsigned int, CharInfo> CharPosToCharInfo; + typedef std::vector<CharPosToCharInfo > CharInfoList; + + struct Style { + enum AlignmentType { + ALIGNMENT_LEFT = 0x00, + ALIGNMENT_CENTER = 0x01, + ALIGNMENT_RIGHT = 0x02, + ALIGNMENT_JUSTIFY = 0x03, + ALIGNMENT_DEFAULT // for case if alignment is not setted by word + }; + + // style Ids: + // (this is not full list of possible style ids, enum is used for using in switch-case) + enum StyleID { + STYLE_H1 = 0x1, + STYLE_H2 = 0x2, + STYLE_H3 = 0x3, + STYLE_USER = 0xFFE, + STYLE_NIL = 0xFFF, + STYLE_INVALID = 0xFFFF + }; + + unsigned int StyleIdCurrent; + unsigned int StyleIdNext; // Next style unless overruled + + bool HasPageBreakBefore; + unsigned int BeforeParagraphIndent; // Vertical indent before paragraph, pixels + unsigned int 
AfterParagraphIndent; // Vertical indent after paragraph, pixels + int LeftIndent; + int FirstLineIndent; + int RightIndent; + AlignmentType Alignment; + CharInfo CurrentCharInfo; + + Style(); + }; + + typedef std::pair<unsigned int, Style> CharPosToStyle; + typedef std::vector<CharPosToStyle> StyleInfoList; + typedef std::vector<Style> StyleSheet; + + struct SectionInfo { + unsigned int CharPosition; + bool IsNewPage; + + SectionInfo(); + }; + typedef std::vector<SectionInfo> SectionInfoList; + + struct Bookmark { + unsigned int CharPosition; + std::string Name; + }; + typedef std::vector<Bookmark> BookmarksList; + + struct InlineImageInfo { + unsigned int DataPosition; + + InlineImageInfo(); + }; + typedef std::pair<unsigned int, InlineImageInfo> CharPosToInlineImageInfo; + typedef std::vector<CharPosToInlineImageInfo> InlineImageInfoList; + + struct FloatImageInfo { + unsigned int ShapeId; + FloatImageInfo(); + }; + typedef std::pair<unsigned int, FloatImageInfo> CharPosToFloatImageInfo; + typedef std::vector<CharPosToFloatImageInfo> FloatImageInfoList; + + enum ImageType { //see p. 
60 [MS-ODRAW] + IMAGE_EMF = 0xF01A, + IMAGE_WMF = 0xF01B, + IMAGE_PICT = 0xF01C, + IMAGE_JPEG = 0xF01D, + IMAGE_PNG = 0xF01E, + IMAGE_DIB = 0xF01F, + IMAGE_TIFF = 0xF029, + IMAGE_JPEG2 = 0xF02A + }; + +public: + OleMainStream(shared_ptr<OleStorage> storage, OleEntry oleEntry, shared_ptr<ZLInputStream> stream); + +public: + bool open(bool doReadFormattingData); + const Pieces &getPieces() const; + const CharInfoList &getCharInfoList() const; + const StyleInfoList &getStyleInfoList() const; + const BookmarksList &getBookmarks() const; + const InlineImageInfoList &getInlineImageInfoList() const; + const FloatImageInfoList &getFloatImageInfoList() const; + + ZLFileImage::Blocks getFloatImage(unsigned int shapeId) const; + ZLFileImage::Blocks getInlineImage(unsigned int dataPos) const; + +private: + bool readFIB(const char *headerBuffer); + bool readPieceTable(const char *headerBuffer, const OleEntry &tableEntry); + bool readBookmarks(const char *headerBuffer, const OleEntry &tableEntry); + bool readStylesheet(const char *headerBuffer, const OleEntry &tableEntry); + bool readSectionsInfoTable(const char *headerBuffer, const OleEntry &tableEntry); + bool readParagraphStyleTable(const char *headerBuffer, const OleEntry &tableEntry); + bool readCharInfoTable(const char *headerBuffer, const OleEntry &tableEntry); + bool readFloatingImages(const char *headerBuffer, const OleEntry &tableEntry); + +private: //readPieceTable helpers methods + static std::string getPiecesTableBuffer(const char *headerBuffer, OleStream &tableStream); + static void splitPieces(const Pieces &source, Pieces &dest1, Pieces &dest2, Piece::PieceType type1, Piece::PieceType type2, int boundary); + +private: //formatting reader helpers methods + static unsigned int getPrlLength(const char *grpprlBuffer, unsigned int byteNumber); + static void getCharInfo(unsigned int chpxOffset, unsigned int styleId, const char *grpprlBuffer, unsigned int bytes, CharInfo &charInfo); + static void getStyleInfo(unsigned 
int papxOffset, const char *grpprlBuffer, unsigned int bytes, Style &styleInfo); + static void getSectionInfo(const char *grpprlBuffer, std::size_t bytes, SectionInfo §ionInfo); + static bool getInlineImageInfo(unsigned int chpxOffset, const char *grpprlBuffer, unsigned int bytes, InlineImageInfo &pictureInfo); + + static Style getStyleFromStylesheet(unsigned int styleId, const StyleSheet &stylesheet); + static int getStyleIndex(unsigned int styleId, const std::vector<bool> &isFilled, const StyleSheet &stylesheet); + static unsigned int getStyleIdByCharPos(unsigned int offset, const StyleInfoList &styleInfoList); + + static bool offsetToCharPos(unsigned int offset, unsigned int &charPos, const Pieces &pieces); + static bool readToBuffer(std::string &result, unsigned int offset, std::size_t length, OleStream &stream); + + static unsigned int calcCountOfPLC(unsigned int totalSize, unsigned int elementSize); + +private: + enum PrlFlag { + UNSET = 0, + SET = 1, + UNCHANGED = 128, + NEGATION = 129 + }; + +private: + int myStartOfText; + int myEndOfText; + + Pieces myPieces; + + StyleSheet myStyleSheet; + + CharInfoList myCharInfoList; + StyleInfoList myStyleInfoList; + SectionInfoList mySectionInfoList; + InlineImageInfoList myInlineImageInfoList; + FloatImageInfoList myFloatImageInfoList; + + BookmarksList myBookmarks; + + shared_ptr<OleStream> myDataStream; + + shared_ptr<DocFloatImageReader> myFLoatImageReader; +}; + +#endif /* __OLEMAINSTREAM_H__ */ diff --git a/reader/src/formats/doc/OleStorage.cpp b/reader/src/formats/doc/OleStorage.cpp new file mode 100644 index 0000000..a7ab81a --- /dev/null +++ b/reader/src/formats/doc/OleStorage.cpp @@ -0,0 +1,304 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any 
later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#include <ZLLogger.h> + +#include "OleStorage.h" +#include "OleUtil.h" + +#include <cstring> + +const std::size_t OleStorage::BBD_BLOCK_SIZE = 512; + +OleStorage::OleStorage() { + clear(); +} + +void OleStorage::clear() { + myInputStream = 0; + mySectorSize = 0; + myShortSectorSize = 0; + myStreamSize = 0; + myRootEntryIndex = -1; + + myDIFAT.clear(); + myBBD.clear(); + mySBD.clear(); + myProperties.clear(); + myEntries.clear(); +} + + + +bool OleStorage::init(shared_ptr<ZLInputStream> stream, std::size_t streamSize) { + clear(); + + myInputStream = stream; + myStreamSize = streamSize; + myInputStream->seek(0, true); + + char oleBuf[BBD_BLOCK_SIZE]; + std::size_t ret = myInputStream->read(oleBuf, BBD_BLOCK_SIZE); + if (ret != BBD_BLOCK_SIZE) { + clear(); + return false; + } + static const char OLE_SIGN[] = {(char)0xD0, (char)0xCF, (char)0x11, (char)0xE0, (char)0xA1, (char)0xB1, (char)0x1A, (char)0xE1, 0}; + if (std::strncmp(oleBuf, OLE_SIGN, 8) != 0) { + clear(); + return false; + } + mySectorSize = 1 << OleUtil::getU2Bytes(oleBuf, 0x1e); //offset for value of big sector size + myShortSectorSize = 1 << OleUtil::getU2Bytes(oleBuf, 0x20); //offset for value of small sector size + + if (readDIFAT(oleBuf) && readBBD(oleBuf) && readSBD(oleBuf) && readProperties(oleBuf) && readAllEntries()) { + return true; + } + clear(); + return false; +} + +bool OleStorage::readDIFAT(char *oleBuf) { + int difatBlock = OleUtil::get4Bytes(oleBuf, 0x44); //address for first difat sector + int 
difatSectorNumbers = OleUtil::get4Bytes(oleBuf, 0x48); //numbers of additional difat records + + //436 of difat records are stored in header, by offset 0x4c + for (unsigned int i = 0; i < 436; i += 4) { + myDIFAT.push_back(OleUtil::get4Bytes(oleBuf + 0x4c, i)); + } + + //for files > 6.78 mb we need read additional DIFAT fields + for (int i = 0; difatBlock > 0 && i < difatSectorNumbers; ++i) { + ZLLogger::Instance().println("DocPlugin", "Read additional data for DIFAT"); + char buffer[mySectorSize]; + myInputStream->seek(BBD_BLOCK_SIZE + difatBlock * mySectorSize, true); + if (myInputStream->read(buffer, mySectorSize) != mySectorSize) { + ZLLogger::Instance().println("DocPlugin", "Error read DIFAT!"); + return false; + } + for (unsigned int j = 0; j < (mySectorSize - 4); j += 4) { + myDIFAT.push_back(OleUtil::get4Bytes(buffer, j)); + } + difatBlock = OleUtil::get4Bytes(buffer, mySectorSize - 4); //next DIFAT block is pointed at the end of the sector + } + + //removing unusable DIFAT links + //0xFFFFFFFF means "free section" + while (!myDIFAT.empty() && myDIFAT.back() == (int)0xFFFFFFFF) { + myDIFAT.pop_back(); + } + return true; +} + +bool OleStorage::readBBD(char *oleBuf) { + char buffer[mySectorSize]; + unsigned int bbdNumberBlocks = OleUtil::getU4Bytes(oleBuf, 0x2c); //number of big blocks + + if (myDIFAT.size() < bbdNumberBlocks) { + //TODO maybe add check on myDIFAT == bbdNumberBlocks + ZLLogger::Instance().println("DocPlugin", "Wrong number of FAT blocks value"); + return false; + } + + for (unsigned int i = 0; i < bbdNumberBlocks; ++i) { + int bbdSector = myDIFAT.at(i); + if (bbdSector >= (int)(myStreamSize / mySectorSize) || bbdSector < 0) { + ZLLogger::Instance().println("DocPlugin", "Bad BBD entry!"); + return false; + } + myInputStream->seek(BBD_BLOCK_SIZE + bbdSector * mySectorSize, true); + if (myInputStream->read(buffer, mySectorSize) != mySectorSize) { + ZLLogger::Instance().println("DocPlugin", "Error during reading BBD!"); + return false; + } + for 
(unsigned int j = 0; j < mySectorSize; j += 4) { + myBBD.push_back(OleUtil::get4Bytes(buffer, j)); + } + } + return true; +} + +bool OleStorage::readSBD(char *oleBuf) { + int sbdCur = OleUtil::get4Bytes(oleBuf, 0x3c); //address of first small sector + int sbdCount = OleUtil::get4Bytes(oleBuf, 0x40); //count of small sectors + + if (sbdCur <= 0) { + ZLLogger::Instance().println("DocPlugin", "There's no SBD, don't read it"); + return true; + } + + char buffer[mySectorSize]; + for (int i = 0; i < sbdCount; ++i) { + if (i != 0) { + if (sbdCur < 0 || (unsigned int)sbdCur >= myBBD.size()) { + ZLLogger::Instance().println("DocPlugin", "error during parsing SBD"); + return false; + } + sbdCur = myBBD.at(sbdCur); + } + if (sbdCur <= 0) { + break; + } + myInputStream->seek(BBD_BLOCK_SIZE + sbdCur * mySectorSize, true); + if (myInputStream->read(buffer, mySectorSize) != mySectorSize) { + ZLLogger::Instance().println("DocPlugin", "reading error during parsing SBD"); + return false; + } + for (unsigned int j = 0; j < mySectorSize; j += 4) { + mySBD.push_back(OleUtil::get4Bytes(buffer, j)); + } + + } + return true; +} + +bool OleStorage::readProperties(char *oleBuf) { + int propCur = OleUtil::get4Bytes(oleBuf, 0x30); //offset for address of sector with first property + if (propCur < 0) { + ZLLogger::Instance().println("DocPlugin", "Wrong first directory sector location"); + return false; + } + + char buffer[mySectorSize]; + do { + myInputStream->seek(BBD_BLOCK_SIZE + propCur * mySectorSize, true); + if (myInputStream->read(buffer, mySectorSize) != mySectorSize) { + ZLLogger::Instance().println("DocPlugin", "Error during reading properties"); + return false; + } + for (unsigned int j = 0; j < mySectorSize; j += 128) { + myProperties.push_back(std::string(buffer + j, 128)); + } + if (propCur < 0 || (std::size_t)propCur >= myBBD.size()) { + break; + } + propCur = myBBD.at(propCur); + } while (propCur >= 0 && propCur < (int)(myStreamSize / mySectorSize)); + return true; +} + +bool 
OleStorage::readAllEntries() { + int propCount = myProperties.size(); + for (int i = 0; i < propCount; ++i) { + OleEntry entry; + bool result = readOleEntry(i, entry); + if (!result) { + break; + } + if (entry.type == OleEntry::ROOT_DIR) { + myRootEntryIndex = i; + } + myEntries.push_back(entry); + } + if (myRootEntryIndex < 0) { + return false; + } + return true; +} + +bool OleStorage::readOleEntry(int propNumber, OleEntry &e) { + static const std::string ROOT_ENTRY = "Root Entry"; + + std::string property = myProperties.at(propNumber); + + char oleType = property.at(0x42); //offset for Ole Type + if (oleType != 1 && oleType != 2 && oleType != 3 && oleType != 5) { + ZLLogger::Instance().println("DocPlugin", "entry -- not right ole type"); + return false; + } + + e.type = (OleEntry::Type)oleType; + + int nameLength = OleUtil::getU2Bytes(property.c_str(), 0x40); //offset for value entry's name length + e.name.clear(); + e.name.reserve(33); //max size of entry name + + if ((unsigned int)nameLength >= property.size()) { + return false; + } + for (int i = 0; i < nameLength; i+=2) { + char c = property.at(i); + if (c != 0) { + e.name += c; + } + } + + e.length = OleUtil::getU4Bytes(property.c_str(), 0x78); //offset for entry's length value + e.isBigBlock = e.length >= 0x1000 || e.name == ROOT_ENTRY; + + // Read sector chain + if (property.size() < 0x74 + 4) { + ZLLogger::Instance().println("DocPlugin", "problems with reading ole entry"); + return false; + } + int chainCur = OleUtil::get4Bytes(property.c_str(), 0x74); //offset for start block of entry + if (chainCur >= 0 && (chainCur <= (int)(myStreamSize / (e.isBigBlock ? 
mySectorSize : myShortSectorSize)))) { + //filling blocks with chains + do { + e.blocks.push_back((unsigned int)chainCur); + if (e.isBigBlock && (std::size_t)chainCur < myBBD.size()) { + chainCur = myBBD.at(chainCur); + } else if (!mySBD.empty() && (std::size_t)chainCur < mySBD.size()) { + chainCur = mySBD.at(chainCur); + } else { + chainCur = -1; + } + } while (chainCur > 0 && + chainCur < (int)(e.isBigBlock ? myBBD.size() : mySBD.size()) && + e.blocks.size() <= e.length / (e.isBigBlock ? mySectorSize : myShortSectorSize)); + } + e.length = std::min(e.length, (unsigned int)((e.isBigBlock ? mySectorSize : myShortSectorSize) * e.blocks.size())); + return true; +} + +bool OleStorage::countFileOffsetOfBlock(const OleEntry &e, unsigned int blockNumber, unsigned int &result) const { + //TODO maybe better syntax can be used? + if (e.blocks.size() <= (std::size_t)blockNumber) { + ZLLogger::Instance().println("DocPlugin", "countFileOffsetOfBlock can't be done, blockNumber is invalid"); + return false; + } + if (e.isBigBlock) { + result = BBD_BLOCK_SIZE + e.blocks.at(blockNumber) * mySectorSize; + } else { + unsigned int sbdPerSector = mySectorSize / myShortSectorSize; + unsigned int sbdSectorNumber = e.blocks.at(blockNumber) / sbdPerSector; + unsigned int sbdSectorMod = e.blocks.at(blockNumber) % sbdPerSector; + if (myEntries.at(myRootEntryIndex).blocks.size() <= (std::size_t)sbdSectorNumber) { + ZLLogger::Instance().println("DocPlugin", "countFileOffsetOfBlock can't be done, invalid sbd data"); + return false; + } + result = BBD_BLOCK_SIZE + myEntries.at(myRootEntryIndex).blocks.at(sbdSectorNumber) * mySectorSize + sbdSectorMod * myShortSectorSize; + } + return true; +} + +bool OleStorage::getEntryByName(std::string name, OleEntry &returnEntry) const { + //TODO fix the workaround for duplicates streams: now it takes a stream with max length + unsigned int maxLength = 0; + for (std::size_t i = 0; i < myEntries.size(); ++i) { + const OleEntry &entry = myEntries.at(i); + if 
(entry.name == name && entry.length >= maxLength) { + returnEntry = entry; + maxLength = entry.length; + } + } + return maxLength > 0; +} + + diff --git a/reader/src/formats/doc/OleStorage.h b/reader/src/formats/doc/OleStorage.h new file mode 100644 index 0000000..584ee94 --- /dev/null +++ b/reader/src/formats/doc/OleStorage.h @@ -0,0 +1,92 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
// One directory entry of an OLE compound file, as decoded by
// OleStorage::readOleEntry().
struct OleEntry {
	// raw type byte of the directory record; other values are rejected
	enum Type {
		DIR = 1,
		STREAM = 2,
		ROOT_DIR = 5,
		LOCK_BYTES =3
	};

	typedef std::vector<unsigned int> Blocks;

	std::string name;
	unsigned int length; // size in bytes, limited to what `blocks` can hold
	Type type;
	Blocks blocks; // sector numbers of the entry, in chain order
	bool isBigBlock; // true: `blocks` are big sectors; false: short sectors inside the root entry
};

// Reader for the container level of an OLE compound file: parses the header
// and the allocation tables, and lists the directory entries.
class OleStorage {

public:
	// size of the file header in bytes; sector 0 starts right after it
	static const std::size_t BBD_BLOCK_SIZE;

public:
	OleStorage();
	// Parses the given stream; streamSize is its total size in bytes.
	// Returns false (and clears the storage) if the file cannot be parsed.
	bool init(shared_ptr<ZLInputStream>, std::size_t streamSize);
	// Resets the storage to its empty state.
	void clear();
	const std::vector<OleEntry> &getEntries() const;
	// Looks an entry up by name; among entries with the same name the longest
	// one is taken. Returns false if no non-empty entry matches.
	bool getEntryByName(std::string name, OleEntry &entry) const;

	unsigned int getSectorSize() const;
	unsigned int getShortSectorSize() const;

public: //TODO make private
	// Stores in `result` the absolute file offset of block blockNumber of `e`;
	// returns false if it cannot be computed.
	bool countFileOffsetOfBlock(const OleEntry &e, unsigned int blockNumber, unsigned int &result) const;

private:
	// header/table readers; oleBuf is the file header read by init()
	bool readDIFAT(char *oleBuf);
	bool readBBD(char *oleBuf);
	bool readSBD(char *oleBuf);
	bool readProperties(char *oleBuf);

	bool readAllEntries();
	bool readOleEntry(int propNumber, OleEntry &entry);

private:

	shared_ptr<ZLInputStream> myInputStream;
	unsigned int mySectorSize, myShortSectorSize; // in bytes

	std::size_t myStreamSize;
	std::vector<int> myDIFAT; //double-indirect file allocation table
	std::vector<int> myBBD; //Big Block Depot
	std::vector<int> mySBD; //Small Block Depot
	std::vector<std::string> myProperties; // raw 128-byte directory records
	std::vector<OleEntry> myEntries;
	int myRootEntryIndex; // index into myEntries of the ROOT_DIR entry, -1 if none

};

inline const std::vector<OleEntry> &OleStorage::getEntries() const { return myEntries; }
inline unsigned int OleStorage::getSectorSize() const { return mySectorSize; }
inline unsigned int OleStorage::getShortSectorSize() const { return myShortSectorSize; }
b/reader/src/formats/doc/OleStream.cpp @@ -0,0 +1,221 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#include <ZLLogger.h> + +#include "OleStream.h" +#include "OleUtil.h" + +OleStream::OleStream(shared_ptr<OleStorage> storage, OleEntry oleEntry, shared_ptr<ZLInputStream> stream) : + myStorage(storage), + myOleEntry(oleEntry), + myBaseStream(stream) { + myOleOffset = 0; +} + + +bool OleStream::open() { + if (myOleEntry.type != OleEntry::STREAM) { + return false; + } + return true; +} + +std::size_t OleStream::read(char *buffer, std::size_t maxSize) { + std::size_t length = maxSize; + std::size_t readedBytes = 0; + std::size_t bytesLeftInCurBlock; + unsigned int newFileOffset; + + unsigned int curBlockNumber, modBlock; + std::size_t toReadBlocks, toReadBytes; + + if (myOleOffset + length > myOleEntry.length) { + length = myOleEntry.length - myOleOffset; + } + + std::size_t sectorSize = (std::size_t)(myOleEntry.isBigBlock ? 
myStorage->getSectorSize() : myStorage->getShortSectorSize()); + + curBlockNumber = myOleOffset / sectorSize; + if (curBlockNumber >= myOleEntry.blocks.size()) { + return 0; + } + modBlock = myOleOffset % sectorSize; + bytesLeftInCurBlock = sectorSize - modBlock; + if (bytesLeftInCurBlock < length) { + toReadBlocks = (length - bytesLeftInCurBlock) / sectorSize; + toReadBytes = (length - bytesLeftInCurBlock) % sectorSize; + } else { + toReadBlocks = toReadBytes = 0; + } + + if (!myStorage->countFileOffsetOfBlock(myOleEntry, curBlockNumber, newFileOffset)) { + return 0; + } + newFileOffset += modBlock; + + myBaseStream->seek(newFileOffset, true); + + readedBytes = myBaseStream->read(buffer, std::min(length, bytesLeftInCurBlock)); + for (std::size_t i = 0; i < toReadBlocks; ++i) { + if (++curBlockNumber >= myOleEntry.blocks.size()) { + break; + } + if (!myStorage->countFileOffsetOfBlock(myOleEntry, curBlockNumber, newFileOffset)) { + return readedBytes; + } + myBaseStream->seek(newFileOffset, true); + readedBytes += myBaseStream->read(buffer + readedBytes, std::min(length - readedBytes, sectorSize)); + } + if (toReadBytes > 0 && ++curBlockNumber < myOleEntry.blocks.size()) { + if (!myStorage->countFileOffsetOfBlock(myOleEntry, curBlockNumber, newFileOffset)) { + return readedBytes; + } + myBaseStream->seek(newFileOffset, true); + readedBytes += myBaseStream->read(buffer + readedBytes, toReadBytes); + } + myOleOffset += readedBytes; + return readedBytes; +} + +bool OleStream::eof() const { + return (myOleOffset >= myOleEntry.length); +} + + +void OleStream::close() { +} + +bool OleStream::seek(unsigned int offset, bool absoluteOffset) { + unsigned int newOleOffset = 0; + unsigned int newFileOffset; + + if (absoluteOffset) { + newOleOffset = offset; + } else { + newOleOffset = myOleOffset + offset; + } + + newOleOffset = std::min(newOleOffset, myOleEntry.length); + + unsigned int sectorSize = (myOleEntry.isBigBlock ? 
myStorage->getSectorSize() : myStorage->getShortSectorSize()); + unsigned int blockNumber = newOleOffset / sectorSize; + if (blockNumber >= myOleEntry.blocks.size()) { + return false; + } + + unsigned int modBlock = newOleOffset % sectorSize; + if (!myStorage->countFileOffsetOfBlock(myOleEntry, blockNumber, newFileOffset)) { + return false; + } + newFileOffset += modBlock; + myBaseStream->seek(newFileOffset, true); + myOleOffset = newOleOffset; + return true; +} + +std::size_t OleStream::offset() { + return myOleOffset; +} + +ZLFileImage::Blocks OleStream::getBlockPieceInfoList(unsigned int offset, unsigned int size) const { + ZLFileImage::Blocks list; + unsigned int sectorSize = (myOleEntry.isBigBlock ? myStorage->getSectorSize() : myStorage->getShortSectorSize()); + unsigned int curBlockNumber = offset / sectorSize; + if (curBlockNumber >= myOleEntry.blocks.size()) { + return list; + } + unsigned int modBlock = offset % sectorSize; + unsigned int startFileOffset = 0; + if (!myStorage->countFileOffsetOfBlock(myOleEntry, curBlockNumber, startFileOffset)) { + return ZLFileImage::Blocks(); + } + startFileOffset += modBlock; + + unsigned int bytesLeftInCurBlock = sectorSize - modBlock; + unsigned int toReadBlocks = 0, toReadBytes = 0; + if (bytesLeftInCurBlock < size) { + toReadBlocks = (size - bytesLeftInCurBlock) / sectorSize; + toReadBytes = (size - bytesLeftInCurBlock) % sectorSize; + } + + unsigned int readedBytes = std::min(size, bytesLeftInCurBlock); + list.push_back(ZLFileImage::Block(startFileOffset, readedBytes)); + + for (unsigned int i = 0; i < toReadBlocks; ++i) { + if (++curBlockNumber >= myOleEntry.blocks.size()) { + break; + } + unsigned int newFileOffset = 0; + if (!myStorage->countFileOffsetOfBlock(myOleEntry, curBlockNumber, newFileOffset)) { + return ZLFileImage::Blocks(); + } + unsigned int readbytes = std::min(size - readedBytes, sectorSize); + list.push_back(ZLFileImage::Block(newFileOffset, readbytes)); + readedBytes += readbytes; + } + if 
(toReadBytes > 0 && ++curBlockNumber < myOleEntry.blocks.size()) { + unsigned int newFileOffset = 0; + if (!myStorage->countFileOffsetOfBlock(myOleEntry, curBlockNumber, newFileOffset)) { + return ZLFileImage::Blocks(); + } + unsigned int readbytes = toReadBytes; + list.push_back(ZLFileImage::Block(newFileOffset, readbytes)); + readedBytes += readbytes; + } + + return concatBlocks(list); +} + +ZLFileImage::Blocks OleStream::concatBlocks(const ZLFileImage::Blocks &blocks) { + if (blocks.size() < 2) { + return blocks; + } + ZLFileImage::Blocks optList; + ZLFileImage::Block curBlock = blocks.at(0); + unsigned int nextOffset = curBlock.offset + curBlock.size; + for (std::size_t i = 1; i < blocks.size(); ++i) { + ZLFileImage::Block b = blocks.at(i); + if (b.offset == nextOffset) { + curBlock.size += b.size; + nextOffset += b.size; + } else { + optList.push_back(curBlock); + curBlock = b; + nextOffset = curBlock.offset + curBlock.size; + } + } + optList.push_back(curBlock); + return optList; +} + +std::size_t OleStream::fileOffset() { + //TODO maybe remove this method, it doesn't use at this time + std::size_t sectorSize = (std::size_t)(myOleEntry.isBigBlock ? myStorage->getSectorSize() : myStorage->getShortSectorSize()); + unsigned int curBlockNumber = myOleOffset / sectorSize; + if (curBlockNumber >= myOleEntry.blocks.size()) { + return 0; + } + unsigned int modBlock = myOleOffset % sectorSize; + unsigned int curOffset = 0; + if (!myStorage->countFileOffsetOfBlock(myOleEntry, curBlockNumber, curOffset)) { + return 0; //TODO maybe remove -1? 
+ } + return curOffset + modBlock; +} diff --git a/reader/src/formats/doc/OleStream.h b/reader/src/formats/doc/OleStream.h new file mode 100644 index 0000000..861c7cb --- /dev/null +++ b/reader/src/formats/doc/OleStream.h @@ -0,0 +1,58 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
// Sequential/seekable view of one entry of an OLE compound file. The entry's
// sectors may be scattered over the file; this class presents them as one
// continuous stream.
// NOTE(review): OleMainStream derives from this class and no virtual
// destructor is declared here -- confirm objects are never deleted through an
// OleStream pointer.
class OleStream {

public:
	OleStream(shared_ptr<OleStorage> storage, OleEntry oleEntry, shared_ptr<ZLInputStream> stream);

public:
	// Returns false unless the entry is of type OleEntry::STREAM.
	bool open();
	// Copies up to maxSize bytes into buffer; returns the number of bytes copied.
	std::size_t read(char *buffer, std::size_t maxSize);
	void close();

public:
	// Moves the position; offset is absolute or relative to the current
	// position depending on absoluteOffset. Returns false on failure.
	bool seek(unsigned int offset, bool absoluteOffset);
	// Current position inside the entry.
	std::size_t offset();

public:
	// File locations of `size` bytes of the entry starting at `offset`.
	ZLFileImage::Blocks getBlockPieceInfoList(unsigned int offset, unsigned int size) const;
	// Merges blocks that are adjacent in the file.
	static ZLFileImage::Blocks concatBlocks(const ZLFileImage::Blocks &blocks);
	// Position in the underlying file that corresponds to offset(); 0 on failure.
	std::size_t fileOffset();

public:
	bool eof() const;

protected:
	shared_ptr<OleStorage> myStorage;

	OleEntry myOleEntry;
	shared_ptr<ZLInputStream> myBaseStream; // the underlying file

	unsigned int myOleOffset; // current position inside the entry, in bytes
};
+ */ + +//#include <cctype> +//#include <cstring> + +#include <ZLLogger.h> + +#include "OleMainStream.h" +#include "OleUtil.h" +#include "OleStreamParser.h" + +//word's control chars: +const ZLUnicodeUtil::Ucs2Char OleStreamParser::WORD_FOOTNOTE_MARK = 0x0002; +const ZLUnicodeUtil::Ucs2Char OleStreamParser::WORD_TABLE_SEPARATOR = 0x0007; +const ZLUnicodeUtil::Ucs2Char OleStreamParser::WORD_HORIZONTAL_TAB = 0x0009; +const ZLUnicodeUtil::Ucs2Char OleStreamParser::WORD_HARD_LINEBREAK = 0x000b; +const ZLUnicodeUtil::Ucs2Char OleStreamParser::WORD_PAGE_BREAK = 0x000c; +const ZLUnicodeUtil::Ucs2Char OleStreamParser::WORD_END_OF_PARAGRAPH = 0x000d; +const ZLUnicodeUtil::Ucs2Char OleStreamParser::WORD_MINUS = 0x001e; +const ZLUnicodeUtil::Ucs2Char OleStreamParser::WORD_SOFT_HYPHEN = 0x001f; +const ZLUnicodeUtil::Ucs2Char OleStreamParser::WORD_START_FIELD = 0x0013; +const ZLUnicodeUtil::Ucs2Char OleStreamParser::WORD_SEPARATOR_FIELD = 0x0014; +const ZLUnicodeUtil::Ucs2Char OleStreamParser::WORD_END_FIELD = 0x0015; +const ZLUnicodeUtil::Ucs2Char OleStreamParser::WORD_ZERO_WIDTH_UNBREAKABLE_SPACE = 0xfeff; +const ZLUnicodeUtil::Ucs2Char OleStreamParser::INLINE_IMAGE = 0x0001; +const ZLUnicodeUtil::Ucs2Char OleStreamParser::FLOAT_IMAGE = 0x0008; + +//unicode values: +const ZLUnicodeUtil::Ucs2Char OleStreamParser::NULL_SYMBOL = 0x0; +const ZLUnicodeUtil::Ucs2Char OleStreamParser::FILE_SEPARATOR = 0x1c; +const ZLUnicodeUtil::Ucs2Char OleStreamParser::LINE_FEED = 0x000a; +const ZLUnicodeUtil::Ucs2Char OleStreamParser::SOFT_HYPHEN = 0xad; +const ZLUnicodeUtil::Ucs2Char OleStreamParser::SPACE = 0x20; +const ZLUnicodeUtil::Ucs2Char OleStreamParser::MINUS = 0x2D; +const ZLUnicodeUtil::Ucs2Char OleStreamParser::VERTICAL_LINE = 0x7C; + +OleStreamParser::OleStreamParser() { + myCurBufferPosition = 0; + + myCurCharPos = 0; + myNextStyleInfoIndex = 0; + myNextCharInfoIndex = 0; + myNextBookmarkIndex = 0; + myNextInlineImageInfoIndex = 0; + myNextFloatImageInfoIndex = 0; +} + +bool 
OleStreamParser::readStream(OleMainStream &oleMainStream) { + ZLUnicodeUtil::Ucs2Char ucs2char; + bool tabMode = false; + while (getUcs2Char(oleMainStream, ucs2char)) { + if (tabMode) { + tabMode = false; + if (ucs2char == WORD_TABLE_SEPARATOR) { + handleTableEndRow(); + continue; + } else { + handleTableSeparator(); + } + } + + if (ucs2char < 32) { + switch (ucs2char) { + case NULL_SYMBOL: + break; + case WORD_HARD_LINEBREAK: + handleHardLinebreak(); + break; + case WORD_END_OF_PARAGRAPH: + case WORD_PAGE_BREAK: + handleParagraphEnd(); + break; + case WORD_TABLE_SEPARATOR: + tabMode = true; + break; + case WORD_FOOTNOTE_MARK: + handleFootNoteMark(); + break; + case WORD_START_FIELD: + handleStartField(); + break; + case WORD_SEPARATOR_FIELD: + handleSeparatorField(); + break; + case WORD_END_FIELD: + handleEndField(); + break; + case INLINE_IMAGE: + case FLOAT_IMAGE: + break; + default: + handleOtherControlChar(ucs2char); + break; + } + } else if (ucs2char == WORD_ZERO_WIDTH_UNBREAKABLE_SPACE) { + continue; //skip + } else { + handleChar(ucs2char); + } + } + + return true; +} + +bool OleStreamParser::getUcs2Char(OleMainStream &stream, ZLUnicodeUtil::Ucs2Char &ucs2char) { + while (myCurBufferPosition >= myBuffer.size()) { + myBuffer.clear(); + myCurBufferPosition = 0; + if (!readNextPiece(stream)) { + return false; + } + } + ucs2char = myBuffer.at(myCurBufferPosition++); + processStyles(stream); + + switch (ucs2char) { + case INLINE_IMAGE: + processInlineImage(stream); + break; + case FLOAT_IMAGE: + processFloatImage(stream); + break; + } + ++myCurCharPos; + return true; +} + +void OleStreamParser::processInlineImage(OleMainStream &stream) { + const OleMainStream::InlineImageInfoList &imageInfoList = stream.getInlineImageInfoList(); + if (imageInfoList.empty()) { + return; + } + //seek to curCharPos, because not all entries are real pictures + while(myNextInlineImageInfoIndex < imageInfoList.size() && imageInfoList.at(myNextInlineImageInfoIndex).first < 
myCurCharPos) { + ++myNextInlineImageInfoIndex; + } + while (myNextInlineImageInfoIndex < imageInfoList.size() && imageInfoList.at(myNextInlineImageInfoIndex).first == myCurCharPos) { + OleMainStream::InlineImageInfo info = imageInfoList.at(myNextInlineImageInfoIndex).second; + ZLFileImage::Blocks list = stream.getInlineImage(info.DataPosition); + if (!list.empty()) { + handleImage(list); + } + ++myNextInlineImageInfoIndex; + } +} + +void OleStreamParser::processFloatImage(OleMainStream &stream) { + const OleMainStream::FloatImageInfoList &imageInfoList = stream.getFloatImageInfoList(); + if (imageInfoList.empty()) { + return; + } + //seek to curCharPos, because not all entries are real pictures + while(myNextFloatImageInfoIndex < imageInfoList.size() && imageInfoList.at(myNextFloatImageInfoIndex).first < myCurCharPos) { + ++myNextFloatImageInfoIndex; + } + while (myNextFloatImageInfoIndex < imageInfoList.size() && imageInfoList.at(myNextFloatImageInfoIndex).first == myCurCharPos) { + OleMainStream::FloatImageInfo info = imageInfoList.at(myNextFloatImageInfoIndex).second; + ZLFileImage::Blocks list = stream.getFloatImage(info.ShapeId); + if (!list.empty()) { + handleImage(list); + } + ++myNextFloatImageInfoIndex; + } +} + +void OleStreamParser::processStyles(OleMainStream &stream) { + const OleMainStream::StyleInfoList &styleInfoList = stream.getStyleInfoList(); + if (!styleInfoList.empty()) { + while (myNextStyleInfoIndex < styleInfoList.size() && styleInfoList.at(myNextStyleInfoIndex).first == myCurCharPos) { + OleMainStream::Style info = styleInfoList.at(myNextStyleInfoIndex).second; + handleParagraphStyle(info); + ++myNextStyleInfoIndex; + } + } + + const OleMainStream::CharInfoList &charInfoList = stream.getCharInfoList(); + if (!charInfoList.empty()) { + while (myNextCharInfoIndex < charInfoList.size() && charInfoList.at(myNextCharInfoIndex).first == myCurCharPos) { + OleMainStream::CharInfo info = charInfoList.at(myNextCharInfoIndex).second; + 
handleFontStyle(info.FontStyle); + ++myNextCharInfoIndex; + } + } + + const OleMainStream::BookmarksList &bookmarksList = stream.getBookmarks(); + if (!bookmarksList.empty()) { + while (myNextBookmarkIndex < bookmarksList.size() && bookmarksList.at(myNextBookmarkIndex).CharPosition == myCurCharPos) { + OleMainStream::Bookmark bookmark = bookmarksList.at(myNextBookmarkIndex); + handleBookmark(bookmark.Name); + ++myNextBookmarkIndex; + } + } +} diff --git a/reader/src/formats/doc/OleStreamParser.h b/reader/src/formats/doc/OleStreamParser.h new file mode 100644 index 0000000..1adec2f --- /dev/null +++ b/reader/src/formats/doc/OleStreamParser.h @@ -0,0 +1,101 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ */ + +#ifndef __OLESTREAMPARSER_H__ +#define __OLESTREAMPARSER_H__ + +#include <ZLUnicodeUtil.h> + +#include "OleMainStream.h" +#include "OleStreamReader.h" + +class OleStreamParser : public OleStreamReader { + +public: + //word's control chars: + static const ZLUnicodeUtil::Ucs2Char WORD_FOOTNOTE_MARK; + static const ZLUnicodeUtil::Ucs2Char WORD_TABLE_SEPARATOR; + static const ZLUnicodeUtil::Ucs2Char WORD_HORIZONTAL_TAB; + static const ZLUnicodeUtil::Ucs2Char WORD_HARD_LINEBREAK; + static const ZLUnicodeUtil::Ucs2Char WORD_PAGE_BREAK; + static const ZLUnicodeUtil::Ucs2Char WORD_END_OF_PARAGRAPH; + static const ZLUnicodeUtil::Ucs2Char WORD_MINUS; + static const ZLUnicodeUtil::Ucs2Char WORD_SOFT_HYPHEN; + static const ZLUnicodeUtil::Ucs2Char WORD_START_FIELD; + static const ZLUnicodeUtil::Ucs2Char WORD_SEPARATOR_FIELD; + static const ZLUnicodeUtil::Ucs2Char WORD_END_FIELD; + static const ZLUnicodeUtil::Ucs2Char WORD_ZERO_WIDTH_UNBREAKABLE_SPACE; + static const ZLUnicodeUtil::Ucs2Char INLINE_IMAGE; + static const ZLUnicodeUtil::Ucs2Char FLOAT_IMAGE; + + //unicode values: + static const ZLUnicodeUtil::Ucs2Char NULL_SYMBOL; + static const ZLUnicodeUtil::Ucs2Char FILE_SEPARATOR; + static const ZLUnicodeUtil::Ucs2Char LINE_FEED; + static const ZLUnicodeUtil::Ucs2Char SOFT_HYPHEN; + static const ZLUnicodeUtil::Ucs2Char SPACE; + static const ZLUnicodeUtil::Ucs2Char MINUS; + static const ZLUnicodeUtil::Ucs2Char VERTICAL_LINE; + +public: + OleStreamParser(); + +private: + bool readStream(OleMainStream &stream); + +protected: + virtual void handleChar(ZLUnicodeUtil::Ucs2Char ucs2char) = 0; + virtual void handleHardLinebreak() = 0; + virtual void handleParagraphEnd() = 0; + virtual void handlePageBreak() = 0; + virtual void handleTableSeparator() = 0; + virtual void handleTableEndRow() = 0; + virtual void handleFootNoteMark() = 0; + virtual void handleStartField() = 0; + virtual void handleSeparatorField() = 0; + virtual void handleEndField() = 0; + virtual void 
handleImage(const ZLFileImage::Blocks &blocks) = 0; + virtual void handleOtherControlChar(ZLUnicodeUtil::Ucs2Char ucs2char) = 0; + + virtual void handleFontStyle(unsigned int fontStyle) = 0; + virtual void handleParagraphStyle(const OleMainStream::Style &styleInfo) = 0; + virtual void handleBookmark(const std::string &name) = 0; + +private: + bool getUcs2Char(OleMainStream &stream, ZLUnicodeUtil::Ucs2Char &ucs2char); + void processInlineImage(OleMainStream &stream); + void processFloatImage(OleMainStream &stream); + void processStyles(OleMainStream &stream); + +private: +protected: + ZLUnicodeUtil::Ucs2String myBuffer; +private: + std::size_t myCurBufferPosition; + + unsigned int myCurCharPos; + + std::size_t myNextStyleInfoIndex; + std::size_t myNextCharInfoIndex; + std::size_t myNextBookmarkIndex; + std::size_t myNextInlineImageInfoIndex; + std::size_t myNextFloatImageInfoIndex; +}; + +#endif /* __OLESTREAMPARSER_H__ */ diff --git a/reader/src/formats/doc/OleStreamReader.cpp b/reader/src/formats/doc/OleStreamReader.cpp new file mode 100644 index 0000000..224489a --- /dev/null +++ b/reader/src/formats/doc/OleStreamReader.cpp @@ -0,0 +1,86 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ */ + +#include <ZLLogger.h> + +#include "OleMainStream.h" +#include "OleUtil.h" +#include "OleStreamReader.h" + +OleStreamReader::OleStreamReader() : myNextPieceNumber(0) { +} + +bool OleStreamReader::readDocument(shared_ptr<ZLInputStream> inputStream, bool doReadFormattingData) { + static const std::string WORD_DOCUMENT = "WordDocument"; + + shared_ptr<OleStorage> storage = new OleStorage; + + if (!storage->init(inputStream, inputStream->sizeOfOpened())) { + ZLLogger::Instance().println("DocPlugin", "Broken OLE file"); + return false; + } + + OleEntry wordDocumentEntry; + if (!storage->getEntryByName(WORD_DOCUMENT, wordDocumentEntry)) { + return false; + } + + OleMainStream oleStream(storage, wordDocumentEntry, inputStream); + if (!oleStream.open(doReadFormattingData)) { + ZLLogger::Instance().println("DocPlugin", "Cannot open OleMainStream"); + return false; + } + return readStream(oleStream); +} + +bool OleStreamReader::readNextPiece(OleMainStream &stream) { + const OleMainStream::Pieces &pieces = stream.getPieces(); + if (myNextPieceNumber >= pieces.size()) { + return false; + } + const OleMainStream::Piece &piece = pieces.at(myNextPieceNumber); + + if (piece.Type == OleMainStream::Piece::PIECE_FOOTNOTE) { + footnotesStartHandler(); + } else if (piece.Type == OleMainStream::Piece::PIECE_OTHER) { + return false; + } + + if (!stream.seek(piece.Offset, true)) { + //TODO maybe in that case we should take next piece? 
+ return false; + } + char *textBuffer = new char[piece.Length]; + std::size_t readBytes = stream.read(textBuffer, piece.Length); + if (readBytes != (std::size_t)piece.Length) { + ZLLogger::Instance().println("DocPlugin", "not all bytes have been read from piece"); + } + + if (!piece.IsANSI) { + for (std::size_t i = 0; i < readBytes; i += 2) { + ucs2SymbolHandler(OleUtil::getU2Bytes(textBuffer, i)); + } + } else { + ansiDataHandler(textBuffer, readBytes); + } + ++myNextPieceNumber; + delete[] textBuffer; + + return true; +} diff --git a/reader/src/formats/doc/OleStreamReader.h b/reader/src/formats/doc/OleStreamReader.h new file mode 100644 index 0000000..2d2a0ae --- /dev/null +++ b/reader/src/formats/doc/OleStreamReader.h @@ -0,0 +1,46 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ */ + +#ifndef __OLESTREAMREADER_H__ +#define __OLESTREAMREADER_H__ + +#include <ZLUnicodeUtil.h> + +#include "OleMainStream.h" + +class OleStreamReader { + +public: + OleStreamReader(); + bool readDocument(shared_ptr<ZLInputStream> stream, bool doReadFormattingData); + +protected: + virtual bool readStream(OleMainStream &stream) = 0; + + bool readNextPiece(OleMainStream &stream); + + virtual void ansiDataHandler(const char *buffer, std::size_t len) = 0; + virtual void ucs2SymbolHandler(ZLUnicodeUtil::Ucs2Char symbol) = 0; + virtual void footnotesStartHandler() = 0; + +private: + std::size_t myNextPieceNumber; +}; + +#endif /* __OLESTREAMREADER_H__ */ diff --git a/reader/src/formats/doc/OleUtil.cpp b/reader/src/formats/doc/OleUtil.cpp new file mode 100644 index 0000000..2e8f685 --- /dev/null +++ b/reader/src/formats/doc/OleUtil.cpp @@ -0,0 +1,58 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ */ + +#include "OleUtil.h" + +int OleUtil::get4Bytes(const char *buffer, unsigned int offset) { + const unsigned char *buf = (const unsigned char*)buffer; + return + (int)buf[offset] + | ((int)buf[offset+1] << 8) + | ((int)buf[offset+2] << 16) + | ((int)buf[offset+3] << 24); +} + +unsigned int OleUtil::getU4Bytes(const char *buffer, unsigned int offset) { + const unsigned char *buf = (const unsigned char*)buffer; + return + (unsigned int)buf[offset] + | ((unsigned int)buf[offset+1] << 8) + | ((unsigned int)buf[offset+2] << 16) + | ((unsigned int)buf[offset+3] << 24); +} + +unsigned int OleUtil::getU2Bytes(const char *buffer, unsigned int offset) { + const unsigned char *buf = (const unsigned char*)buffer; + return + (unsigned int)buf[offset] + | ((unsigned int)buf[offset+1] << 8); +} + +unsigned int OleUtil::getU1Byte(const char *buffer, unsigned int offset) { + const unsigned char *buf = (const unsigned char*)buffer; + return (unsigned int)buf[offset]; +} + +int OleUtil::get1Byte(const char *buffer, unsigned int offset) { + const unsigned char *buf = (const unsigned char*)buffer; + return (int)buf[offset]; +} + + + diff --git a/reader/src/formats/doc/OleUtil.h b/reader/src/formats/doc/OleUtil.h new file mode 100644 index 0000000..531c769 --- /dev/null +++ b/reader/src/formats/doc/OleUtil.h @@ -0,0 +1,32 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#ifndef __OLEUTIL_H__ +#define __OLEUTIL_H__ + +class OleUtil { +public: + static int get4Bytes(const char *buffer, unsigned int offset); + static unsigned int getU4Bytes(const char *buffer, unsigned int offset); + static unsigned int getU2Bytes(const char *buffer, unsigned int offset); + static unsigned int getU1Byte(const char *buffer, unsigned int offset); + static int get1Byte(const char *buffer, unsigned int offset); +}; + +#endif /* __OLEUTIL_H__ */ diff --git a/reader/src/formats/docbook/DocBookBookReader.cpp b/reader/src/formats/docbook/DocBookBookReader.cpp new file mode 100644 index 0000000..eada90c --- /dev/null +++ b/reader/src/formats/docbook/DocBookBookReader.cpp @@ -0,0 +1,111 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ */ + +#include <ZLInputStream.h> + +#include "DocBookBookReader.h" + +#include "../../bookmodel/BookModel.h" +#include "../../model/Paragraph.h" + +DocBookBookReader::DocBookBookReader(BookModel &model) : BookReader(model) { + setMainTextModel(); + + myReadText = false; +} + +void DocBookBookReader::characterDataHandler(const char *text, std::size_t len) { + addDataToBuffer(text, len); +} + +void DocBookBookReader::startElementHandler(int tag, const char **) { + switch (tag) { + case _SECT1: + myReadText = true; + pushKind(REGULAR); + beginContentsParagraph(); + break; + case _PARA: + if (myReadText) { + beginParagraph(); + } + break; + case _TITLE: + enterTitle(); + pushKind(SECTION_TITLE); + if (myReadText) { + beginParagraph(); + } + break; + case _EMPHASIS: + addControl(EMPHASIS, true); + break; + case _CITETITLE: + addControl(CITE, true); + break; + case _ULINK: + case _EMAIL: + addControl(CODE, true); + break; + case _BLOCKQUOTE: + pushKind(STRONG); + break; + default: + break; + } +} + +void DocBookBookReader::endElementHandler(int tag) { + switch (tag) { + case _SECT1: + myReadText = false; + popKind(); + endContentsParagraph(); + insertEndOfSectionParagraph(); + break; + case _PARA: + endParagraph(); + break; + case _TITLE: + endParagraph(); + popKind(); + endContentsParagraph(); + exitTitle(); + break; + case _EMPHASIS: + addControl(EMPHASIS, false); + break; + case _CITETITLE: + addControl(CITE, false); + break; + case _ULINK: + case _EMAIL: + addControl(CODE, false); + break; + case _BLOCKQUOTE: + popKind(); + break; + default: + break; + } +} + +void DocBookBookReader::readBook(shared_ptr<ZLInputStream> stream) { + readDocument(stream); +} diff --git a/reader/src/formats/docbook/DocBookBookReader.h b/reader/src/formats/docbook/DocBookBookReader.h new file mode 100644 index 0000000..c226184 --- /dev/null +++ b/reader/src/formats/docbook/DocBookBookReader.h @@ -0,0 +1,45 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This 
program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#ifndef __DOCBOOKBOOKREADER_H__ +#define __DOCBOOKBOOKREADER_H__ + +#include "DocBookReader.h" +#include "../../bookmodel/BookReader.h" + +class BookModel; + +class DocBookBookReader : public BookReader, public DocBookReader { + +public: + DocBookBookReader(BookModel &model); + ~DocBookBookReader(); + void readBook(shared_ptr<ZLInputStream> stream); + + void startElementHandler(int tag, const char **attributes); + void endElementHandler(int tag); + void characterDataHandler(const char *text, std::size_t len); + +private: + bool myReadText; +}; + +inline DocBookBookReader::~DocBookBookReader() {} + +#endif /* __DOCBOOKBOOKREADER_H__ */ diff --git a/reader/src/formats/docbook/DocBookDescriptionReader.cpp b/reader/src/formats/docbook/DocBookDescriptionReader.cpp new file mode 100644 index 0000000..bcd4ae4 --- /dev/null +++ b/reader/src/formats/docbook/DocBookDescriptionReader.cpp @@ -0,0 +1,137 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#include <ZLInputStream.h> +#include <ZLUnicodeUtil.h> + +#include "DocBookDescriptionReader.h" + +#include "../../library/Book.h" +#include "../../library/Author.h" + +DocBookDescriptionReader::DocBookDescriptionReader(Book &book) : myBook(book) { + myReadTitle = false; + myReadAuthor = false; + for (int i = 0; i < 3; ++i) { + myReadAuthorName[i] = false; + } + myBook.setLanguage("en"); + myDepth = 0; +} + +void DocBookDescriptionReader::characterDataHandler(const char *text, std::size_t len) { + if (myReadTitle) { + myBook.setTitle(myBook.title() + std::string(text, len)); + } else { + for (int i = 0; i < 3; ++i) { + if (myReadAuthorName[i]) { + myAuthorNames[i].append(text, len); + break; + } + } + } +} + +void DocBookDescriptionReader::startElementHandler(int tag, const char **) { + ++myDepth; + switch (tag) { + case _SECT1: + myReturnCode = true; + myDoBreak = true; + break; + case _TITLE: + if (myDepth == 2) { + myReadTitle = true; + } + break; + case _AUTHOR: + if (myDepth == 3) { + myReadAuthor = true; + } + break; + case _FIRSTNAME: + if (myReadAuthor) { + myReadAuthorName[0] = true; + } + break; + case _OTHERNAME: + if (myReadAuthor) { + myReadAuthorName[1] = true; + } + break; + case _SURNAME: + if (myReadAuthor) { + myReadAuthorName[2] = true; + } + break; + default: + break; + } +} + +void DocBookDescriptionReader::endElementHandler(int tag) { + --myDepth; + switch (tag) { + case _TITLE: + myReadTitle = false; + break; + case _AUTHOR: { + 
ZLUnicodeUtil::utf8Trim(myAuthorNames[0]); + ZLUnicodeUtil::utf8Trim(myAuthorNames[1]); + ZLUnicodeUtil::utf8Trim(myAuthorNames[2]); + std::string fullName = myAuthorNames[0]; + if (!fullName.empty() && !myAuthorNames[1].empty()) { + fullName += ' '; + } + fullName += myAuthorNames[1]; + if (!fullName.empty() && !myAuthorNames[2].empty()) { + fullName += ' '; + } + fullName += myAuthorNames[2]; + shared_ptr<Author> author = Author::create(fullName, myAuthorNames[2]); + if (!author.isNull()) { + myBook.authors().add( author ); + } + } + myAuthorNames[0].erase(); + myAuthorNames[1].erase(); + myAuthorNames[2].erase(); + myReadAuthor = false; + break; + case _FIRSTNAME: + myReadAuthorName[0] = false; + break; + case _OTHERNAME: + myReadAuthorName[1] = false; + break; + case _SURNAME: + myReadAuthorName[2] = false; + break; + default: + break; + } +} + +bool DocBookDescriptionReader::readMetaInfo(shared_ptr<ZLInputStream> stream) { + bool code = readDocument(stream); + if (myBook.authors().empty()) { + myBook.authors().push_back( new Author() ); + } + return code; +} diff --git a/reader/src/formats/docbook/DocBookDescriptionReader.h b/reader/src/formats/docbook/DocBookDescriptionReader.h new file mode 100644 index 0000000..d9f4aa3 --- /dev/null +++ b/reader/src/formats/docbook/DocBookDescriptionReader.h @@ -0,0 +1,56 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#ifndef __DOCBOOKDESCRIPTIONREADER_H__ +#define __DOCBOOKDESCRIPTIONREADER_H__ + +#include <string> + +#include "DocBookReader.h" + +class Book; + +class DocBookDescriptionReader : public DocBookReader { + +public: + DocBookDescriptionReader(Book &book); + ~DocBookDescriptionReader(); + bool readMetaInfo(shared_ptr<ZLInputStream> stream); + + void startElementHandler(int tag, const char **attributes); + void endElementHandler(int tag); + void characterDataHandler(const char *text, std::size_t len); + +private: + Book &myBook; + + bool myReturnCode; + + bool myReadTitle; + bool myReadAuthor; + bool myReadAuthorName[3]; + + std::string myAuthorNames[3]; + + int myDepth; +}; + +inline DocBookDescriptionReader::~DocBookDescriptionReader() {} + +#endif /* __DOCBOOKDESCRIPTIONREADER_H__ */ diff --git a/reader/src/formats/docbook/DocBookPlugin.cpp b/reader/src/formats/docbook/DocBookPlugin.cpp new file mode 100644 index 0000000..1b890a6 --- /dev/null +++ b/reader/src/formats/docbook/DocBookPlugin.cpp @@ -0,0 +1,43 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#include <ZLFile.h> +#include <ZLInputStream.h> + +#include "DocBookPlugin.h" +#include "DocBookDescriptionReader.h" +#include "DocBookBookReader.h" +#include "../../library/Book.h" + +bool DocBookPlugin::acceptsFile(const std::string &extension) const { + return extension == "xml"; +} + +bool DocBookPlugin::readMetaInfo(Book &book) const { + return DocBookDescriptionReader(book).readMetaInfo(ZLFile(path).inputStream()); +} + +bool DocBookPlugin::readLanguageAndEncoding(Book &book) const { + (void)book; + return true; +} + +bool DocBookPlugin::readModel(BookModel &model) const { + return DocBookBookReader(model).readDocument(ZLFile(book.fileName()).inputStream()); +} diff --git a/reader/src/formats/docbook/DocBookPlugin.h b/reader/src/formats/docbook/DocBookPlugin.h new file mode 100644 index 0000000..324b2be --- /dev/null +++ b/reader/src/formats/docbook/DocBookPlugin.h @@ -0,0 +1,41 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ */ + +#ifndef __DOCBOOKPLUGIN_H__ +#define __DOCBOOKPLUGIN_H__ + +#include "../FormatPlugin.h" + +class DocBookPlugin : public FormatPlugin { + +public: + DocBookPlugin(); + ~DocBookPlugin(); + bool providesMetaInfo() const; + bool acceptsFile(const std::string &extension) const; + bool readMetaInfo(Book &book) const; + bool readLanguageAndEncoding(Book &book) const; + bool readModel(BookModel &model) const; +}; + +inline DocBookPlugin::DocBookPlugin() {} +inline DocBookPlugin::~DocBookPlugin() {} +inline bool DocBookPlugin::providesMetaInfo() const { return true; } + +#endif /* __DOCBOOKPLUGIN_H__ */ diff --git a/reader/src/formats/docbook/DocBookReader.cpp b/reader/src/formats/docbook/DocBookReader.cpp new file mode 100644 index 0000000..73c17d1 --- /dev/null +++ b/reader/src/formats/docbook/DocBookReader.cpp @@ -0,0 +1,71 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ */
+
+#include <ZLStringUtil.h>
+#include <ZLFile.h>
+#include <ZLDir.h>
+
+#include "DocBookReader.h"
+
+// NOTE(review): externalDTDs() below uses EntityFilesCollector, but none of the
+// headers included above is obviously the one that declares it -- confirm that
+// this translation unit still builds, or add the declaring header.
+
+/*
+ * Element-name -> tag-code table handed out by DocBookReader::tags().
+ * The last entry has a null name and the _UNKNOWN code (presumably the
+ * end-of-table sentinel expected by the XML reader base class).
+ *
+ * NOTE(review): TagCode also declares _FIRSTTERM, _FILENAME, _ITEMIZEDLIST and
+ * _LISTITEM, none of which has an entry here -- confirm that is intended.
+ */
+static const DocBookReader::Tag TAGS[] = {
+	{"article", DocBookReader::_ARTICLE},
+	{"title", DocBookReader::_TITLE},
+	{"articleinfo", DocBookReader::_ARTICLEINFO},
+	{"author", DocBookReader::_AUTHOR},
+	{"firstname", DocBookReader::_FIRSTNAME},
+	{"othername", DocBookReader::_OTHERNAME},
+	{"surname", DocBookReader::_SURNAME},
+	{"affiliation", DocBookReader::_AFFILIATION},
+	{"orgname", DocBookReader::_ORGNAME},
+	{"ulink", DocBookReader::_ULINK},
+	{"address", DocBookReader::_ADDRESS},
+	{"email", DocBookReader::_EMAIL},
+	{"pubdate", DocBookReader::_PUBDATE},
+	{"releaseinfo", DocBookReader::_RELEASEINFO},
+	{"copyright", DocBookReader::_COPYRIGHT},
+	{"year", DocBookReader::_YEAR},
+	{"holder", DocBookReader::_HOLDER},
+	{"legalnotice", DocBookReader::_LEGALNOTICE},
+	{"para", DocBookReader::_PARA},
+	{"revhistory", DocBookReader::_REVHISTORY},
+	{"revision", DocBookReader::_REVISION},
+	{"revnumber", DocBookReader::_REVNUMBER},
+	{"date", DocBookReader::_DATE},
+	{"authorinitials", DocBookReader::_AUTHORINITIALS},
+	{"revremark", DocBookReader::_REVREMARK},
+	{"abstract", DocBookReader::_ABSTRACT},
+	{"sect1", DocBookReader::_SECT1},
+	{"emphasis", DocBookReader::_EMPHASIS},
+	{"blockquote", DocBookReader::_BLOCKQUOTE},
+	{"citetitle", DocBookReader::_CITETITLE},
+	{"link", DocBookReader::_LINK},
+	{"foreignphrase", DocBookReader::_FOREIGNPHRASE},
+	{"part", DocBookReader::_PART},
+	{"preface", DocBookReader::_PREFACE},
+	{"chapter", DocBookReader::_CHAPTER},
+	{0, DocBookReader::_UNKNOWN}
+};
+
+/* Returns the static tag table defined above. */
+const DocBookReader::Tag *DocBookReader::tags() const {
+	return TAGS;
+}
+
+/* Returns the external DTD list registered under the "docbook" key. */
+const std::vector<std::string> &DocBookReader::externalDTDs() const {
+	return EntityFilesCollector::Instance().externalDTDs("docbook");
+}
diff --git a/reader/src/formats/docbook/DocBookReader.h b/reader/src/formats/docbook/DocBookReader.h
new file mode 100644
index 0000000..a18f358
--- /dev/null
+++ b/reader/src/formats/docbook/DocBookReader.h
@@ -0,0 +1,95 @@
+/*
+ * Copyright (C) 2004-2012 Geometer Plus <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+
+#ifndef __DOCBOOKREADER_H__
+#define __DOCBOOKREADER_H__
+
+// std::string and std::vector appear in this header's own declarations, so it
+// includes them itself instead of relying on ZLXMLReader.h to pull them in.
+#include <string>
+#include <vector>
+
+#include <ZLXMLReader.h>
+
+/*
+ * Common base for DocBook XML readers: supplies the DocBook tag table and the
+ * list of external DTDs. The constructor is protected, so the class is only
+ * usable through a derived reader.
+ */
+class DocBookReader : public ZLXMLReader {
+
+public:
+	// NOTE(review): declared here, but no definition is visible in
+	// DocBookReader.cpp -- confirm it is defined (or still needed) elsewhere.
+	static std::string DTDDirectory;
+
+public:
+	// One row of the tag table: element name and its numeric code.
+	struct Tag {
+		const char *tagName;
+		int tagCode;
+	};
+
+public:
+//protected:
+	// Numeric codes for DocBook elements. Values are implicit (0, 1, 2, ...)
+	// in declaration order, so entries must not be reordered casually.
+	enum TagCode {
+		_ARTICLE,
+		_TITLE,
+		_ARTICLEINFO,
+		_AUTHOR,
+		_FIRSTNAME,
+		_OTHERNAME,
+		_SURNAME,
+		_AFFILIATION,
+		_ORGNAME,
+		_ULINK,
+		_ADDRESS,
+		_EMAIL,
+		_PUBDATE,
+		_RELEASEINFO,
+		_COPYRIGHT,
+		_YEAR,
+		_HOLDER,
+		_LEGALNOTICE,
+		_PARA,
+		_REVHISTORY,
+		_REVISION,
+		_REVNUMBER,
+		_DATE,
+		_AUTHORINITIALS,
+		_REVREMARK,
+		_ABSTRACT,
+		_SECT1,
+		_EMPHASIS,
+		_BLOCKQUOTE,
+		_CITETITLE,
+		_LINK,
+		_FOREIGNPHRASE,
+		_FIRSTTERM,
+		_FILENAME,
+		_ITEMIZEDLIST,
+		_LISTITEM,
+		_PART,
+		_PREFACE,
+		_CHAPTER,
+		_UNKNOWN
+	};
+
+protected:
+	DocBookReader();
+
+public:
+	~DocBookReader();
+	// Returns the element-name -> TagCode table.
+	const Tag *tags() const;
+
+protected:
+	// Returns the external DTD files registered for DocBook.
+	const std::vector<std::string> &externalDTDs() const;
+};
+
+inline DocBookReader::DocBookReader() {}
+inline DocBookReader::~DocBookReader() {}
+
+#endif /* __DOCBOOKREADER_H__ */
diff --git a/reader/src/formats/dummy/DummyBookReader.cpp b/reader/src/formats/dummy/DummyBookReader.cpp
new file mode 100644
index 0000000..2684ebf
--- /dev/null
+++ b/reader/src/formats/dummy/DummyBookReader.cpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright (C) 2004-2012 Geometer Plus <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+
+#include <ZLInputStream.h>
+
+#include "DummyBookReader.h"
+#include "../../bookmodel/BookModel.h"
+
+/*
+ * Skeleton book reader. createPlugin.sh copies this file for a new format,
+ * replacing "Dummy"/"DUMMY"; the handlers are left commented out on purpose.
+ */
+DummyBookReader::DummyBookReader(BookModel &model)
+	: myModelReader(model)
+{
+}
+
+/*
+void DummyBookReader::characterDataHandler(const char *text, std::size_t len) {
+}
+
+void DummyBookReader::startElementHandler(int tag, const char **xmlattributes) {
+}
+
+void DummyBookReader::endElementHandler(int tag) {
+}
+*/
+
+/*
+ * Placeholder: the stream is not read at all and success is always reported.
+ */
+bool DummyBookReader::readBook(shared_ptr<ZLInputStream> stream) {
+	(void)stream; // intentionally unused in the skeleton
+	//return readDocument(stream);
+	return true;
+}
diff --git a/reader/src/formats/dummy/DummyBookReader.h b/reader/src/formats/dummy/DummyBookReader.h
new file mode 100644
index 0000000..ba6bcf8
--- /dev/null
+++ b/reader/src/formats/dummy/DummyBookReader.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (C) 2004-2012 Geometer Plus <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+
+#ifndef __DUMMYBOOKREADER_H__
+#define __DUMMYBOOKREADER_H__
+
+#include "../../bookmodel/BookReader.h"
+
+// Skeleton book-content reader: owns a BookReader bound to the target
+// BookModel and exposes a single readBook() entry point. The XML handler
+// declarations are left commented out in the skeleton.
+class DummyBookReader {
+
+public:
+	// Binds the internal BookReader to `model`.
+	DummyBookReader(BookModel &model);
+	~DummyBookReader();
+	// Reads the book from `stream` and returns whether that succeeded
+	// (the skeleton implementation always returns true).
+	bool readBook(shared_ptr<ZLInputStream> stream);
+
+	/*
+	void startElementHandler(int tag, const char **attributes);
+	void endElementHandler(int tag);
+	void characterDataHandler(const char *text, std::size_t len);
+	*/
+
+private:
+	// Model writer constructed from the BookModel passed to the constructor.
+	BookReader myModelReader;
+};
+
+inline DummyBookReader::~DummyBookReader() {}
+
+#endif /* __DUMMYBOOKREADER_H__ */
diff --git a/reader/src/formats/dummy/DummyMetaInfoReader.cpp b/reader/src/formats/dummy/DummyMetaInfoReader.cpp
new file mode 100644
index 0000000..5dd13c5
--- /dev/null
+++ b/reader/src/formats/dummy/DummyMetaInfoReader.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright (C) 2004-2012 Geometer Plus <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+
+#include <ZLInputStream.h>
+
+#include "DummyMetaInfoReader.h"
+
+/*
+ * Skeleton meta-info reader. createPlugin.sh copies this file for a new
+ * format; the handlers are left commented out on purpose.
+ */
+DummyMetaInfoReader::DummyMetaInfoReader(Book &book)
+	: myBook(book)
+{
+}
+
+/*
+void DummyMetaInfoReader::characterDataHandler(const char *text, std::size_t len) {
+}
+
+void DummyMetaInfoReader::startElementHandler(int tag, const char **) {
+}
+
+void DummyMetaInfoReader::endElementHandler(int tag) {
+}
+*/
+
+/*
+ * Placeholder: nothing is read from the stream and failure is always reported.
+ */
+bool DummyMetaInfoReader::readMetaInfo(shared_ptr<ZLInputStream> stream) {
+	(void)stream; // intentionally unused in the skeleton
+	return false;
+}
diff --git a/reader/src/formats/dummy/DummyMetaInfoReader.h b/reader/src/formats/dummy/DummyMetaInfoReader.h
new file mode 100644
index 0000000..818d996
--- /dev/null
+++ b/reader/src/formats/dummy/DummyMetaInfoReader.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (C) 2004-2012 Geometer Plus <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+
+#ifndef __DUMMYMETAINFOREADER_H__
+#define __DUMMYMETAINFOREADER_H__
+
+// NOTE(review): this header names shared_ptr and ZLInputStream but includes
+// only <string>; it appears to rely on the including .cpp pulling in
+// ZLInputStream.h first -- confirm, or make the header self-contained.
+#include <string>
+
+class Book;
+
+// Skeleton meta-information reader: keeps a reference to the Book being
+// described and exposes a single readMetaInfo() entry point. The XML handler
+// declarations are left commented out in the skeleton.
+class DummyMetaInfoReader {
+
+public:
+	// Stores a reference to `book`; the Book must outlive this reader.
+	DummyMetaInfoReader(Book &book);
+	~DummyMetaInfoReader();
+	// Reads meta information from `stream` and returns whether that succeeded
+	// (the skeleton implementation always returns false).
+	bool readMetaInfo(shared_ptr<ZLInputStream> stream);
+
+	/*
+	void startElementHandler(int tag, const char **attributes);
+	void endElementHandler(int tag);
+	void characterDataHandler(const char *text, std::size_t len);
+	*/
+
+private:
+	// Book passed to the constructor (held by reference, not owned).
+	Book &myBook;
+};
+
+inline DummyMetaInfoReader::~DummyMetaInfoReader() {}
+
+#endif /* __DUMMYMETAINFOREADER_H__ */
diff --git a/reader/src/formats/dummy/DummyPlugin.cpp b/reader/src/formats/dummy/DummyPlugin.cpp
new file mode 100644
index 0000000..bfe0662
--- /dev/null
+++ b/reader/src/formats/dummy/DummyPlugin.cpp
@@ -0,0 +1,57 @@
+/*
+ * Copyright (C) 2004-2012 Geometer Plus <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+
+#include <ZLFile.h>
+#include <ZLInputStream.h>
+
+#include "DummyPlugin.h"
+#include "DummyMetaInfoReader.h"
+#include "DummyBookReader.h"
+#include "../../library/Book.h"
+
+/*
+ * Skeleton format plugin. createPlugin.sh copies this file for a new format,
+ * replacing "Dummy"/"DUMMY", so everything here must compile as written.
+ */
+
+DummyPlugin::DummyPlugin() {
+}
+
+DummyPlugin::~DummyPlugin() {
+}
+
+/* The skeleton claims to provide meta information. */
+bool DummyPlugin::providesMetaInfo() const {
+	return true;
+}
+
+/* Accepts files whose extension is exactly "dummy". */
+bool DummyPlugin::acceptsFile(const ZLFile &file) const {
+	return file.extension() == "dummy";
+}
+
+/*
+ * Reads meta information for `book` from the book's own file.
+ * The stream comes from book.file(); the previous code referred to an
+ * undeclared `path` variable and could not compile.
+ */
+bool DummyPlugin::readMetaInfo(Book &book) const {
+	return DummyMetaInfoReader(book).readMetaInfo(book.file().inputStream());
+}
+
+/* Nothing to detect in the skeleton; always reports success. */
+bool DummyPlugin::readLanguageAndEncoding(Book &book) const {
+	(void)book;
+	return true;
+}
+
+/*
+ * Fills `model` from the file of the book attached to it.
+ * The book is reached through model.book(); the previous code referred to an
+ * undeclared `book` variable and could not compile.
+ */
+bool DummyPlugin::readModel(BookModel &model) const {
+	return DummyBookReader(model).readBook(model.book()->file().inputStream());
+}
+
+/*
+ * The skeleton ships no cover reader (there is no DummyCoverReader among the
+ * Dummy* sources), so no cover is reported: a null image pointer is returned.
+ * Replace this with a real reader when implementing a format.
+ */
+shared_ptr<const ZLImage> DummyPlugin::coverImage(const ZLFile &file) const {
+	(void)file;
+	return shared_ptr<const ZLImage>();
+}
diff --git a/reader/src/formats/dummy/DummyPlugin.h b/reader/src/formats/dummy/DummyPlugin.h
new file mode 100644
index 0000000..073449c
--- /dev/null
+++ b/reader/src/formats/dummy/DummyPlugin.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (C) 2004-2012 Geometer Plus <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+
+#ifndef __DUMMYPLUGIN_H__
+#define __DUMMYPLUGIN_H__
+
+#include "../FormatPlugin.h"
+
+// Skeleton FormatPlugin subclass; createPlugin.sh derives new plugins from it.
+class DummyPlugin : public FormatPlugin {
+
+public:
+	DummyPlugin();
+	~DummyPlugin();
+	bool providesMetaInfo() const;
+	bool acceptsFile(const ZLFile &file) const;
+	bool readMetaInfo(Book &book) const;
+	bool readLanguageAndEncoding(Book &book) const;
+	bool readModel(BookModel &model) const;
+	shared_ptr<const ZLImage> coverImage(const ZLFile &file) const;
+};
+
+#endif /* __DUMMYPLUGIN_H__ */
diff --git a/reader/src/formats/dummy/createPlugin.sh b/reader/src/formats/dummy/createPlugin.sh
new file mode 100755
index 0000000..aacc3d4
--- /dev/null
+++ b/reader/src/formats/dummy/createPlugin.sh
@@ -0,0 +1,12 @@
+#!/bin/sh
+
+# Creates a new format plugin skeleton in ../<short_format_name>.
+# Every Dummy*.h / Dummy*.cpp file in the current directory is copied there
+# with "Dummy" replaced by $2 and "DUMMY" replaced by $3, in both the file
+# contents and (for "Dummy") the file name.
+# The target directory must not exist yet: if mkdir fails, nothing is copied.
+
+if [ $# -ne 3 ]; then
+	# Wrong invocation is an error: report on stderr with a non-zero status.
+	echo "usage: $0 <short_format_name> <camel_cased_format_name> <upper_cased_format_name>" >&2
+	exit 1
+fi
+
+if mkdir "../$1"; then
+	for file in Dummy*.h Dummy*.cpp; do
+		# Expansions are quoted so names containing spaces survive.
+		target="../$1/$(echo "$file" | sed "s/Dummy/$2/")"
+		sed "s/Dummy/$2/g" "$file" | sed "s/DUMMY/$3/g" > "$target"
+	done
+fi
diff --git a/reader/src/formats/fb2/FB2BookReader.cpp b/reader/src/formats/fb2/FB2BookReader.cpp
new file mode 100644
index 0000000..f689343
--- /dev/null
+++ b/reader/src/formats/fb2/FB2BookReader.cpp
@@ -0,0 +1,336 @@
+/*
+ * Copyright (C) 2004-2012 Geometer Plus <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+
+#include <cstdlib>
+#include <cstring>
+
+#include <ZLInputStream.h>
+#include <ZLStringUtil.h>
+#include <ZLFileImage.h>
+
+#include <ZLTextParagraph.h>
+
+#include "FB2BookReader.h"
+#include "../../library/Book.h"
+#include "../../bookmodel/BookModel.h"
+
+/*
+ * Builds a reader that writes into `model`. All parser state starts in the
+ * "outside of everything" position: no body seen yet, no image being read.
+ */
+FB2BookReader::FB2BookReader(BookModel &model) : myModelReader(model) {
+	myInsideCoverpage = false;
+	// (size_t)-1 means "no <body> has been opened yet".
+	myParagraphsBeforeBodyNumber = (std::size_t)-1;
+	myInsidePoem = false;
+	mySectionDepth = 0;
+	myBodyCounter = 0;
+	myReadMainText = false;
+	// -1 means "no binary payload offset recorded".
+	myCurrentImageStart = -1;
+	mySectionStarted = false;
+	myInsideTitle = false;
+	myCurrentContentType = ZLMimeType::EMPTY;
+}
+
+/*
+ * Handles a chunk of character data.
+ * While a <binary> with an accepted content type is open, only the position of
+ * its first chunk is recorded; the data itself is not copied. Otherwise the
+ * text is appended to the open paragraph (and to the table-of-contents entry
+ * when inside a section title). Text outside any paragraph is dropped.
+ */
+void FB2BookReader::characterDataHandler(const char *text, std::size_t len) {
+	if (len == 0) {
+		return;
+	}
+	if (!myCurrentImageId.empty()) {
+		// Image payloads can be large: remember where the data starts and
+		// return before any std::string copy is made.
+		if (myCurrentImageStart == -1) {
+			myCurrentImageStart = getCurrentPosition();
+		}
+		return;
+	}
+	if (!myModelReader.paragraphIsOpen()) {
+		return;
+	}
+	const std::string str(text, len);
+	myModelReader.addData(str);
+	if (myInsideTitle) {
+		myModelReader.addContentsData(str);
+	}
+}
+
+/* Namespace processing is requested from the XML reader. */
+bool FB2BookReader::processNamespaces() const {
+	return true;
+}
+
+/*
+ * Handles an opening tag.
+ * Any element except <binary> that carries an "id" attribute becomes a
+ * hyperlink target; outside the main text it also selects the footnote model
+ * with that id. The switch then updates the model for the specific tag.
+ */
+void FB2BookReader::startElementHandler(int tag, const char **xmlattributes) {
+	const char *id = attributeValue(xmlattributes, "id");
+	if (id != 0 && tag != _BINARY) {
+		if (!myReadMainText) {
+			myModelReader.setFootnoteTextModel(id);
+		}
+		myModelReader.addHyperlinkLabel(id);
+	}
+	switch (tag) {
+		case _P:
+			if (mySectionStarted) {
+				mySectionStarted = false;
+			} else if (myInsideTitle) {
+				// Separate consecutive title paragraphs in the contents entry.
+				static const std::string SPACE = " ";
+				myModelReader.addContentsData(SPACE);
+			}
+			myModelReader.beginParagraph();
+			break;
+		case _V:
+			myModelReader.pushKind(VERSE);
+			myModelReader.beginParagraph();
+			break;
+		case _SUBTITLE:
+			myModelReader.pushKind(SUBTITLE);
+			myModelReader.beginParagraph();
+			break;
+		case _TEXT_AUTHOR:
+			myModelReader.pushKind(AUTHOR);
+			myModelReader.beginParagraph();
+			break;
+		case _DATE:
+			myModelReader.pushKind(DATEKIND);
+			myModelReader.beginParagraph();
+			break;
+		case _CITE:
+			myModelReader.pushKind(CITE);
+			break;
+		case _SECTION:
+			if (myReadMainText) {
+				myModelReader.insertEndOfSectionParagraph();
+				++mySectionDepth;
+				myModelReader.beginContentsParagraph();
+				mySectionStarted = true;
+			}
+			break;
+		case _TITLE:
+			if (myInsidePoem) {
+				myModelReader.pushKind(POEM_TITLE);
+			} else if (mySectionDepth == 0) {
+				myModelReader.insertEndOfSectionParagraph();
+				myModelReader.pushKind(TITLE);
+			} else {
+				// Only section titles feed the table of contents.
+				myModelReader.pushKind(SECTION_TITLE);
+				myModelReader.enterTitle();
+				myInsideTitle = true;
+			}
+			break;
+		case _POEM:
+			myInsidePoem = true;
+			break;
+		case _STANZA:
+			myModelReader.pushKind(STANZA);
+			myModelReader.beginParagraph(ZLTextParagraph::BEFORE_SKIP_PARAGRAPH);
+			myModelReader.endParagraph();
+			break;
+		case _EPIGRAPH:
+			myModelReader.pushKind(EPIGRAPH);
+			break;
+		case _ANNOTATION:
+			if (myBodyCounter == 0) {
+				myModelReader.setMainTextModel();
+			}
+			myModelReader.pushKind(ANNOTATION);
+			break;
+		case _COVERPAGE:
+			if (myBodyCounter == 0) {
+				myInsideCoverpage = true;
+				myModelReader.setMainTextModel();
+			}
+			break;
+		case _SUB:
+			myModelReader.addControl(SUB, true);
+			break;
+		case _SUP:
+			myModelReader.addControl(SUP, true);
+			break;
+		case _CODE:
+			myModelReader.addControl(CODE, true);
+			break;
+		case _STRIKETHROUGH:
+			myModelReader.addControl(STRIKETHROUGH, true);
+			break;
+		case _STRONG:
+			myModelReader.addControl(STRONG, true);
+			break;
+		case _EMPHASIS:
+			myModelReader.addControl(EMPHASIS, true);
+			break;
+		case _A:
+		{
+			const char *ref = attributeValue(xmlattributes, myHrefPredicate);
+			if (ref != 0) {
+				if (ref[0] == '#') {
+					// In-document link: a footnote when type="note",
+					// a plain internal link otherwise.
+					const char *type = attributeValue(xmlattributes, "type");
+					static const std::string NOTE = "note";
+					if ((type != 0) && (NOTE == type)) {
+						myHyperlinkType = FOOTNOTE;
+					} else {
+						myHyperlinkType = INTERNAL_HYPERLINK;
+					}
+					++ref; // drop the leading '#'
+				} else {
+					myHyperlinkType = EXTERNAL_HYPERLINK;
+				}
+				myModelReader.addHyperlinkControl(myHyperlinkType, ref);
+			} else {
+				myHyperlinkType = FOOTNOTE;
+				myModelReader.addControl(myHyperlinkType, true);
+			}
+			break;
+		}
+		case _IMAGE:
+		{
+			const char *ref = attributeValue(xmlattributes, myHrefPredicate);
+			const char *vOffset = attributeValue(xmlattributes, "voffset");
+			// Kept in a short rather than a plain char: char has
+			// implementation-defined signedness and would mangle negative
+			// offsets on some platforms and any magnitude above 127.
+			const short offset = static_cast<short>(vOffset != 0 ? std::atoi(vOffset) : 0);
+			if (ref != 0 && *ref == '#') {
+				++ref; // drop the leading '#'
+				// True when no paragraph has been added since <body> opened.
+				const bool isCoverImage =
+					myParagraphsBeforeBodyNumber ==
+					myModelReader.model().bookTextModel()->paragraphsNumber();
+				// Skip a repeat of the cover image right at the start of the body.
+				if (myCoverImageReference != ref || !isCoverImage) {
+					myModelReader.addImageReference(ref, offset);
+				}
+				if (myInsideCoverpage) {
+					myCoverImageReference = ref;
+				}
+			}
+			break;
+		}
+		case _BINARY:
+		{
+			const char *contentType = attributeValue(xmlattributes, "content-type");
+			if (contentType != 0) {
+				shared_ptr<ZLMimeType> contentMimeType = ZLMimeType::get(contentType);
+				// Accept only binaries with an id and a known, non-XML type.
+				if ((!contentMimeType.isNull()) && (id != 0) && (ZLMimeType::TEXT_XML != contentMimeType)) {
+					myCurrentContentType = contentMimeType;
+					myCurrentImageId.assign(id);
+				}
+			}
+			break;
+		}
+		case _EMPTY_LINE:
+			myModelReader.beginParagraph(ZLTextParagraph::EMPTY_LINE_PARAGRAPH);
+			myModelReader.endParagraph();
+			break;
+		case _BODY:
+			++myBodyCounter;
+			myParagraphsBeforeBodyNumber = myModelReader.model().bookTextModel()->paragraphsNumber();
+			// The first body, and any later unnamed body, is main text.
+			if ((myBodyCounter == 1) || (attributeValue(xmlattributes, "name") == 0)) {
+				myModelReader.setMainTextModel();
+				myReadMainText = true;
+			}
+			myModelReader.pushKind(REGULAR);
+			break;
+		default:
+			break;
+	}
+}
+
+/*
+ * Handles a closing tag: undoes what the matching startElementHandler() case
+ * did, and for </binary> registers the image whose payload was located.
+ */
+void FB2BookReader::endElementHandler(int tag) {
+	switch (tag) {
+		case _P:
+			myModelReader.endParagraph();
+			break;
+		case _V:
+		case _SUBTITLE:
+		case _TEXT_AUTHOR:
+		case _DATE:
+			myModelReader.popKind();
+			myModelReader.endParagraph();
+			break;
+		case _CITE:
+			myModelReader.popKind();
+			break;
+		case _SECTION:
+			if (myReadMainText) {
+				myModelReader.endContentsParagraph();
+				--mySectionDepth;
+				mySectionStarted = false;
+			} else {
+				myModelReader.unsetTextModel();
+			}
+			break;
+		case _TITLE:
+			myModelReader.exitTitle();
+			myModelReader.popKind();
+			myInsideTitle = false;
+			break;
+		case _POEM:
+			myInsidePoem = false;
+			break;
+		case _STANZA:
+			myModelReader.beginParagraph(ZLTextParagraph::AFTER_SKIP_PARAGRAPH);
+			myModelReader.endParagraph();
+			myModelReader.popKind();
+			break;
+		case _EPIGRAPH:
+			myModelReader.popKind();
+			break;
+		case _ANNOTATION:
+			myModelReader.popKind();
+			if (myBodyCounter == 0) {
+				myModelReader.insertEndOfSectionParagraph();
+				myModelReader.unsetTextModel();
+			}
+			break;
+		case _COVERPAGE:
+			if (myBodyCounter == 0) {
+				myInsideCoverpage = false;
+				myModelReader.insertEndOfSectionParagraph();
+				myModelReader.unsetTextModel();
+			}
+			break;
+		case _SUB:
+			myModelReader.addControl(SUB, false);
+			break;
+		case _SUP:
+			myModelReader.addControl(SUP, false);
+			break;
+		case _CODE:
+			myModelReader.addControl(CODE, false);
+			break;
+		case _STRIKETHROUGH:
+			myModelReader.addControl(STRIKETHROUGH, false);
+			break;
+		case _STRONG:
+			myModelReader.addControl(STRONG, false);
+			break;
+		case _EMPHASIS:
+			myModelReader.addControl(EMPHASIS, false);
+			break;
+		case _A:
+			// Closes whichever hyperlink kind the opening <a> selected.
+			myModelReader.addControl(myHyperlinkType, false);
+			break;
+		case _BINARY:
+			if (!myCurrentImageId.empty() && myCurrentImageStart != -1) {
+				// The image is described by a byte range inside the book
+				// file, base64-encoded.
+				myModelReader.addImage(myCurrentImageId, new ZLFileImage(
+					ZLFile(myModelReader.model().book()->file().path(), myCurrentContentType),
+					myCurrentImageStart,
+					getCurrentPosition() - myCurrentImageStart,
+					ZLFileImage::ENCODING_BASE64
+				));
+			}
+			myCurrentImageId.clear();
+			myCurrentContentType = ZLMimeType::EMPTY;
+			myCurrentImageStart = -1;
+			break;
+		case _BODY:
+			myModelReader.popKind();
+			myModelReader.unsetTextModel();
+			myReadMainText = false;
+			break;
+		default:
+			break;
+	}
+}
+
+/* Parses the file of the book attached to the model; returns the parser's result. */
+bool FB2BookReader::readBook() {
+	return readDocument(myModelReader.model().book()->file());
+}
diff --git a/reader/src/formats/fb2/FB2BookReader.h b/reader/src/formats/fb2/FB2BookReader.h
new file mode 100644
index 0000000..b9d22d1
--- /dev/null
+++ b/reader/src/formats/fb2/FB2BookReader.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (C) 2004-2012 Geometer Plus <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+
+#ifndef __FB2BOOKREADER_H__
+#define __FB2BOOKREADER_H__
+
+#include <ZLMimeType.h>
+
+#include "FB2Reader.h"
+#include "../../bookmodel/BookReader.h"
+
+class BookModel;
+
+// FB2 reader that fills a BookModel: text, table of contents, hyperlinks,
+// footnotes and embedded images.
+class FB2BookReader : public FB2Reader {
+
+public:
+	// Creates a reader writing into `model`.
+	FB2BookReader(BookModel &model);
+	// Parses the file of the book attached to the model.
+	bool readBook();
+
+	bool processNamespaces() const;
+	void startElementHandler(int tag, const char **attributes);
+	void endElementHandler(int tag);
+	void characterDataHandler(const char *text, std::size_t len);
+
+private:
+	// Incremented/decremented as <section> elements open and close in the main text.
+	int mySectionDepth;
+	// Number of <body> elements opened so far.
+	int myBodyCounter;
+	// True while inside a body that is treated as main text.
+	bool myReadMainText;
+	// True while inside a <coverpage> met before the first body.
+	bool myInsideCoverpage;
+	// Paragraph count of the main text model when the latest <body> opened;
+	// (size_t)-1 until the first <body>.
+	std::size_t myParagraphsBeforeBodyNumber;
+	// Id of the image referenced from the cover page (without the '#').
+	std::string myCoverImageReference;
+	// True between <poem> and </poem>.
+	bool myInsidePoem;
+	// Writer for the BookModel passed to the constructor.
+	BookReader myModelReader;
+
+	// Position of the first data chunk of the current <binary>; -1 if none yet.
+	int myCurrentImageStart;
+	// Id of the <binary> being read; empty when no accepted binary is open.
+	std::string myCurrentImageId;
+	// Content type of the <binary> being read.
+	shared_ptr<ZLMimeType> myCurrentContentType;
+
+	// Set when a <section> opens; cleared by the first <p> that follows.
+	bool mySectionStarted;
+	// True while inside a section title that feeds the table of contents.
+	bool myInsideTitle;
+
+	// Kind chosen by the latest <a>; reused to close the control at </a>.
+	FBTextKind myHyperlinkType;
+};
+
+#endif /* __FB2BOOKREADER_H__ */
diff --git a/reader/src/formats/fb2/FB2CoverReader.cpp b/reader/src/formats/fb2/FB2CoverReader.cpp
new file mode 100644
index 0000000..cc84ac2
--- /dev/null
+++ b/reader/src/formats/fb2/FB2CoverReader.cpp
@@ -0,0 +1,92 @@
+/*
+ * Copyright (C) 2004-2012 Geometer Plus <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+
+#include <ZLFileImage.h>
+
+#include "FB2CoverReader.h"
+
+#include "../../library/Book.h"
+
+/* Remembers the file to scan; nothing is read until readCover() is called. */
+FB2CoverReader::FB2CoverReader(const ZLFile &file) : myFile(file) {
+}
+
+/*
+ * Scans the file for the cover image and returns it, or a null pointer when
+ * no cover was found. All scan state, including the result of a previous
+ * call, is reset first so the method can be called more than once.
+ */
+shared_ptr<const ZLImage> FB2CoverReader::readCover() {
+	myReadCoverPage = false;
+	myLookForImage = false;
+	myImageId.erase();
+	myImageStart = -1;
+	// Do not hand back a stale image from an earlier call.
+	myImage = 0;
+
+	readDocument(myFile);
+
+	return myImage;
+}
+
+/* Namespace processing is requested from the XML reader. */
+bool FB2CoverReader::processNamespaces() const {
+	return true;
+}
+
+/*
+ * Opening tags: remembers the id referenced by an <image> inside <coverpage>,
+ * then arms the search when the <binary> with that id opens.
+ */
+void FB2CoverReader::startElementHandler(int tag, const char **attributes) {
+	switch (tag) {
+		case _COVERPAGE:
+			myReadCoverPage = true;
+			break;
+		case _IMAGE:
+			if (myReadCoverPage) {
+				const char *ref = attributeValue(attributes, myHrefPredicate);
+				// Only non-empty in-document references ("#id") are usable.
+				if (ref != 0 && *ref == '#' && *(ref + 1) != '\0') {
+					myImageId = ref + 1;
+				}
+			}
+			break;
+		case _BINARY:
+		{
+			const char *id = attributeValue(attributes, "id");
+			const char *contentType = attributeValue(attributes, "content-type");
+			// Without a known cover id there is nothing to look for; this also
+			// keeps an empty id="" from matching an empty myImageId.
+			if (!myImageId.empty() && id != 0 && contentType != 0 && myImageId == id) {
+				myLookForImage = true;
+			}
+			break;
+		}
+		default:
+			break;
+	}
+}
+
+/*
+ * Closing tags: stops early when the description held no cover reference, and
+ * builds the image once the matching <binary> has been read.
+ */
+void FB2CoverReader::endElementHandler(int tag) {
+	switch (tag) {
+		case _COVERPAGE:
+			myReadCoverPage = false;
+			break;
+		case _DESCRIPTION:
+			if (myImageId.empty()) {
+				// No cover referenced: no need to parse the rest of the file.
+				interrupt();
+			}
+			break;
+		case _BINARY:
+			if (!myImageId.empty() && myImageStart >= 0) {
+				myImage = new ZLFileImage(myFile, myImageStart, getCurrentPosition() - myImageStart, ZLFileImage::ENCODING_BASE64);
+				interrupt();
+			}
+			// If the matching <binary> was empty, the request must not carry
+			// over to the next <binary>, whose data would otherwise be taken
+			// for the cover.
+			myLookForImage = false;
+			break;
+		default:
+			break;
+	}
+}
+
+/*
+ * Character data: records the position of the first chunk inside the cover
+ * <binary>. The data itself is not inspected.
+ */
+void FB2CoverReader::characterDataHandler(const char *text, std::size_t len) {
+	if (len > 0 && myLookForImage) {
+		myImageStart = getCurrentPosition();
+		myLookForImage = false;
+	}
+}
diff --git a/reader/src/formats/fb2/FB2CoverReader.h b/reader/src/formats/fb2/FB2CoverReader.h
new file mode 100644
index 0000000..6807aa9
--- /dev/null
+++ b/reader/src/formats/fb2/FB2CoverReader.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (C) 2004-2012 Geometer Plus <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+
+#ifndef __FB2COVERREADER_H__
+#define __FB2COVERREADER_H__
+
+#include <ZLFile.h>
+#include <ZLImage.h>
+
+#include "FB2Reader.h"
+
+// FB2 reader that only locates the cover image of a file.
+class FB2CoverReader : public FB2Reader {
+
+public:
+	FB2CoverReader(const ZLFile &file);
+	// Returns the cover image, or a null pointer when none was found.
+	shared_ptr<const ZLImage> readCover();
+
+private:
+	bool processNamespaces() const;
+	void startElementHandler(int tag, const char **attributes);
+	void endElementHandler(int tag);
+	void characterDataHandler(const char *text, std::size_t len);
+
+private:
+	// Copy of the file passed to the constructor.
+	const ZLFile myFile;
+	// True between <coverpage> and </coverpage>.
+	bool myReadCoverPage;
+	// True from the opening of the cover <binary> until its first data chunk.
+	bool myLookForImage;
+	// Id referenced by the cover <image> (without the '#'); empty if none.
+	std::string myImageId;
+	// Position of the first data chunk of the cover <binary>; -1 if not seen.
+	int myImageStart;
+	// Result of the latest readCover() scan.
+	shared_ptr<const ZLImage> myImage;
+};
+
+#endif /* __FB2COVERREADER_H__ */
diff --git a/reader/src/formats/fb2/FB2MetaInfoReader.cpp b/reader/src/formats/fb2/FB2MetaInfoReader.cpp
new file mode 100644
index 0000000..3d596ac
--- /dev/null
+++ b/reader/src/formats/fb2/FB2MetaInfoReader.cpp
@@ -0,0 +1,206 @@
+/*
+ * Copyright (C) 2004-2012 Geometer Plus <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+
+#include <cstdlib>
+
+#include <ZLInputStream.h>
+#include <ZLUnicodeUtil.h>
+
+#include "FB2MetaInfoReader.h"
+#include "FB2TagManager.h"
+
+#include "../../library/Book.h"
+
+/*
+ * Binds the reader to `book` and wipes the fields this reader is going to
+ * fill: authors, title, language and tags.
+ */
+FB2MetaInfoReader::FB2MetaInfoReader(Book &book) : myBook(book) {
+	const std::string empty;
+	myBook.removeAllAuthors();
+	myBook.setTitle(empty);
+	myBook.setLanguage(empty);
+	myBook.removeAllTags();
+}
+
+/*
+ * Collects character data into the buffer that belongs to the current state:
+ * the shared buffer for title/language/genre, or one of the three author
+ * name parts. Text met in any other state is ignored.
+ */
+void FB2MetaInfoReader::characterDataHandler(const char *text, std::size_t len) {
+	switch (myReadState) {
+		case READ_TITLE:
+		case READ_LANGUAGE:
+		case READ_GENRE:
+			myBuffer.append(text, len);
+			break;
+		case READ_AUTHOR_NAME_0:
+			myAuthorNames[0].append(text, len);
+			break;
+		case READ_AUTHOR_NAME_1:
+			myAuthorNames[1].append(text, len);
+			break;
+		case READ_AUTHOR_NAME_2:
+			myAuthorNames[2].append(text, len);
+			break;
+		default:
+			break;
+	}
+}
+
+/*
+ * Opening tags drive the state machine:
+ *   <body>        - stop parsing, the meta information is over;
+ *   <title-info>  - enter READ_SOMETHING;
+ *   children of <title-info> are honoured only in READ_SOMETHING,
+ *   name parts only in READ_AUTHOR.
+ */
+void FB2MetaInfoReader::startElementHandler(int tag, const char **attributes) {
+	if (tag == _BODY) {
+		myReturnCode = true;
+		interrupt();
+		return;
+	}
+	if (tag == _TITLE_INFO) {
+		myReadState = READ_SOMETHING;
+		return;
+	}
+
+	if (myReadState == READ_SOMETHING) {
+		switch (tag) {
+			case _BOOK_TITLE:
+				myReadState = READ_TITLE;
+				break;
+			case _GENRE:
+				myReadState = READ_GENRE;
+				break;
+			case _AUTHOR:
+				myReadState = READ_AUTHOR;
+				break;
+			case _LANG:
+				myReadState = READ_LANGUAGE;
+				break;
+			case _SEQUENCE:
+			{
+				// A series is recorded only when it has a name; the number
+				// is optional and passed as an empty string when absent.
+				const char *name = attributeValue(attributes, "name");
+				if (name != 0) {
+					std::string seriesTitle = name;
+					ZLUnicodeUtil::utf8Trim(seriesTitle);
+					const char *number = attributeValue(attributes, "number");
+					std::string seriesIndex;
+					if (number != 0) {
+						seriesIndex = number;
+					}
+					myBook.setSeries(seriesTitle, seriesIndex);
+				}
+				break;
+			}
+			default:
+				break;
+		}
+	} else if (myReadState == READ_AUTHOR) {
+		switch (tag) {
+			case _FIRST_NAME:
+				myReadState = READ_AUTHOR_NAME_0;
+				break;
+			case _MIDDLE_NAME:
+				myReadState = READ_AUTHOR_NAME_1;
+				break;
+			case _LAST_NAME:
+				myReadState = READ_AUTHOR_NAME_2;
+				break;
+			default:
+				break;
+		}
+	}
+}
+
+/*
+ * Closing tags flush what was collected into the Book and step the state
+ * machine back one level. A closing tag that does not match the current
+ * state is ignored.
+ */
+void FB2MetaInfoReader::endElementHandler(int tag) {
+	switch (tag) {
+		case _TITLE_INFO:
+			myReadState = READ_NOTHING;
+			break;
+		case _BOOK_TITLE:
+			if (myReadState != READ_TITLE) {
+				break;
+			}
+			myBook.setTitle(myBuffer);
+			myBuffer.erase();
+			myReadState = READ_SOMETHING;
+			break;
+		case _GENRE:
+			if (myReadState != READ_GENRE) {
+				break;
+			}
+			ZLUnicodeUtil::utf8Trim(myBuffer);
+			if (!myBuffer.empty()) {
+				// Known genre codes expand to readable tags; unknown ones
+				// are stored verbatim.
+				const std::vector<std::string> &readable =
+					FB2TagManager::Instance().humanReadableTags(myBuffer);
+				if (readable.empty()) {
+					myBook.addTag(myBuffer);
+				} else {
+					std::vector<std::string>::const_iterator tagIt = readable.begin();
+					for (; tagIt != readable.end(); ++tagIt) {
+						myBook.addTag(*tagIt);
+					}
+				}
+				myBuffer.erase();
+			}
+			myReadState = READ_SOMETHING;
+			break;
+		case _AUTHOR:
+		{
+			if (myReadState != READ_AUTHOR) {
+				break;
+			}
+			// Join the non-empty trimmed parts with single spaces, in the
+			// order first / middle / last.
+			std::string fullName;
+			for (int i = 0; i < 3; ++i) {
+				std::string &part = myAuthorNames[i];
+				ZLUnicodeUtil::utf8Trim(part);
+				if (!fullName.empty() && !part.empty()) {
+					fullName += ' ';
+				}
+				fullName += part;
+			}
+			// The last name is passed along as the second argument.
+			myBook.addAuthor(fullName, myAuthorNames[2]);
+			for (int i = 0; i < 3; ++i) {
+				myAuthorNames[i].erase();
+			}
+			myReadState = READ_SOMETHING;
+			break;
+		}
+		case _LANG:
+			if (myReadState != READ_LANGUAGE) {
+				break;
+			}
+			myBook.setLanguage(myBuffer);
+			myBuffer.erase();
+			myReadState = READ_SOMETHING;
+			break;
+		case _FIRST_NAME:
+			if (myReadState == READ_AUTHOR_NAME_0) {
+				myReadState = READ_AUTHOR;
+			}
+			break;
+		case _MIDDLE_NAME:
+			if (myReadState == READ_AUTHOR_NAME_1) {
+				myReadState = READ_AUTHOR;
+			}
+			break;
+		case _LAST_NAME:
+			if (myReadState == READ_AUTHOR_NAME_2) {
+				myReadState = READ_AUTHOR;
+			}
+			break;
+		default:
+			break;
+	}
+}
+
+/*
+ * Resets the state machine and the author name parts, then parses the book's
+ * file. The result of readDocument() is returned as is.
+ */
+bool FB2MetaInfoReader::readMetaInfo() {
+	myReadState = READ_NOTHING;
+	myAuthorNames[0].erase();
+	myAuthorNames[1].erase();
+	myAuthorNames[2].erase();
+	return readDocument(myBook.file());
+}
diff --git a/reader/src/formats/fb2/FB2MetaInfoReader.h b/reader/src/formats/fb2/FB2MetaInfoReader.h
new file mode 100644
index 0000000..cc09909
--- /dev/null
+++ b/reader/src/formats/fb2/FB2MetaInfoReader.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (C) 2004-2012 Geometer Plus <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */ + +#ifndef __FB2METAINFOREADER_H__ +#define __FB2METAINFOREADER_H__ + +#include <string> + +#include "FB2Reader.h" + +class Book; + +class FB2MetaInfoReader : public FB2Reader { + +public: + FB2MetaInfoReader(Book &book); + bool readMetaInfo(); + + void startElementHandler(int tag, const char **attributes); + void endElementHandler(int tag); + void characterDataHandler(const char *text, std::size_t len); + +private: + Book &myBook; + + bool myReturnCode; + + enum { + READ_NOTHING, + READ_SOMETHING, + READ_TITLE, + READ_AUTHOR, + READ_AUTHOR_NAME_0, + READ_AUTHOR_NAME_1, + READ_AUTHOR_NAME_2, + READ_LANGUAGE, + READ_GENRE + } myReadState; + + std::string myAuthorNames[3]; + std::string myBuffer; +}; + +#endif /* __FB2METAINFOREADER_H__ */ diff --git a/reader/src/formats/fb2/FB2Plugin.cpp b/reader/src/formats/fb2/FB2Plugin.cpp new file mode 100644 index 0000000..f65ddcb --- /dev/null +++ b/reader/src/formats/fb2/FB2Plugin.cpp @@ -0,0 +1,48 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ */ + +#include <ZLFile.h> +#include <ZLImage.h> + +#include "FB2Plugin.h" +#include "FB2MetaInfoReader.h" +#include "FB2BookReader.h" +#include "FB2CoverReader.h" + +#include "../../database/booksdb/BooksDBUtil.h" + +bool FB2Plugin::acceptsFile(const ZLFile &file) const { + return file.extension() == "fb2"; +} + +bool FB2Plugin::readMetaInfo(Book &book) const { + return FB2MetaInfoReader(book).readMetaInfo(); +} + +bool FB2Plugin::readModel(BookModel &model) const { + return FB2BookReader(model).readBook(); +} + +shared_ptr<const ZLImage> FB2Plugin::coverImage(const ZLFile &file) const { + return FB2CoverReader(file).readCover(); +} +bool FB2Plugin::readLanguageAndEncoding(Book &book) const { + (void)book; + return true; +} diff --git a/reader/src/formats/fb2/FB2Plugin.h b/reader/src/formats/fb2/FB2Plugin.h new file mode 100644 index 0000000..d96558d --- /dev/null +++ b/reader/src/formats/fb2/FB2Plugin.h @@ -0,0 +1,42 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ */ + +#ifndef __FB2PLUGIN_H__ +#define __FB2PLUGIN_H__ + +#include "../FormatPlugin.h" + +class FB2Plugin : public FormatPlugin { + +public: + FB2Plugin(); + ~FB2Plugin(); + bool providesMetaInfo() const; + bool acceptsFile(const ZLFile &file) const; + bool readMetaInfo(Book &book) const; + bool readLanguageAndEncoding(Book &book) const; + bool readModel(BookModel &model) const; + shared_ptr<const ZLImage> coverImage(const ZLFile &file) const; +}; + +inline FB2Plugin::FB2Plugin() {} +inline FB2Plugin::~FB2Plugin() {} +inline bool FB2Plugin::providesMetaInfo() const { return true; } + +#endif /* __FB2PLUGIN_H__ */ diff --git a/reader/src/formats/fb2/FB2Reader.cpp b/reader/src/formats/fb2/FB2Reader.cpp new file mode 100644 index 0000000..c8e279c --- /dev/null +++ b/reader/src/formats/fb2/FB2Reader.cpp @@ -0,0 +1,89 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ */ + +#include <cstring> + +#include <ZLibrary.h> +#include <ZLStringUtil.h> +#include <ZLXMLNamespace.h> + +#include "FB2Reader.h" + +#include "../util/EntityFilesCollector.h" + +FB2Reader::FB2Reader() : myHrefPredicate(ZLXMLNamespace::XLink, "href") { +} + +void FB2Reader::startElementHandler(const char *t, const char **attributes) { + startElementHandler(tag(t), attributes); +} + +void FB2Reader::endElementHandler(const char *t) { + endElementHandler(tag(t)); +} + +static const FB2Reader::Tag TAGS[] = { + {"p", FB2Reader::_P}, + {"subtitle", FB2Reader::_SUBTITLE}, + {"cite", FB2Reader::_CITE}, + {"text-author", FB2Reader::_TEXT_AUTHOR}, + {"date", FB2Reader::_DATE}, + {"section", FB2Reader::_SECTION}, + {"v", FB2Reader::_V}, + {"title", FB2Reader::_TITLE}, + {"poem", FB2Reader::_POEM}, + {"stanza", FB2Reader::_STANZA}, + {"epigraph", FB2Reader::_EPIGRAPH}, + {"annotation", FB2Reader::_ANNOTATION}, + {"sub", FB2Reader::_SUB}, + {"sup", FB2Reader::_SUP}, + {"code", FB2Reader::_CODE}, + {"strikethrough", FB2Reader::_STRIKETHROUGH}, + {"strong", FB2Reader::_STRONG}, + {"emphasis", FB2Reader::_EMPHASIS}, + {"a", FB2Reader::_A}, + {"image", FB2Reader::_IMAGE}, + {"binary", FB2Reader::_BINARY}, + {"description", FB2Reader::_DESCRIPTION}, + {"body", FB2Reader::_BODY}, + {"empty-line", FB2Reader::_EMPTY_LINE}, + {"title-info", FB2Reader::_TITLE_INFO}, + {"book-title", FB2Reader::_BOOK_TITLE}, + {"author", FB2Reader::_AUTHOR}, + {"lang", FB2Reader::_LANG}, + {"first-name", FB2Reader::_FIRST_NAME}, + {"middle-name", FB2Reader::_MIDDLE_NAME}, + {"last-name", FB2Reader::_LAST_NAME}, + {"coverpage", FB2Reader::_COVERPAGE}, + {"sequence", FB2Reader::_SEQUENCE}, + {"genre", FB2Reader::_GENRE}, + {0, FB2Reader::_UNKNOWN} +}; + +int FB2Reader::tag(const char *name) { + for (int i = 0; ; ++i) { + if (TAGS[i].tagName == 0 || std::strcmp(name, TAGS[i].tagName) == 0) { + return TAGS[i].tagCode; + } + } +} + +const std::vector<std::string> &FB2Reader::externalDTDs() const { + return 
EntityFilesCollector::Instance().externalDTDs("fb2"); +} diff --git a/reader/src/formats/fb2/FB2Reader.h b/reader/src/formats/fb2/FB2Reader.h new file mode 100644 index 0000000..8fa8654 --- /dev/null +++ b/reader/src/formats/fb2/FB2Reader.h @@ -0,0 +1,94 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ */ + +#ifndef __FB2READER_H__ +#define __FB2READER_H__ + +#include <ZLXMLReader.h> + +class FB2Reader : public ZLXMLReader { + +public: + struct Tag { + const char *tagName; + int tagCode; + }; + +protected: + virtual int tag(const char *name); + + virtual void startElementHandler(int tag, const char **attributes) = 0; + virtual void endElementHandler(int tag) = 0; + +private: + void startElementHandler(const char *tag, const char **attributes); + void endElementHandler(const char *tag); + + const std::vector<std::string> &externalDTDs() const; + +public: + enum TagCode { + _P, + _SUBTITLE, + _CITE, + _TEXT_AUTHOR, + _DATE, + _SECTION, + _V, + _TITLE, + _POEM, + _STANZA, + _EPIGRAPH, + _ANNOTATION, + _SUB, + _SUP, + _CODE, + _STRIKETHROUGH, + _STRONG, + _EMPHASIS, + _A, + _IMAGE, + _BINARY, + _DESCRIPTION, + _BODY, + _EMPTY_LINE, + _TITLE_INFO, + _BOOK_TITLE, + _AUTHOR, + _LANG, + _FIRST_NAME, + _MIDDLE_NAME, + _LAST_NAME, + _COVERPAGE, + _SEQUENCE, + _GENRE, + _UNKNOWN + }; + +protected: + FB2Reader(); + ~FB2Reader(); + +protected: + const NamespaceAttributeNamePredicate myHrefPredicate; +}; + +inline FB2Reader::~FB2Reader() {} + +#endif /* __FB2READER_H__ */ diff --git a/reader/src/formats/fb2/FB2TagManager.cpp b/reader/src/formats/fb2/FB2TagManager.cpp new file mode 100644 index 0000000..f698ace --- /dev/null +++ b/reader/src/formats/fb2/FB2TagManager.cpp @@ -0,0 +1,124 @@ +/* + * Copyright (C) 2008-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#include <vector> + +#include <ZLFile.h> +#include <ZLXMLReader.h> +#include <ZLibrary.h> +#include <ZLUnicodeUtil.h> + +#include "FB2TagManager.h" + +class FB2TagInfoReader : public ZLXMLReader { + +public: + FB2TagInfoReader(std::map<std::string,std::vector<std::string> > &tagMap); + + void startElementHandler(const char *tag, const char **attributes); + void endElementHandler(const char *tag); + +private: + std::map<std::string,std::vector<std::string> > &myTagMap; + + std::string myCategoryName; + std::string mySubCategoryName; + std::vector<std::string> myGenreIds; + std::string myLanguage; +}; + +FB2TagInfoReader::FB2TagInfoReader(std::map<std::string,std::vector<std::string> > &tagMap) : myTagMap(tagMap) { + myLanguage = ZLibrary::Language(); + if (myLanguage != "ru") { + myLanguage = "en"; + } +} + +static const std::string CATEGORY_NAME_TAG = "root-descr"; +static const std::string SUBCATEGORY_NAME_TAG = "genre-descr"; +static const std::string GENRE_TAG = "genre"; +static const std::string SUBGENRE_TAG = "subgenre"; +static const std::string SUBGENRE_ALT_TAG = "genre-alt"; + +void FB2TagInfoReader::startElementHandler(const char *tag, const char **attributes) { + if ((SUBGENRE_TAG == tag) || (SUBGENRE_ALT_TAG == tag)) { + const char *id = attributeValue(attributes, "value"); + if (id != 0) { + myGenreIds.push_back(id); + } + } else if (CATEGORY_NAME_TAG == tag) { + const char *lang = attributeValue(attributes, "lang"); + if ((lang != 0) && (myLanguage == lang)) { + const char *name = attributeValue(attributes, "genre-title"); + if (name != 0) { + myCategoryName = name; + ZLUnicodeUtil::utf8Trim(myCategoryName); + } + } + } else if (SUBCATEGORY_NAME_TAG == tag) { + const char *lang = attributeValue(attributes, "lang"); + if 
((lang != 0) && (myLanguage == lang)) { + const char *name = attributeValue(attributes, "title"); + if (name != 0) { + mySubCategoryName = name; + ZLUnicodeUtil::utf8Trim(mySubCategoryName); + } + } + } +} + +void FB2TagInfoReader::endElementHandler(const char *tag) { + if (GENRE_TAG == tag) { + myCategoryName.erase(); + mySubCategoryName.erase(); + myGenreIds.clear(); + } else if (SUBGENRE_TAG == tag) { + if (!myCategoryName.empty() && !mySubCategoryName.empty()) { + const std::string fullTagName = myCategoryName + '/' + mySubCategoryName; + for (std::vector<std::string>::const_iterator it = myGenreIds.begin(); it != myGenreIds.end(); ++it) { + myTagMap[*it].push_back(fullTagName); + } + } + mySubCategoryName.erase(); + myGenreIds.clear(); + } +} + +FB2TagManager *FB2TagManager::ourInstance = 0; + +const FB2TagManager &FB2TagManager::Instance() { + if (ourInstance == 0) { + ourInstance = new FB2TagManager(); + } + return *ourInstance; +} + +FB2TagManager::FB2TagManager() { + FB2TagInfoReader(myTagMap).readDocument(ZLFile( + ZLibrary::ApplicationDirectory() + ZLibrary::FileNameDelimiter + + "formats" + ZLibrary::FileNameDelimiter + "fb2" + + ZLibrary::FileNameDelimiter + "fb2genres.xml" + )); +} + +const std::vector<std::string> &FB2TagManager::humanReadableTags(const std::string &id) const { + static const std::vector<std::string> EMPTY; + std::map<std::string,std::vector<std::string> >::const_iterator it = myTagMap.find(id); + return (it != myTagMap.end()) ? 
it->second : EMPTY; +} diff --git a/reader/src/formats/fb2/FB2TagManager.h b/reader/src/formats/fb2/FB2TagManager.h new file mode 100644 index 0000000..cfbf076 --- /dev/null +++ b/reader/src/formats/fb2/FB2TagManager.h @@ -0,0 +1,45 @@ +/* + * Copyright (C) 2008-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ */ + +#ifndef __FB2TAGMANAGER_H__ +#define __FB2TAGMANAGER_H__ + +#include <string> +#include <map> +#include <vector> + +class FB2TagManager { + +private: + static FB2TagManager *ourInstance; + +public: + static const FB2TagManager &Instance(); + +private: + FB2TagManager(); + +public: + const std::vector<std::string> &humanReadableTags(const std::string &id) const; + +private: + std::map<std::string,std::vector<std::string> > myTagMap; +}; + +#endif /* __FB2TAGMANAGER_H__ */ diff --git a/reader/src/formats/html/HtmlBookReader.cpp b/reader/src/formats/html/HtmlBookReader.cpp new file mode 100644 index 0000000..321913d --- /dev/null +++ b/reader/src/formats/html/HtmlBookReader.cpp @@ -0,0 +1,583 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ */ + +#include <cctype> + +#include <ZLFile.h> +#include <ZLFileImage.h> +#include <ZLStringUtil.h> + +#include "HtmlBookReader.h" +#include "HtmlTagActions.h" +#include "../txt/PlainTextFormat.h" +#include "../util/MiscUtil.h" +#include "../../bookmodel/BookModel.h" +#include "../css/StyleSheetParser.h" + +HtmlTagAction::HtmlTagAction(HtmlBookReader &reader) : myReader(reader) { +} + +HtmlTagAction::~HtmlTagAction() { +} + +void HtmlTagAction::reset() { +} + +DummyHtmlTagAction::DummyHtmlTagAction(HtmlBookReader &reader) : HtmlTagAction(reader) { +} + +void DummyHtmlTagAction::run(const HtmlReader::HtmlTag&) { +} + +HtmlControlTagAction::HtmlControlTagAction(HtmlBookReader &reader, FBTextKind kind) : HtmlTagAction(reader), myKind(kind) { +} + +void HtmlControlTagAction::run(const HtmlReader::HtmlTag &tag) { + std::vector<FBTextKind> &list = myReader.myKindList; + int index; + for (index = list.size() - 1; index >= 0; --index) { + if (list[index] == myKind) { + break; + } + } + if (tag.Start) { + if (index == -1) { + bookReader().pushKind(myKind); + myReader.myKindList.push_back(myKind); + bookReader().addControl(myKind, true); + } + } else { + if (index >= 0) { + for (int i = list.size() - 1; i >= index; --i) { + bookReader().addControl(list[i], false); + bookReader().popKind(); + } + for (unsigned int j = index + 1; j < list.size(); ++j) { + bookReader().addControl(list[j], true); + bookReader().pushKind(list[j]); + } + list.erase(list.begin() + index); + } + } +} + +HtmlHeaderTagAction::HtmlHeaderTagAction(HtmlBookReader &reader, FBTextKind kind) : HtmlTagAction(reader), myKind(kind) { +} + +void HtmlHeaderTagAction::run(const HtmlReader::HtmlTag &tag) { + myReader.myIsStarted = false; + if (tag.Start) { + if (myReader.myBuildTableOfContent && !myReader.myIgnoreTitles) { + if (!bookReader().contentsParagraphIsOpen()) { + bookReader().insertEndOfSectionParagraph(); + bookReader().enterTitle(); + bookReader().beginContentsParagraph(); + } + } + 
bookReader().pushKind(myKind); + } else { + bookReader().popKind(); + if (myReader.myBuildTableOfContent && !myReader.myIgnoreTitles) { + bookReader().endContentsParagraph(); + bookReader().exitTitle(); + } + } + bookReader().beginParagraph(); +} + +HtmlIgnoreTagAction::HtmlIgnoreTagAction(HtmlBookReader &reader) : HtmlTagAction(reader) { +} + +void HtmlIgnoreTagAction::run(const HtmlReader::HtmlTag &tag) { + if (tag.Start) { + if (myTagNames.find(tag.Name) == myTagNames.end()) { + ++myReader.myIgnoreDataCounter; + myTagNames.insert(tag.Name); + } + } else { + if (myTagNames.find(tag.Name) != myTagNames.end()) { + --myReader.myIgnoreDataCounter; + myTagNames.erase(tag.Name); + } + } +} + +HtmlHrefTagAction::HtmlHrefTagAction(HtmlBookReader &reader) : HtmlTagAction(reader) { +} + +void HtmlHrefTagAction::run(const HtmlReader::HtmlTag &tag) { + if (tag.Start) { + for (unsigned int i = 0; i < tag.Attributes.size(); ++i) { + if (tag.Attributes[i].Name == "NAME") { + bookReader().addHyperlinkLabel(tag.Attributes[i].Value); + } else if ((hyperlinkType() == REGULAR) && (tag.Attributes[i].Name == "HREF")) { + std::string value = tag.Attributes[i].Value; + if (!myReader.myFileName.empty() && + (value.length() > myReader.myFileName.length()) && + (value.substr(0, myReader.myFileName.length()) == myReader.myFileName)) { + value = value.substr(myReader.myFileName.length()); + } + if (!value.empty()) { + if (value[0] == '#') { + setHyperlinkType(INTERNAL_HYPERLINK); + bookReader().addHyperlinkControl(INTERNAL_HYPERLINK, value.substr(1)); + } else { + FBTextKind hyperlinkType = MiscUtil::referenceType(value); + if (hyperlinkType != INTERNAL_HYPERLINK) { + setHyperlinkType(hyperlinkType); + bookReader().addHyperlinkControl(hyperlinkType, value); + } + } + } + } + } + } else if (hyperlinkType() != REGULAR) { + bookReader().addControl(hyperlinkType(), false); + setHyperlinkType(REGULAR); + } +} + +void HtmlHrefTagAction::reset() { + setHyperlinkType(REGULAR); +} + +FBTextKind 
HtmlHrefTagAction::hyperlinkType() const { + return myHyperlinkType; +} + +void HtmlHrefTagAction::setHyperlinkType(FBTextKind hyperlinkType) { + myHyperlinkType = hyperlinkType; +} + +HtmlImageTagAction::HtmlImageTagAction(HtmlBookReader &reader) : HtmlTagAction(reader) { +} + +void HtmlImageTagAction::run(const HtmlReader::HtmlTag &tag) { + if (tag.Start) { + bookReader().endParagraph(); + for (unsigned int i = 0; i < tag.Attributes.size(); ++i) { + if (tag.Attributes[i].Name == "SRC") { + const std::string fileName = MiscUtil::decodeHtmlURL(tag.Attributes[i].Value); + const ZLFile file(myReader.myBaseDirPath + fileName); + if (file.exists()) { + bookReader().addImageReference(fileName); + bookReader().addImage(fileName, new ZLFileImage(file, 0)); + } + break; + } + } + bookReader().beginParagraph(); + } +} + +HtmlBreakTagAction::HtmlBreakTagAction(HtmlBookReader &reader, BreakType breakType) : HtmlTagAction(reader), myBreakType(breakType) { +} + +void HtmlBreakTagAction::run(const HtmlReader::HtmlTag &tag) { + if (myReader.myDontBreakParagraph) { + myReader.myDontBreakParagraph = false; + return; + } + + if ((tag.Start && (myBreakType & BREAK_AT_START)) || + (!tag.Start && (myBreakType & BREAK_AT_END))) { + bookReader().endParagraph(); + if (bookReader().isKindStackEmpty()) { + bookReader().pushKind(REGULAR); + } + bookReader().beginParagraph(); + } +} + +HtmlPreTagAction::HtmlPreTagAction(HtmlBookReader &reader) : HtmlTagAction(reader) { +} + +void HtmlPreTagAction::run(const HtmlReader::HtmlTag &tag) { + bookReader().endParagraph(); + myReader.myIsPreformatted = tag.Start; + myReader.mySpaceCounter = -1; + myReader.myBreakCounter = 0; + if (myReader.myFormat.breakType() == PlainTextFormat::BREAK_PARAGRAPH_AT_NEW_LINE) { + if (tag.Start) { + bookReader().pushKind(PREFORMATTED); + } else { + bookReader().popKind(); + } + } + bookReader().beginParagraph(); +} + +HtmlListTagAction::HtmlListTagAction(HtmlBookReader &reader, int startIndex) : HtmlTagAction(reader), 
myStartIndex(startIndex) { +} + +void HtmlListTagAction::run(const HtmlReader::HtmlTag &tag) { + if (tag.Start) { + myReader.myListNumStack.push(myStartIndex); + } else if (!myReader.myListNumStack.empty()) { + myReader.myListNumStack.pop(); + } +} + +HtmlListItemTagAction::HtmlListItemTagAction(HtmlBookReader &reader) : HtmlTagAction(reader) { +} + +void HtmlListItemTagAction::run(const HtmlReader::HtmlTag &tag) { + if (tag.Start) { + bookReader().endParagraph(); + bookReader().beginParagraph(); + if (!myReader.myListNumStack.empty()) { + bookReader().addFixedHSpace(3 * myReader.myListNumStack.size()); + int &index = myReader.myListNumStack.top(); + if (index == 0) { + myReader.addConvertedDataToBuffer("\342\200\242 ", 4, false); + } else { + std::string number; + ZLStringUtil::appendNumber(number, index++); + number += ". "; + myReader.addConvertedDataToBuffer(number.data(), number.length(), false); + } + myReader.myDontBreakParagraph = true; + } + } else { + myReader.myDontBreakParagraph = false; + } +} + +HtmlTableTagAction::HtmlTableTagAction(HtmlBookReader &reader) : HtmlTagAction(reader) { +} + +void HtmlTableTagAction::run(const HtmlReader::HtmlTag &tag) { + if (tag.Start) { + myReader.myIgnoreTitles = true; + } else { + myReader.myIgnoreTitles = false; + } +} + +HtmlStyleTagAction::HtmlStyleTagAction(HtmlBookReader &reader) : HtmlTagAction(reader) { +} + +void HtmlStyleTagAction::run(const HtmlReader::HtmlTag &tag) { + myReader.myStyleSheetParser = tag.Start ? 
new StyleSheetTableParser(myReader.myStyleSheetTable) : 0; + /* + if (!tag.Start) { + myReader.myStyleSheetTable.dump(); + } + */ +} + +shared_ptr<HtmlTagAction> HtmlBookReader::createAction(const std::string &tag) { + if (tag == "EM") { + return new HtmlControlTagAction(*this, EMPHASIS); + } else if (tag == "STRONG") { + return new HtmlControlTagAction(*this, STRONG); + } else if (tag == "B") { + return new HtmlControlTagAction(*this, BOLD); + } else if (tag == "I") { + return new HtmlControlTagAction(*this, ITALIC); + } else if (tag == "TT") { + return new HtmlControlTagAction(*this, CODE); + } else if (tag == "CODE") { + return new HtmlControlTagAction(*this, CODE); + } else if (tag == "CITE") { + return new HtmlControlTagAction(*this, CITE); + } else if (tag == "SUB") { + return new HtmlControlTagAction(*this, SUB); + } else if (tag == "SUP") { + return new HtmlControlTagAction(*this, SUP); + } else if (tag == "H1") { + return new HtmlHeaderTagAction(*this, H1); + } else if (tag == "H2") { + return new HtmlHeaderTagAction(*this, H2); + } else if (tag == "H3") { + return new HtmlHeaderTagAction(*this, H3); + } else if (tag == "H4") { + return new HtmlHeaderTagAction(*this, H4); + } else if (tag == "H5") { + return new HtmlHeaderTagAction(*this, H5); + } else if (tag == "H6") { + return new HtmlHeaderTagAction(*this, H6); + } else if (tag == "HEAD") { + return new HtmlIgnoreTagAction(*this); + } else if (tag == "TITLE") { + return new HtmlIgnoreTagAction(*this); + } else if (tag == "STYLE") { + return new HtmlStyleTagAction(*this); + } else if (tag == "SELECT") { + return new HtmlIgnoreTagAction(*this); + } else if (tag == "SCRIPT") { + return new HtmlIgnoreTagAction(*this); + } else if (tag == "A") { + return new HtmlHrefTagAction(*this); + } else if (tag == "TD") { + //return new HtmlBreakTagAction(*this, HtmlBreakTagAction::BREAK_AT_END); + } else if (tag == "TR") { + return new HtmlBreakTagAction(*this, HtmlBreakTagAction::BREAK_AT_END); + } else if (tag == 
"DIV") { + return new HtmlBreakTagAction(*this, HtmlBreakTagAction::BREAK_AT_END); + } else if (tag == "DT") { + return new HtmlBreakTagAction(*this, HtmlBreakTagAction::BREAK_AT_START); + } else if (tag == "P") { + return new HtmlBreakTagAction(*this, HtmlBreakTagAction::BREAK_AT_START_AND_AT_END); + } else if (tag == "BR") { + return new HtmlBreakTagAction(*this, HtmlBreakTagAction::BREAK_AT_START_AND_AT_END); + } else if (tag == "IMG") { + return new HtmlImageTagAction(*this); + } else if (tag == "UL") { + return new HtmlListTagAction(*this, 0); + } else if (tag == "MENU") { + return new HtmlListTagAction(*this, 0); + } else if (tag == "DIR") { + return new HtmlListTagAction(*this, 0); + } else if (tag == "OL") { + return new HtmlListTagAction(*this, 1); + } else if (tag == "LI") { + return new HtmlListItemTagAction(*this); + } else if (tag == "PRE") { + if (myProcessPreTag) { + return new HtmlPreTagAction(*this); + } + } else if (tag == "TABLE") { + return new HtmlTableTagAction(*this); + } + /* + } else if (tag == "DD") { + return 0; + } else if (tag == "DL") { + return 0; + } else if (tag == "DFN") { + return 0; + } else if (tag == "SAMP") { + return 0; + } else if (tag == "KBD") { + return 0; + } else if (tag == "VAR") { + return 0; + } else if (tag == "ABBR") { + return 0; + } else if (tag == "ACRONYM") { + return 0; + } else if (tag == "BLOCKQUOTE") { + return 0; + } else if (tag == "Q") { + return 0; + } else if (tag == "INS") { + return 0; + } else if (tag == "DEL") { + return 0; + } else if (tag == "BODY") { + return 0; + */ + return new DummyHtmlTagAction(*this); +} + +void HtmlBookReader::setBuildTableOfContent(bool build) { + myBuildTableOfContent = build; +} + +void HtmlBookReader::setProcessPreTag(bool process) { + myProcessPreTag = process; +} + +HtmlBookReader::HtmlBookReader(const std::string &baseDirectoryPath, BookModel &model, const PlainTextFormat &format, const std::string &encoding) : HtmlReader(encoding), myBookReader(model), 
myBaseDirPath(baseDirectoryPath), myFormat(format), myBuildTableOfContent(true), myProcessPreTag(true) { +} + +HtmlBookReader::~HtmlBookReader() { +} + +void HtmlBookReader::addConvertedDataToBuffer(const char *text, std::size_t len, bool convert) { + if (len > 0) { + if (myDontBreakParagraph) { + while (len > 0 && std::isspace(*text)) { + --len; + ++text; + } + if (len == 0) { + return; + } + } + if (convert) { + myConverter->convert(myConverterBuffer, text, text + len); + myBookReader.addData(myConverterBuffer); + myBookReader.addContentsData(myConverterBuffer); + myConverterBuffer.erase(); + } else { + std::string strText(text, len); + myBookReader.addData(strText); + myBookReader.addContentsData(strText); + } + myDontBreakParagraph = false; + } +} + +bool HtmlBookReader::tagHandler(const HtmlTag &tag) { + myConverter->reset(); + + for (unsigned int i = 0; i < tag.Attributes.size(); ++i) { + if (tag.Attributes[i].Name == "ID") { + myBookReader.addHyperlinkLabel(tag.Attributes[i].Value); + break; + } + } + shared_ptr<HtmlTagAction> action = myActionMap[tag.Name]; + if (action.isNull()) { + action = createAction(tag.Name); + myActionMap[tag.Name] = action; + } + action->run(tag); + + return true; +} + +void HtmlBookReader::preformattedCharacterDataHandler(const char *text, std::size_t len, bool convert) { + const char *start = text; + const char *end = text + len; + + int breakType = myFormat.breakType(); + if (breakType & PlainTextFormat::BREAK_PARAGRAPH_AT_NEW_LINE) { + for (const char *ptr = text; ptr != end; ++ptr) { + if (*ptr == '\n') { + mySpaceCounter = 0; + if (start < ptr) { + addConvertedDataToBuffer(start, ptr - start, convert); + } else { + static const std::string SPACE = " "; + myBookReader.addData(SPACE); + } + myBookReader.endParagraph(); + myBookReader.beginParagraph(); + start = ptr + 1; + } else if (mySpaceCounter >= 0) { + if (std::isspace((unsigned char)*ptr)) { + ++mySpaceCounter; + } else { + myBookReader.addFixedHSpace(mySpaceCounter); + 
mySpaceCounter = -1; + } + } + } + addConvertedDataToBuffer(start, end - start, convert); + } else if (breakType & PlainTextFormat::BREAK_PARAGRAPH_AT_LINE_WITH_INDENT) { + for (const char *ptr = text; ptr != end; ++ptr) { + if (std::isspace((unsigned char)*ptr)) { + if (*ptr == '\n') { + mySpaceCounter = 0; + } else if (mySpaceCounter >= 0) { + ++mySpaceCounter; + } + } else { + if (mySpaceCounter > myFormat.ignoredIndent()) { + if (ptr - start > mySpaceCounter) { + addConvertedDataToBuffer(start, ptr - start - mySpaceCounter, convert); + myBookReader.endParagraph(); + myBookReader.beginParagraph(); + } + start = ptr; + } + mySpaceCounter = -1; + } + } + mySpaceCounter = std::max(mySpaceCounter, 0); + if (end - start > mySpaceCounter) { + addConvertedDataToBuffer(start, end - start - mySpaceCounter, convert); + } + } else if (breakType & PlainTextFormat::BREAK_PARAGRAPH_AT_EMPTY_LINE) { + for (const char *ptr = start; ptr != end; ++ptr) { + if (std::isspace((unsigned char)*ptr)) { + if (*ptr == '\n') { + ++myBreakCounter; + } + } else { + if (myBreakCounter > 1) { + addConvertedDataToBuffer(start, ptr - start, convert); + myBookReader.endParagraph(); + myBookReader.beginParagraph(); + start = ptr; + } + myBreakCounter = 0; + } + } + addConvertedDataToBuffer(start, end - start, convert); + } +} + +bool HtmlBookReader::characterDataHandler(const char *text, std::size_t len, bool convert) { + if (!myStyleSheetParser.isNull()) { + myStyleSheetParser->parse(text, len); + return true; + } + + if (myIgnoreDataCounter != 0) { + return true; + } + + if (myIsPreformatted) { + preformattedCharacterDataHandler(text, len, convert); + return true; + } + + const char *ptr = text; + const char *end = text + len; + if (!myIsStarted) { + for (; ptr != end; ++ptr) { + if (!std::isspace((unsigned char)*ptr)) { + myIsStarted = true; + break; + } + } + } + if (myIsStarted) { + addConvertedDataToBuffer(ptr, end - ptr, convert); + } + return true; +} + +void 
HtmlBookReader::startDocumentHandler() { + while (!myListNumStack.empty()) { + myListNumStack.pop(); + } + myConverterBuffer.erase(); + myKindList.clear(); + + myBookReader.reset(); + myBookReader.setMainTextModel(); + myBookReader.pushKind(REGULAR); + myBookReader.beginParagraph(); + myIgnoreDataCounter = 0; + myIsPreformatted = false; + myDontBreakParagraph = false; + for (std::map<std::string,shared_ptr<HtmlTagAction> >::const_iterator it = myActionMap.begin(); it != myActionMap.end(); ++it) { + it->second->reset(); + } + myIsStarted = false; + myIgnoreTitles = false; + + myStyleSheetParser = 0; + + mySpaceCounter = -1; + myBreakCounter = 0; +} + +void HtmlBookReader::endDocumentHandler() { + myBookReader.endParagraph(); +} + +void HtmlBookReader::setFileName(const std::string fileName) { + myFileName = fileName; +} diff --git a/reader/src/formats/html/HtmlBookReader.h b/reader/src/formats/html/HtmlBookReader.h new file mode 100644 index 0000000..c8d4e32 --- /dev/null +++ b/reader/src/formats/html/HtmlBookReader.h @@ -0,0 +1,101 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ */ + +#ifndef __HTMLBOOKREADER_H__ +#define __HTMLBOOKREADER_H__ + +#include <stack> + +#include <shared_ptr.h> + +#include "HtmlReader.h" +#include "../../bookmodel/BookReader.h" +#include "../css/StyleSheetTable.h" + +class BookModel; +class PlainTextFormat; +class StyleSheetParser; + +class HtmlTagAction; + +class HtmlBookReader : public HtmlReader { + +public: + HtmlBookReader(const std::string &baseDirectoryPath, BookModel &model, const PlainTextFormat &format, const std::string &encoding); + ~HtmlBookReader(); + void setFileName(const std::string fileName); + +protected: + virtual shared_ptr<HtmlTagAction> createAction(const std::string &tag); + void setBuildTableOfContent(bool build); + void setProcessPreTag(bool process); + +protected: + void startDocumentHandler(); + void endDocumentHandler(); + bool tagHandler(const HtmlTag &tag); + bool characterDataHandler(const char *text, std::size_t len, bool convert); + +private: + void preformattedCharacterDataHandler(const char *text, std::size_t len, bool convert); + void addConvertedDataToBuffer(const char *text, std::size_t len, bool convert); + +protected: + BookReader myBookReader; + std::string myBaseDirPath; + +private: + const PlainTextFormat &myFormat; + int myIgnoreDataCounter; + bool myIsPreformatted; + bool myDontBreakParagraph; + + bool myIsStarted; + bool myBuildTableOfContent; + bool myProcessPreTag; + bool myIgnoreTitles; + std::stack<int> myListNumStack; + + StyleSheetTable myStyleSheetTable; + shared_ptr<StyleSheetParser> myStyleSheetParser; + + int mySpaceCounter; + int myBreakCounter; + std::string myConverterBuffer; + + std::map<std::string,shared_ptr<HtmlTagAction> > myActionMap; + std::vector<FBTextKind> myKindList; + + std::string myFileName; + + friend class HtmlTagAction; + friend class HtmlControlTagAction; + friend class HtmlHeaderTagAction; + friend class HtmlIgnoreTagAction; + friend class HtmlHrefTagAction; + friend class HtmlImageTagAction; + friend class HtmlBreakTagAction; + 
friend class HtmlPreTagAction; + friend class HtmlListTagAction; + friend class HtmlListItemTagAction; + friend class HtmlTableTagAction; + friend class HtmlStyleTagAction; +}; + +#endif /* __HTMLBOOKREADER_H__ */ diff --git a/reader/src/formats/html/HtmlDescriptionReader.cpp b/reader/src/formats/html/HtmlDescriptionReader.cpp new file mode 100644 index 0000000..6ebcb8b --- /dev/null +++ b/reader/src/formats/html/HtmlDescriptionReader.cpp @@ -0,0 +1,82 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ */ + +#include "HtmlDescriptionReader.h" + +#include "../../library/Book.h" + +HtmlDescriptionReader::HtmlDescriptionReader(Book &book) : HtmlReader(book.encoding()), myBook(book) { + myBook.setTitle(""); +} + +void HtmlDescriptionReader::startDocumentHandler() { + myReadTitle = false; +} + +void HtmlDescriptionReader::endDocumentHandler() { + if (!myBook.title().empty()) { + const char *titleStart = myBook.title().data(); + const char *titleEnd = titleStart + myBook.title().length(); + std::string newTitle; + myConverter->convert(newTitle, titleStart, titleEnd); + myBook.setTitle(newTitle); + } +} + +bool HtmlDescriptionReader::tagHandler(const HtmlTag &tag) { + if (tag.Name == "TITLE") { + if (myReadTitle && !tag.Start) { + myBook.setTitle(myBuffer); + myBuffer.erase(); + } + myReadTitle = tag.Start && myBook.title().empty(); + return true; + } else if (tag.Start && tag.Name == "META") { + std::vector<HtmlAttribute>::const_iterator it = tag.Attributes.begin(); + for (; it != tag.Attributes.end(); ++it) { + if (it->Name == "CONTENT") { + break; + } + } + if (it != tag.Attributes.end()) { + const std::string prefix = "charset="; + std::size_t index = it->Value.find(prefix); + if (index != std::string::npos) { + std::string charset = it->Value.substr(index + prefix.length()); + index = charset.find(';'); + if (index != std::string::npos) { + charset = charset.substr(0, index); + } + index = charset.find(' '); + if (index != std::string::npos) { + charset = charset.substr(0, index); + } + myBook.setEncoding(charset); + } + } + } + return tag.Name != "BODY"; +} + +bool HtmlDescriptionReader::characterDataHandler(const char *text, std::size_t len, bool) { + if (myReadTitle) { + myBuffer.append(text, len); + } + return true; +} diff --git a/reader/src/formats/html/HtmlDescriptionReader.h b/reader/src/formats/html/HtmlDescriptionReader.h new file mode 100644 index 0000000..159d4b0 --- /dev/null +++ b/reader/src/formats/html/HtmlDescriptionReader.h @@ -0,0 +1,48 @@ +/* 
+ * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#ifndef __HTMLDESCRIPTIONREADER_H__ +#define __HTMLDESCRIPTIONREADER_H__ + +#include "HtmlReader.h" + +class Book; + +class HtmlDescriptionReader : public HtmlReader { + +public: + HtmlDescriptionReader(Book &book); + ~HtmlDescriptionReader(); + +protected: + void startDocumentHandler(); + void endDocumentHandler(); + + bool tagHandler(const HtmlTag &tag); + bool characterDataHandler(const char *text, std::size_t len, bool convert); + +private: + bool myReadTitle; + std::string myBuffer; + Book &myBook; +}; + +inline HtmlDescriptionReader::~HtmlDescriptionReader() {} + +#endif /* __HTMLDESCRIPTIONREADER_H__ */ diff --git a/reader/src/formats/html/HtmlEntityCollection.cpp b/reader/src/formats/html/HtmlEntityCollection.cpp new file mode 100644 index 0000000..bd1bb4e --- /dev/null +++ b/reader/src/formats/html/HtmlEntityCollection.cpp @@ -0,0 +1,71 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#include <cstdlib> +#include <cctype> + +#include <ZLibrary.h> +#include <ZLFile.h> +#include <ZLXMLReader.h> + +#include "HtmlEntityCollection.h" + +class CollectionReader : public ZLXMLReader { + +public: + CollectionReader(std::map<std::string,int> &collection); + void startElementHandler(const char *tag, const char **attributes); + +private: + std::map<std::string,int> &myCollection; +}; + +std::map<std::string,int> HtmlEntityCollection::ourCollection; + +int HtmlEntityCollection::symbolNumber(const std::string &name) { + if (ourCollection.empty()) { + CollectionReader(ourCollection).readDocument(ZLFile( + ZLibrary::ApplicationDirectory() + ZLibrary::FileNameDelimiter + + "formats" + ZLibrary::FileNameDelimiter + + "html" + ZLibrary::FileNameDelimiter + "html.ent" + )); + } + std::map<std::string,int>::const_iterator it = ourCollection.find(name); + return it == ourCollection.end() ? 
0 : it->second; +} + +CollectionReader::CollectionReader(std::map<std::string,int> &collection) : myCollection(collection) { +} + +void CollectionReader::startElementHandler(const char *tag, const char **attributes) { + static const std::string ENTITY = "entity"; + + if (ENTITY == tag) { + for (int i = 0; i < 4; ++i) { + if (attributes[i] == 0) { + return; + } + } + static const std::string _name = "name"; + static const std::string _number = "number"; + if (_name == attributes[0] && _number == attributes[2]) { + myCollection[attributes[1]] = std::atoi(attributes[3]); + } + } +} diff --git a/reader/src/formats/html/HtmlEntityCollection.h b/reader/src/formats/html/HtmlEntityCollection.h new file mode 100644 index 0000000..6f70491 --- /dev/null +++ b/reader/src/formats/html/HtmlEntityCollection.h @@ -0,0 +1,38 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ */ + +#ifndef __HTMLENTITYCOLLECTION_H__ +#define __HTMLENTITYCOLLECTION_H__ + +#include <string> +#include <map> + +class HtmlEntityCollection { + +public: + static int symbolNumber(const std::string &name); + +private: + static std::map<std::string,int> ourCollection; + +private: + HtmlEntityCollection(); +}; + +#endif /* __HTMLENTITYCOLLECTION_H__ */ diff --git a/reader/src/formats/html/HtmlPlugin.cpp b/reader/src/formats/html/HtmlPlugin.cpp new file mode 100644 index 0000000..279e096 --- /dev/null +++ b/reader/src/formats/html/HtmlPlugin.cpp @@ -0,0 +1,83 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ */ + +#include <ZLStringUtil.h> +#include <ZLFile.h> +#include <ZLInputStream.h> + +#include "HtmlPlugin.h" +#include "HtmlDescriptionReader.h" +#include "HtmlBookReader.h" +#include "HtmlReaderStream.h" +#include "../txt/PlainTextFormat.h" +#include "../util/MiscUtil.h" +#include "../../library/Book.h" +#include "../../bookmodel/BookModel.h" + +bool HtmlPlugin::acceptsFile(const ZLFile &file) const { + const std::string &extension = file.extension(); + return ZLStringUtil::stringEndsWith(extension, "html") || (extension == "htm"); +} + +bool HtmlPlugin::readMetaInfo(Book &book) const { + shared_ptr<ZLInputStream> stream = book.file().inputStream(); + if (stream.isNull()) { + return false; + } + + shared_ptr<ZLInputStream> htmlStream = new HtmlReaderStream(stream, 50000); + detectEncodingAndLanguage(book, *htmlStream); + if (book.encoding().empty()) { + return false; + } + HtmlDescriptionReader(book).readDocument(*stream); + + return true; +} + +bool HtmlPlugin::readModel(BookModel &model) const { + const Book& book = *model.book(); + const ZLFile &file = book.file(); + shared_ptr<ZLInputStream> stream = file.inputStream(); + if (stream.isNull()) { + return false; + } + + PlainTextFormat format(file); + if (!format.initialized()) { + PlainTextFormatDetector detector; + detector.detect(*stream, format); + } + + std::string directoryPrefix = MiscUtil::htmlDirectoryPrefix(file.path()); + HtmlBookReader reader(directoryPrefix, model, format, book.encoding()); + reader.setFileName(MiscUtil::htmlFileName(file.path())); + reader.readDocument(*stream); + + return true; +} + +FormatInfoPage *HtmlPlugin::createInfoPage(ZLOptionsDialog &dialog, const ZLFile &file) { + return new PlainTextInfoPage(dialog, file, ZLResourceKey("<PRE>"), false); +} + +bool HtmlPlugin::readLanguageAndEncoding(Book &book) const { + (void)book; + return true; +} diff --git a/reader/src/formats/html/HtmlPlugin.h b/reader/src/formats/html/HtmlPlugin.h new file mode 100644 index 0000000..c66a108 --- 
/dev/null +++ b/reader/src/formats/html/HtmlPlugin.h @@ -0,0 +1,42 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#ifndef __HTMLPLUGIN_H__ +#define __HTMLPLUGIN_H__ + +#include "../FormatPlugin.h" + +class HtmlPlugin : public FormatPlugin { + +public: + HtmlPlugin(); + ~HtmlPlugin(); + bool providesMetaInfo() const; + bool acceptsFile(const ZLFile &file) const; + bool readMetaInfo(Book &book) const; + bool readLanguageAndEncoding(Book &book) const; + bool readModel(BookModel &model) const; + FormatInfoPage *createInfoPage(ZLOptionsDialog &dialog, const ZLFile &file); +}; + +inline HtmlPlugin::HtmlPlugin() {} +inline HtmlPlugin::~HtmlPlugin() {} +inline bool HtmlPlugin::providesMetaInfo() const { return false; } + +#endif /* __HTMLPLUGIN_H__ */ diff --git a/reader/src/formats/html/HtmlReader.cpp b/reader/src/formats/html/HtmlReader.cpp new file mode 100644 index 0000000..a5ce7fa --- /dev/null +++ b/reader/src/formats/html/HtmlReader.cpp @@ -0,0 +1,373 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either 
version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#include <algorithm> +#include <cctype> + +#include <ZLInputStream.h> +#include <ZLXMLReader.h> +#include <ZLFile.h> +#include <ZLStringUtil.h> +#include <ZLUnicodeUtil.h> + +#include "HtmlReader.h" +#include "HtmlEntityCollection.h" + +HtmlReader::HtmlReader(const std::string &encoding) : EncodedTextReader(encoding) { +} + +HtmlReader::~HtmlReader() { +} + +void HtmlReader::setTag(HtmlTag &tag, const std::string &name) { + tag.Attributes.clear(); + + if (name.length() == 0) { + tag.Name = name; + return; + } + + tag.Start = name[0] != '/'; + if (tag.Start) { + tag.Name = name; + } else { + tag.Name = name.substr(1); + } + + const std::size_t len = tag.Name.length(); + for (std::size_t i = 0; i < len; ++i) { + tag.Name[i] = std::toupper(tag.Name[i]); + } +} + +enum ParseState { + PS_TEXT, + PS_TAGSTART, + PS_TAGNAME, + PS_WAIT_END_OF_TAG, + PS_ATTRIBUTENAME, + PS_ATTRIBUTEVALUE, + PS_SKIPTAG, + PS_COMMENT, + PS_SPECIAL, + PS_SPECIAL_IN_ATTRIBUTEVALUE, +}; + +enum SpecialType { + ST_UNKNOWN, + ST_NUM, + ST_NAME, + ST_DEC, + ST_HEX +}; + +static bool allowSymbol(SpecialType type, char ch) { + return + (type == ST_NAME && std::isalpha(ch)) || + (type == ST_DEC && std::isdigit(ch)) || + (type == ST_HEX && std::isxdigit(ch)); +} + +static int specialSymbolNumber(SpecialType type, const std::string &txt) { + char *end = 0; + switch (type) { + case ST_NAME: + return HtmlEntityCollection::symbolNumber(txt); + case ST_DEC: + 
return std::strtol(txt.c_str() + 1, &end, 10); + case ST_HEX: + return std::strtol(txt.c_str() + 2, &end, 16); + default: + return 0; + } +} + +void HtmlReader::appendString(std::string &to, std::string &from) { + if (myConverter.isNull()) { + to += from; + } else { + myConverter->convert(to, from); + myConverter->reset(); + } + from.erase(); +} + +void HtmlReader::readDocument(ZLInputStream &stream) { + if (!stream.open()) { + return; + } + + startDocumentHandler(); + + ParseState state = PS_TEXT; + SpecialType state_special = ST_UNKNOWN; + std::string currentString; + std::string attributeValueString; + std::string specialString; + int quotationCounter = 0; + HtmlTag currentTag; + char endOfComment[2] = "\0"; + + const std::size_t BUFSIZE = 2048; + char *buffer = new char[BUFSIZE]; + std::size_t length; + std::size_t offset = 0; + do { + length = stream.read(buffer, BUFSIZE); + char *start = buffer; + char *endOfBuffer = buffer + length; + for (char *ptr = buffer; ptr < endOfBuffer; ++ptr) { + switch (state) { + case PS_TEXT: + if (*ptr == '<') { + if (!characterDataHandler(start, ptr - start, true)) { + goto endOfProcessing; + } + start = ptr + 1; + state = PS_TAGSTART; + currentTag.Offset = offset + (ptr - buffer); + } + if (*ptr == '&') { + if (!characterDataHandler(start, ptr - start, true)) { + goto endOfProcessing; + } + start = ptr + 1; + state = PS_SPECIAL; + state_special = ST_UNKNOWN; + } + break; + case PS_SPECIAL: + case PS_SPECIAL_IN_ATTRIBUTEVALUE: + if (state_special == ST_UNKNOWN) { + if (*ptr == '#') { + state_special = ST_NUM; + } else if (std::isalpha(*ptr)) { + state_special = ST_NAME; + } else { + start = ptr; + state = (state == PS_SPECIAL) ? PS_TEXT : PS_ATTRIBUTEVALUE; + } + } else if (state_special == ST_NUM) { + if (*ptr == 'x') { + state_special = ST_HEX; + } else if (std::isdigit(*ptr)) { + state_special = ST_DEC; + } else { + start = ptr; + state = (state == PS_SPECIAL) ? 
PS_TEXT : PS_ATTRIBUTEVALUE; + } + } else { + if (*ptr == ';') { + specialString.append(start, ptr - start); + int number = specialSymbolNumber(state_special, specialString); + if ((128 <= number) && (number <= 159)) { + char ch = number; + if (state == PS_SPECIAL) { + characterDataHandler(&ch, 1, true); + } else { + myConverter->convert(attributeValueString, &ch, &ch + 1); + } + } else if (number != 0) { + char buffer[4]; + int len = ZLUnicodeUtil::ucs4ToUtf8(buffer, number); + if (state == PS_SPECIAL) { + characterDataHandler(buffer, len, false); + } else { + attributeValueString.append(buffer, len); + } + } else { + specialString = "&" + specialString + ";"; + if (state == PS_SPECIAL) { + characterDataHandler(specialString.c_str(), specialString.length(), false); + } else { + attributeValueString += specialString; + } + } + specialString.erase(); + start = ptr + 1; + state = (state == PS_SPECIAL) ? PS_TEXT : PS_ATTRIBUTEVALUE; + } else if (!allowSymbol(state_special, *ptr)) { + start = ptr; + state = (state == PS_SPECIAL) ? PS_TEXT : PS_ATTRIBUTEVALUE; + } + } + break; + case PS_TAGSTART: + state = (*ptr == '!') ? PS_COMMENT : PS_TAGNAME; + break; + case PS_COMMENT: + if ((endOfComment[0] == '\0') && (*ptr != '-')) { + state = PS_TAGNAME; + } else if ((endOfComment[0] == '-') && (endOfComment[1] == '-') && (*ptr == '>')) { + start = ptr + 1; + state = PS_TEXT; + endOfComment[0] = '\0'; + endOfComment[1] = '\0'; + } else { + endOfComment[0] = endOfComment[1]; + endOfComment[1] = *ptr; + } + break; + case PS_WAIT_END_OF_TAG: + if (*ptr == '>') { + start = ptr + 1; + state = PS_TEXT; + } + break; + case PS_TAGNAME: + if (*ptr == '>' || *ptr == '/' || std::isspace((unsigned char)*ptr)) { + currentString.append(start, ptr - start); + start = ptr + 1; + setTag(currentTag, currentString); + currentString.erase(); + if (currentTag.Name == "") { + state = *ptr == '>' ? 
PS_TEXT : PS_SKIPTAG; + } else { + if (*ptr == '>') { + if (!tagHandler(currentTag)) { + goto endOfProcessing; + } + state = PS_TEXT; + } else if (*ptr == '/') { + if (!tagHandler(currentTag)) { + goto endOfProcessing; + } + currentTag.Start = false; + if (!tagHandler(currentTag)) { + goto endOfProcessing; + } + state = PS_WAIT_END_OF_TAG; + } else { + state = PS_ATTRIBUTENAME; + } + } + } + break; + case PS_ATTRIBUTENAME: + if (*ptr == '>' || *ptr == '/' || *ptr == '=' || std::isspace((unsigned char)*ptr)) { + if (ptr != start || !currentString.empty()) { + currentString.append(start, ptr - start); + for (unsigned int i = 0; i < currentString.length(); ++i) { + currentString[i] = std::toupper(currentString[i]); + } + currentTag.addAttribute(currentString); + currentString.erase(); + } + start = ptr + 1; + if (*ptr == '>') { + if (!tagHandler(currentTag)) { + goto endOfProcessing; + } + state = PS_TEXT; + } else if (*ptr == '/') { + if (!tagHandler(currentTag)) { + goto endOfProcessing; + } + currentTag.Start = false; + if (!tagHandler(currentTag)) { + goto endOfProcessing; + } + state = PS_WAIT_END_OF_TAG; + } else { + state = (*ptr == '=') ? 
PS_ATTRIBUTEVALUE : PS_ATTRIBUTENAME; + } + } + break; + case PS_ATTRIBUTEVALUE: + if (*ptr == '"') { + if (((ptr == start) && currentString.empty()) || (quotationCounter > 0)) { + ++quotationCounter; + } + } else if (*ptr == '&') { + currentString.append(start, ptr - start); + start = ptr + 1; + appendString(attributeValueString, currentString); + state = PS_SPECIAL_IN_ATTRIBUTEVALUE; + state_special = ST_UNKNOWN; + } else if (quotationCounter != 1 && (*ptr == '>' || *ptr == '/' || std::isspace((unsigned char)*ptr))) { + if (ptr != start || !currentString.empty()) { + currentString.append(start, ptr - start); + appendString(attributeValueString, currentString); + if (attributeValueString[0] == '"') { + attributeValueString = attributeValueString.substr(1, attributeValueString.length() - 2); + } + currentTag.setLastAttributeValue(attributeValueString); + attributeValueString.erase(); + quotationCounter = 0; + } + start = ptr + 1; + if (*ptr == '>') { + if (!tagHandler(currentTag)) { + goto endOfProcessing; + } + state = PS_TEXT; + } else if (*ptr == '/') { + if (!tagHandler(currentTag)) { + goto endOfProcessing; + } + currentTag.Start = false; + if (!tagHandler(currentTag)) { + goto endOfProcessing; + } + state = PS_WAIT_END_OF_TAG; + } else { + state = PS_ATTRIBUTENAME; + } + } + break; + case PS_SKIPTAG: + if (*ptr == '>') { + start = ptr + 1; + state = PS_TEXT; + } + break; + } + } + if (start != endOfBuffer) { + switch (state) { + case PS_TEXT: + if (!characterDataHandler(start, endOfBuffer - start, true)) { + goto endOfProcessing; + } + break; + case PS_TAGNAME: + case PS_ATTRIBUTENAME: + case PS_ATTRIBUTEVALUE: + currentString.append(start, endOfBuffer - start); + break; + case PS_SPECIAL: + case PS_SPECIAL_IN_ATTRIBUTEVALUE: + specialString.append(start, endOfBuffer - start); + break; + case PS_TAGSTART: + case PS_SKIPTAG: + case PS_COMMENT: + case PS_WAIT_END_OF_TAG: + break; + } + } + offset += length; + } while (length == BUFSIZE); +endOfProcessing: + 
delete[] buffer; + + endDocumentHandler(); + + stream.close(); +} diff --git a/reader/src/formats/html/HtmlReader.h b/reader/src/formats/html/HtmlReader.h new file mode 100644 index 0000000..876fad8 --- /dev/null +++ b/reader/src/formats/html/HtmlReader.h @@ -0,0 +1,92 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ */ + +#ifndef __HTMLREADER_H__ +#define __HTMLREADER_H__ + +#include <string> +#include <vector> + +#include <ZLEncodingConverter.h> +#include "../EncodedTextReader.h" + +class ZLInputStream; + +class HtmlReader : public EncodedTextReader { + +public: + struct HtmlAttribute { + std::string Name; + std::string Value; + bool HasValue; + + HtmlAttribute(const std::string &name); + ~HtmlAttribute(); + void setValue(const std::string &value); + }; + + struct HtmlTag { + std::string Name; + std::size_t Offset; + bool Start; + std::vector<HtmlAttribute> Attributes; + + HtmlTag(); + ~HtmlTag(); + void addAttribute(const std::string &name); + void setLastAttributeValue(const std::string &value); + + private: + HtmlTag(const HtmlTag&); + const HtmlTag &operator = (const HtmlTag&); + }; + +private: + static void setTag(HtmlTag &tag, const std::string &fullName); + +public: + virtual void readDocument(ZLInputStream &stream); + +protected: + HtmlReader(const std::string &encoding); + virtual ~HtmlReader(); + +protected: + virtual void startDocumentHandler() = 0; + virtual void endDocumentHandler() = 0; + + // returns false iff processing must be stopped + virtual bool tagHandler(const HtmlTag &tag) = 0; + // returns false iff processing must be stopped + virtual bool characterDataHandler(const char *text, std::size_t len, bool convert) = 0; + +private: + void appendString(std::string &to, std::string &from); +}; + +inline HtmlReader::HtmlAttribute::HtmlAttribute(const std::string &name) : Name(name), HasValue(false) {} +inline HtmlReader::HtmlAttribute::~HtmlAttribute() {} +inline void HtmlReader::HtmlAttribute::setValue(const std::string &value) { Value = value; HasValue = true; } + +inline HtmlReader::HtmlTag::HtmlTag() : Start(true) {} +inline HtmlReader::HtmlTag::~HtmlTag() {} +inline void HtmlReader::HtmlTag::addAttribute(const std::string &name) { Attributes.push_back(HtmlAttribute(name)); } +inline void HtmlReader::HtmlTag::setLastAttributeValue(const std::string 
&value) { if (!Attributes.empty()) Attributes.back().setValue(value); } + +#endif /* __HTMLREADER_H__ */ diff --git a/reader/src/formats/html/HtmlReaderStream.cpp b/reader/src/formats/html/HtmlReaderStream.cpp new file mode 100644 index 0000000..08c43ae --- /dev/null +++ b/reader/src/formats/html/HtmlReaderStream.cpp @@ -0,0 +1,128 @@ +/* + * Copyright (C) 2008-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ */ + +#include <cstdlib> +#include <cstring> +#include <algorithm> + +#include "HtmlReaderStream.h" +#include "HtmlReader.h" + +class HtmlTextOnlyReader : public HtmlReader { + +public: + HtmlTextOnlyReader(char *buffer, std::size_t maxSize); + std::size_t size() const; + +private: + void startDocumentHandler(); + void endDocumentHandler(); + + bool tagHandler(const HtmlTag &tag); + bool characterDataHandler(const char *text, std::size_t len, bool convert); + +private: + char *myBuffer; + std::size_t myMaxSize; + std::size_t myFilledSize; + bool myIgnoreText; +}; + +HtmlTextOnlyReader::HtmlTextOnlyReader(char *buffer, std::size_t maxSize) : HtmlReader(std::string()), myBuffer(buffer), myMaxSize(maxSize), myFilledSize(0), myIgnoreText(false) { +} + +std::size_t HtmlTextOnlyReader::size() const { + return myFilledSize; +} + +void HtmlTextOnlyReader::startDocumentHandler() { +} + +void HtmlTextOnlyReader::endDocumentHandler() { +} + +bool HtmlTextOnlyReader::tagHandler(const HtmlTag &tag) { + if (tag.Name == "SCRIPT") { + myIgnoreText = tag.Start; + } + if ((myFilledSize < myMaxSize) && (myFilledSize > 0) && (myBuffer[myFilledSize - 1] != '\n')) { + myBuffer[myFilledSize++] = '\n'; + } + return myFilledSize < myMaxSize; +} + +bool HtmlTextOnlyReader::characterDataHandler(const char *text, std::size_t len, bool) { + if (!myIgnoreText) { + len = std::min((std::size_t)len, myMaxSize - myFilledSize); + std::memcpy(myBuffer + myFilledSize, text, len); + myFilledSize += len; + } + return myFilledSize < myMaxSize; +} + +HtmlReaderStream::HtmlReaderStream(shared_ptr<ZLInputStream> base, std::size_t maxSize) : myBase(base), myBuffer(0), mySize(maxSize) { +} + +HtmlReaderStream::~HtmlReaderStream() { + close(); +} + +bool HtmlReaderStream::open() { + if (myBase.isNull() || !myBase->open()) { + return false; + } + myBuffer = new char[mySize]; + HtmlTextOnlyReader reader(myBuffer, mySize); + reader.readDocument(*myBase); + mySize = reader.size(); + myOffset = 0; + 
myBase->close(); + return true; +} + +std::size_t HtmlReaderStream::read(char *buffer, std::size_t maxSize) { + maxSize = std::min(maxSize, mySize - myOffset); + if (buffer != 0) { + std::memcpy(buffer, myBuffer, maxSize); + } + myOffset += maxSize; + return maxSize; +} + +void HtmlReaderStream::close() { + if (myBuffer != 0) { + delete[] myBuffer; + myBuffer = 0; + } +} + +void HtmlReaderStream::seek(int offset, bool absoluteOffset) { + if (!absoluteOffset) { + offset += myOffset; + } + myOffset = std::min(mySize, (std::size_t)std::max(0, offset)); +} + +std::size_t HtmlReaderStream::offset() const { + return myOffset; +} + +std::size_t HtmlReaderStream::sizeOfOpened() { + return mySize; +} diff --git a/reader/src/formats/html/HtmlReaderStream.h b/reader/src/formats/html/HtmlReaderStream.h new file mode 100644 index 0000000..c5c15b8 --- /dev/null +++ b/reader/src/formats/html/HtmlReaderStream.h @@ -0,0 +1,48 @@ +/* + * Copyright (C) 2008-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ */ + +#ifndef __HTMLREADERSTREAM_H__ +#define __HTMLREADERSTREAM_H__ + +#include <shared_ptr.h> +#include <ZLInputStream.h> + +class HtmlReaderStream : public ZLInputStream { + +public: + HtmlReaderStream(shared_ptr<ZLInputStream> base, std::size_t maxSize); + ~HtmlReaderStream(); + +private: + bool open(); + std::size_t read(char *buffer, std::size_t maxSize); + void close(); + + void seek(int offset, bool absoluteOffset); + std::size_t offset() const; + std::size_t sizeOfOpened(); + +private: + shared_ptr<ZLInputStream> myBase; + char *myBuffer; + std::size_t mySize; + std::size_t myOffset; +}; + +#endif /* __HTMLREADERSTREAM_H__ */ diff --git a/reader/src/formats/html/HtmlTagActions.h b/reader/src/formats/html/HtmlTagActions.h new file mode 100644 index 0000000..7da3f20 --- /dev/null +++ b/reader/src/formats/html/HtmlTagActions.h @@ -0,0 +1,158 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ */ + +#ifndef __HTMLTAGACTIONS_H__ +#define __HTMLTAGACTIONS_H__ + +#include <set> + +#include "HtmlBookReader.h" + +class HtmlTagAction { + +protected: + HtmlTagAction(HtmlBookReader &reader); + +public: + virtual ~HtmlTagAction(); + virtual void run(const HtmlReader::HtmlTag &tag) = 0; + virtual void reset(); + +protected: + BookReader &bookReader(); + +protected: + HtmlBookReader &myReader; +}; + +class DummyHtmlTagAction : public HtmlTagAction { + +public: + DummyHtmlTagAction(HtmlBookReader &reader); + void run(const HtmlReader::HtmlTag &tag); +}; + +class HtmlControlTagAction : public HtmlTagAction { + +public: + HtmlControlTagAction(HtmlBookReader &reader, FBTextKind kind); + void run(const HtmlReader::HtmlTag &tag); + +private: + FBTextKind myKind; +}; + +class HtmlHeaderTagAction : public HtmlTagAction { + +public: + HtmlHeaderTagAction(HtmlBookReader &reader, FBTextKind kind); + void run(const HtmlReader::HtmlTag &tag); + +private: + FBTextKind myKind; +}; + +class HtmlIgnoreTagAction : public HtmlTagAction { + +public: + HtmlIgnoreTagAction(HtmlBookReader &reader); + void run(const HtmlReader::HtmlTag &tag); + +private: + std::set<std::string> myTagNames; +}; + +class HtmlHrefTagAction : public HtmlTagAction { + +public: + HtmlHrefTagAction(HtmlBookReader &reader); + void run(const HtmlReader::HtmlTag &tag); + void reset(); + +protected: + FBTextKind hyperlinkType() const; + void setHyperlinkType(FBTextKind hyperlinkType); + +private: + FBTextKind myHyperlinkType; +}; + +class HtmlImageTagAction : public HtmlTagAction { + +public: + HtmlImageTagAction(HtmlBookReader &reader); + void run(const HtmlReader::HtmlTag &tag); +}; + +class HtmlBreakTagAction : public HtmlTagAction { + +public: + enum BreakType { + BREAK_AT_START = 1, + BREAK_AT_END = 2, + BREAK_AT_START_AND_AT_END = BREAK_AT_START | BREAK_AT_END + }; + HtmlBreakTagAction(HtmlBookReader &reader, BreakType breakType); + void run(const HtmlReader::HtmlTag &tag); + +private: + BreakType 
myBreakType; +}; + +class HtmlPreTagAction : public HtmlTagAction { + +public: + HtmlPreTagAction(HtmlBookReader &reader); + void run(const HtmlReader::HtmlTag &tag); +}; + +class HtmlListTagAction : public HtmlTagAction { + +public: + HtmlListTagAction(HtmlBookReader &reader, int startIndex); + void run(const HtmlReader::HtmlTag &tag); + +private: + int myStartIndex; +}; + +class HtmlListItemTagAction : public HtmlTagAction { + +public: + HtmlListItemTagAction(HtmlBookReader &reader); + void run(const HtmlReader::HtmlTag &tag); +}; + +class HtmlTableTagAction : public HtmlTagAction { + +public: + HtmlTableTagAction(HtmlBookReader &reader); + void run(const HtmlReader::HtmlTag &tag); +}; + +class HtmlStyleTagAction : public HtmlTagAction { + +public: + HtmlStyleTagAction(HtmlBookReader &reader); + void run(const HtmlReader::HtmlTag &tag); +}; + +inline BookReader &HtmlTagAction::bookReader() { return myReader.myBookReader; } + +#endif /* __HTMLTAGACTIONS_H__ */ diff --git a/reader/src/formats/oeb/NCXReader.cpp b/reader/src/formats/oeb/NCXReader.cpp new file mode 100644 index 0000000..e824e16 --- /dev/null +++ b/reader/src/formats/oeb/NCXReader.cpp @@ -0,0 +1,131 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ */ + +#include <cstdlib> + +#include "NCXReader.h" +#include "../util/MiscUtil.h" +#include "../util/EntityFilesCollector.h" + +NCXReader::NCXReader(BookReader &modelReader) : myModelReader(modelReader), myReadState(READ_NONE), myPlayIndex(-65535) { +} + +static const std::string TAG_NAVMAP = "navMap"; +static const std::string TAG_NAVPOINT = "navPoint"; +static const std::string TAG_NAVLABEL = "navLabel"; +static const std::string TAG_CONTENT = "content"; +static const std::string TAG_TEXT = "text"; + +void NCXReader::startElementHandler(const char *fullTag, const char **attributes) { + std::string tag = fullTag; + const std::size_t index = tag.rfind(':'); + if (index != std::string::npos) { + tag = tag.substr(index + 1); + } + switch (myReadState) { + case READ_NONE: + if (TAG_NAVMAP == tag) { + myReadState = READ_MAP; + } + break; + case READ_MAP: + if (TAG_NAVPOINT == tag) { + const char *order = attributeValue(attributes, "playOrder"); + myPointStack.push_back(NavPoint(order != 0 ? std::atoi(order) : myPlayIndex++, myPointStack.size())); + myReadState = READ_POINT; + } + break; + case READ_POINT: + if (TAG_NAVPOINT == tag) { + const char *order = attributeValue(attributes, "playOrder"); + myPointStack.push_back(NavPoint(order != 0 ? 
std::atoi(order) : myPlayIndex++, myPointStack.size())); + } else if (TAG_NAVLABEL == tag) { + myReadState = READ_LABEL; + } else if (TAG_CONTENT == tag) { + const char *src = attributeValue(attributes, "src"); + if (src != 0) { + myPointStack.back().ContentHRef = MiscUtil::decodeHtmlURL(src); + } + } + break; + case READ_LABEL: + if (TAG_TEXT == tag) { + myReadState = READ_TEXT; + } + break; + case READ_TEXT: + break; + } +} + +void NCXReader::endElementHandler(const char *fullTag) { + std::string tag = fullTag; + const std::size_t index = tag.rfind(':'); + if (index != std::string::npos) { + tag = tag.substr(index + 1); + } + switch (myReadState) { + case READ_NONE: + break; + case READ_MAP: + if (TAG_NAVMAP == tag) { + myReadState = READ_NONE; + } + break; + case READ_POINT: + if (TAG_NAVPOINT == tag) { + if (myPointStack.back().Text.empty()) { + myPointStack.back().Text = "..."; + } + myNavigationMap[myPointStack.back().Order] = myPointStack.back(); + myPointStack.pop_back(); + myReadState = myPointStack.empty() ? 
READ_MAP : READ_POINT; + } + case READ_LABEL: + if (TAG_NAVLABEL == tag) { + myReadState = READ_POINT; + } + break; + case READ_TEXT: + if (TAG_TEXT == tag) { + myReadState = READ_LABEL; + } + break; + } +} + +void NCXReader::characterDataHandler(const char *text, std::size_t len) { + if (myReadState == READ_TEXT) { + myPointStack.back().Text.append(text, len); + } +} + +const std::vector<std::string> &NCXReader::externalDTDs() const { + return EntityFilesCollector::Instance().externalDTDs("xhtml"); +} + +const std::map<int,NCXReader::NavPoint> &NCXReader::navigationMap() const { + return myNavigationMap; +} + +NCXReader::NavPoint::NavPoint() { +} + +NCXReader::NavPoint::NavPoint(int order, std::size_t level) : Order(order), Level(level) { +} diff --git a/reader/src/formats/oeb/NCXReader.h b/reader/src/formats/oeb/NCXReader.h new file mode 100644 index 0000000..c10d2ab --- /dev/null +++ b/reader/src/formats/oeb/NCXReader.h @@ -0,0 +1,69 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ */ + +#ifndef __NCXREADER_H__ +#define __NCXREADER_H__ + +#include <map> +#include <vector> + +#include <ZLXMLReader.h> + +#include "../../bookmodel/BookReader.h" + +class NCXReader : public ZLXMLReader { + +public: + struct NavPoint { + NavPoint(); + NavPoint(int order, std::size_t level); + + int Order; + std::size_t Level; + std::string Text; + std::string ContentHRef; + }; + +public: + NCXReader(BookReader &modelReader); + const std::map<int,NavPoint> &navigationMap() const; + +private: + void startElementHandler(const char *tag, const char **attributes); + void endElementHandler(const char *tag); + void characterDataHandler(const char *text, std::size_t len); + const std::vector<std::string> &externalDTDs() const; + +private: + BookReader &myModelReader; + std::map<int,NavPoint> myNavigationMap; + std::vector<NavPoint> myPointStack; + + enum { + READ_NONE, + READ_MAP, + READ_POINT, + READ_LABEL, + READ_TEXT + } myReadState; + + int myPlayIndex; +}; + +#endif /* __NCXREADER_H__ */ diff --git a/reader/src/formats/oeb/OEBBookReader.cpp b/reader/src/formats/oeb/OEBBookReader.cpp new file mode 100644 index 0000000..c4234a7 --- /dev/null +++ b/reader/src/formats/oeb/OEBBookReader.cpp @@ -0,0 +1,273 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#include <algorithm> + +#include <ZLStringUtil.h> +#include <ZLUnicodeUtil.h> +#include <ZLFile.h> +#include <ZLFileImage.h> +#include <ZLXMLNamespace.h> + +#include "OEBBookReader.h" +#include "XHTMLImageFinder.h" +#include "NCXReader.h" +#include "../xhtml/XHTMLReader.h" +#include "../util/MiscUtil.h" +#include "../util/EntityFilesCollector.h" +#include "../../bookmodel/BookModel.h" + +OEBBookReader::OEBBookReader(BookModel &model) : myModelReader(model) { +} + +static const std::string MANIFEST = "manifest"; +static const std::string SPINE = "spine"; +static const std::string GUIDE = "guide"; +static const std::string TOUR = "tour"; +static const std::string SITE = "site"; + +static const std::string ITEM = "item"; +static const std::string ITEMREF = "itemref"; +static const std::string REFERENCE = "reference"; + +static const std::string COVER = "cover"; +static const std::string COVER_IMAGE = "other.ms-coverimage-standard"; + +bool OEBBookReader::isOPFTag(const std::string &expected, const std::string &tag) const { + return expected == tag || testTag(ZLXMLNamespace::OpenPackagingFormat, expected, tag); +} + +void OEBBookReader::startElementHandler(const char *tag, const char **xmlattributes) { + std::string tagString = ZLUnicodeUtil::toLower(tag); + + switch (myState) { + case READ_NONE: + if (isOPFTag(MANIFEST, tagString)) { + myState = READ_MANIFEST; + } else if (isOPFTag(SPINE, tagString)) { + const char *toc = attributeValue(xmlattributes, "toc"); + if (toc != 0) { + myNCXTOCFileName = myIdToHref[toc]; + } + myState = READ_SPINE; + } else if (isOPFTag(GUIDE, tagString)) { + myState = READ_GUIDE; + } else if (isOPFTag(TOUR, tagString)) { + myState = READ_TOUR; + } + break; + case READ_MANIFEST: + if (isOPFTag(ITEM, tagString)) { 
+ const char *href = attributeValue(xmlattributes, "href"); + if (href != 0) { + const std::string sHref = MiscUtil::decodeHtmlURL(href); + const char *id = attributeValue(xmlattributes, "id"); + const char *mediaType = attributeValue(xmlattributes, "media-type"); + if (id != 0) { + myIdToHref[id] = sHref; + } + if (mediaType != 0) { + myHrefToMediatype[sHref] = mediaType; + } + } + } + break; + case READ_SPINE: + if (isOPFTag(ITEMREF, tagString)) { + const char *id = attributeValue(xmlattributes, "idref"); + if (id != 0) { + const std::string &fileName = myIdToHref[id]; + if (!fileName.empty()) { + myHtmlFileNames.push_back(fileName); + } + } + } + break; + case READ_GUIDE: + if (isOPFTag(REFERENCE, tagString)) { + const char *type = attributeValue(xmlattributes, "type"); + const char *title = attributeValue(xmlattributes, "title"); + const char *href = attributeValue(xmlattributes, "href"); + if (href != 0) { + const std::string reference = MiscUtil::decodeHtmlURL(href); + if (title != 0) { + myGuideTOC.push_back(std::make_pair(std::string(title), reference)); + } + if (type != 0) { + if (COVER == type) { + ZLFile imageFile(myFilePrefix + reference); + myCoverFileName = imageFile.path(); + const std::map<std::string,std::string>::const_iterator it = + myHrefToMediatype.find(reference); + const std::string mimeType = + it != myHrefToMediatype.end() ? 
it->second : std::string(); + shared_ptr<const ZLImage> image; + if (ZLStringUtil::stringStartsWith(mimeType, "image/")) { + image = new ZLFileImage(imageFile, 0); + } else { + image = XHTMLImageFinder().readImage(imageFile); + } + if (!image.isNull()) { + const std::string imageName = imageFile.name(false); + myModelReader.setMainTextModel(); + myModelReader.addImageReference(imageName, 0); + myModelReader.addImage(imageName, image); + myModelReader.insertEndOfSectionParagraph(); + } else { + myCoverFileName.erase(); + } + } else if (COVER_IMAGE == type) { + ZLFile imageFile(myFilePrefix + reference); + myCoverFileName = imageFile.path(); + const std::string imageName = imageFile.name(false); + myModelReader.setMainTextModel(); + myModelReader.addImageReference(imageName, 0); + myModelReader.addImage(imageName, new ZLFileImage(imageFile, 0)); + myModelReader.insertEndOfSectionParagraph(); + } + } + } + } + break; + case READ_TOUR: + if (isOPFTag(SITE, tagString)) { + const char *title = attributeValue(xmlattributes, "title"); + const char *href = attributeValue(xmlattributes, "href"); + if ((title != 0) && (href != 0)) { + myTourTOC.push_back(std::make_pair(title, MiscUtil::decodeHtmlURL(href))); + } + } + break; + } +} + +void OEBBookReader::endElementHandler(const char *tag) { + std::string tagString = ZLUnicodeUtil::toLower(tag); + + switch (myState) { + case READ_MANIFEST: + if (isOPFTag(MANIFEST, tagString)) { + myState = READ_NONE; + } + break; + case READ_SPINE: + if (isOPFTag(SPINE, tagString)) { + myState = READ_NONE; + } + break; + case READ_GUIDE: + if (isOPFTag(GUIDE, tagString)) { + myState = READ_NONE; + } + break; + case READ_TOUR: + if (isOPFTag(TOUR, tagString)) { + myState = READ_NONE; + } + break; + case READ_NONE: + break; + } +} + +bool OEBBookReader::readBook(const ZLFile &file) { + myFilePrefix = MiscUtil::htmlDirectoryPrefix(file.path()); + + myIdToHref.clear(); + myHtmlFileNames.clear(); + myNCXTOCFileName.erase(); + 
myCoverFileName.erase(); + myTourTOC.clear(); + myGuideTOC.clear(); + myState = READ_NONE; + + if (!readDocument(file)) { + return false; + } + + myModelReader.setMainTextModel(); + myModelReader.pushKind(REGULAR); + + XHTMLReader xhtmlReader(myModelReader); + bool firstFile = true; + for (std::vector<std::string>::const_iterator it = myHtmlFileNames.begin(); it != myHtmlFileNames.end(); ++it) { + const ZLFile xhtmlFile(myFilePrefix + *it); + if (firstFile && myCoverFileName == xhtmlFile.path()) { + continue; + } + if (!firstFile) { + myModelReader.insertEndOfSectionParagraph(); + } + xhtmlReader.readFile(xhtmlFile, *it); + firstFile = false; + } + + generateTOC(xhtmlReader); + + return true; +} + +void OEBBookReader::generateTOC(const XHTMLReader &xhtmlReader) { + if (!myNCXTOCFileName.empty()) { + NCXReader ncxReader(myModelReader); + if (ncxReader.readDocument(ZLFile(myFilePrefix + myNCXTOCFileName))) { + const std::map<int,NCXReader::NavPoint> navigationMap = ncxReader.navigationMap(); + if (!navigationMap.empty()) { + std::size_t level = 0; + for (std::map<int,NCXReader::NavPoint>::const_iterator it = navigationMap.begin(); it != navigationMap.end(); ++it) { + const NCXReader::NavPoint &point = it->second; + int index = myModelReader.model().label(xhtmlReader.normalizedReference(point.ContentHRef)).ParagraphNumber; + while (level > point.Level) { + myModelReader.endContentsParagraph(); + --level; + } + while (++level <= point.Level) { + myModelReader.beginContentsParagraph(-2); + myModelReader.addContentsData("..."); + } + myModelReader.beginContentsParagraph(index); + myModelReader.addContentsData(point.Text); + } + while (level > 0) { + myModelReader.endContentsParagraph(); + --level; + } + return; + } + } + } + + std::vector<std::pair<std::string,std::string> > &toc = myTourTOC.empty() ? 
myGuideTOC : myTourTOC; + for (std::vector<std::pair<std::string,std::string> >::const_iterator it = toc.begin(); it != toc.end(); ++it) { + int index = myModelReader.model().label(it->second).ParagraphNumber; + if (index != -1) { + myModelReader.beginContentsParagraph(index); + myModelReader.addContentsData(it->first); + myModelReader.endContentsParagraph(); + } + } +} + +bool OEBBookReader::processNamespaces() const { + return true; +} + +const std::vector<std::string> &OEBBookReader::externalDTDs() const { + return EntityFilesCollector::Instance().externalDTDs("xhtml"); +} diff --git a/reader/src/formats/oeb/OEBBookReader.h b/reader/src/formats/oeb/OEBBookReader.h new file mode 100644 index 0000000..092f269 --- /dev/null +++ b/reader/src/formats/oeb/OEBBookReader.h @@ -0,0 +1,70 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ */ + +#ifndef __OEBBOOKREADER_H__ +#define __OEBBOOKREADER_H__ + +#include <map> +#include <vector> +#include <string> + +#include <ZLXMLReader.h> + +#include "../../bookmodel/BookReader.h" + +class XHTMLReader; + +class OEBBookReader : public ZLXMLReader { + +public: + OEBBookReader(BookModel &model); + bool readBook(const ZLFile &file); + +private: + void startElementHandler(const char *tag, const char **attributes); + void endElementHandler(const char *tag); + bool processNamespaces() const; + bool isOPFTag(const std::string &expected, const std::string &tag) const; + const std::vector<std::string> &externalDTDs() const; + + void generateTOC(const XHTMLReader &xhtmlReader); + +private: + enum ReaderState { + READ_NONE, + READ_MANIFEST, + READ_SPINE, + READ_GUIDE, + READ_TOUR + }; + + BookReader myModelReader; + ReaderState myState; + + std::string myFilePrefix; + std::map<std::string,std::string> myIdToHref; + std::map<std::string,std::string> myHrefToMediatype; + std::vector<std::string> myHtmlFileNames; + std::string myNCXTOCFileName; + std::string myCoverFileName; + std::vector<std::pair<std::string,std::string> > myTourTOC; + std::vector<std::pair<std::string,std::string> > myGuideTOC; +}; + +#endif /* __OEBBOOKREADER_H__ */ diff --git a/reader/src/formats/oeb/OEBCoverReader.cpp b/reader/src/formats/oeb/OEBCoverReader.cpp new file mode 100644 index 0000000..842de30 --- /dev/null +++ b/reader/src/formats/oeb/OEBCoverReader.cpp @@ -0,0 +1,136 @@ +/* + * Copyright (C) 2009-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#include <ZLFile.h> +#include <ZLFileImage.h> +#include <ZLXMLNamespace.h> + +#include "OEBCoverReader.h" +#include "XHTMLImageFinder.h" + +#include "../util/MiscUtil.h" + +OEBCoverReader::OEBCoverReader() { +} + +shared_ptr<const ZLImage> OEBCoverReader::readCover(const ZLFile &file) { + myPathPrefix = MiscUtil::htmlDirectoryPrefix(file.path()); + myReadState = READ_NOTHING; + myImage.reset(); + myCoverXHTML.erase(); + readDocument(file); + if (myImage.isNull() && !myCoverXHTML.empty()) { + const ZLFile coverFile(myCoverXHTML); + const std::string ext = coverFile.extension(); + if (ext == "gif" || ext == "jpeg" || ext == "jpg") { + myImage = new ZLFileImage(coverFile, 0); + } else { + myImage = XHTMLImageFinder().readImage(coverFile); + } + } + return myImage; +} + +static const std::string METADATA = "metadata"; +static const std::string META = "meta"; +static const std::string MANIFEST = "manifest"; +static const std::string ITEM = "item"; +static const std::string GUIDE = "guide"; +static const std::string REFERENCE = "reference"; +static const std::string COVER = "cover"; +static const std::string COVER_IMAGE = "other.ms-coverimage-standard"; + +bool OEBCoverReader::processNamespaces() const { + return true; +} + +void OEBCoverReader::startElementHandler(const char *tag, const char **attributes) { + switch (myReadState) { + case READ_NOTHING: + if (GUIDE == tag) { + myReadState = READ_GUIDE; + } else if (MANIFEST == tag && !myCoverId.empty()) { + myReadState = READ_MANIFEST; + } else if (testTag(ZLXMLNamespace::OpenPackagingFormat, METADATA, tag)) { + myReadState = READ_METADATA; + } + break; + case READ_GUIDE: + if (REFERENCE == tag) { + const char *type = 
attributeValue(attributes, "type"); + if (type != 0) { + if (COVER == type) { + const char *href = attributeValue(attributes, "href"); + if (href != 0) { + myCoverXHTML = myPathPrefix + MiscUtil::decodeHtmlURL(href); + interrupt(); + } + } else if (COVER_IMAGE == type) { + createImage(attributeValue(attributes, "href")); + } + } + } + break; + case READ_METADATA: + if (testTag(ZLXMLNamespace::OpenPackagingFormat, META, tag)) { + const char *name = attributeValue(attributes, "name"); + if (name != 0 && COVER == name) { + myCoverId = attributeValue(attributes, "content"); + } + } + break; + case READ_MANIFEST: + if (ITEM == tag) { + const char *id = attributeValue(attributes, "id"); + if (id != 0 && myCoverId == id) { + createImage(attributeValue(attributes, "href")); + } + } + break; + } +} + +void OEBCoverReader::createImage(const char *href) { + if (href != 0) { + myImage = new ZLFileImage(ZLFile(myPathPrefix + MiscUtil::decodeHtmlURL(href)), 0); + interrupt(); + } +} + +void OEBCoverReader::endElementHandler(const char *tag) { + switch (myReadState) { + case READ_NOTHING: + break; + case READ_GUIDE: + if (GUIDE == tag) { + myReadState = READ_NOTHING; + } + break; + case READ_METADATA: + if (testTag(ZLXMLNamespace::OpenPackagingFormat, METADATA, tag)) { + myReadState = READ_NOTHING; + } + break; + case READ_MANIFEST: + if (MANIFEST == tag) { + myReadState = READ_NOTHING; + } + break; + } +} diff --git a/reader/src/formats/oeb/OEBCoverReader.h b/reader/src/formats/oeb/OEBCoverReader.h new file mode 100644 index 0000000..e1f96b5 --- /dev/null +++ b/reader/src/formats/oeb/OEBCoverReader.h @@ -0,0 +1,56 @@ +/* + * Copyright (C) 2009-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#ifndef __OEBCOVERREADER_H__ +#define __OEBCOVERREADER_H__ + +#include <vector> + +#include <shared_ptr.h> +#include <ZLXMLReader.h> + +class ZLImage; + +class OEBCoverReader : public ZLXMLReader { + +public: + OEBCoverReader(); + shared_ptr<const ZLImage> readCover(const ZLFile &file); + +private: + void startElementHandler(const char *tag, const char **attributes); + void endElementHandler(const char *tag); + bool processNamespaces() const; + + void createImage(const char *href); + +private: + shared_ptr<const ZLImage> myImage; + std::string myPathPrefix; + std::string myCoverXHTML; + std::string myCoverId; + enum { + READ_NOTHING, + READ_METADATA, + READ_MANIFEST, + READ_GUIDE + } myReadState; +}; + +#endif /* __OEBCOVERREADER_H__ */ diff --git a/reader/src/formats/oeb/OEBMetaInfoReader.cpp b/reader/src/formats/oeb/OEBMetaInfoReader.cpp new file mode 100644 index 0000000..f9eb82d --- /dev/null +++ b/reader/src/formats/oeb/OEBMetaInfoReader.cpp @@ -0,0 +1,194 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#include <cstdlib> + +#include <ZLStringUtil.h> +#include <ZLUnicodeUtil.h> +#include <ZLLogger.h> +#include <ZLXMLNamespace.h> + +#include "OEBMetaInfoReader.h" +#include "../util/EntityFilesCollector.h" + +#include "../../library/Book.h" + +OEBMetaInfoReader::OEBMetaInfoReader(Book &book) : myBook(book) { + myBook.removeAllAuthors(); + myBook.setTitle(""); + myBook.removeAllTags(); +} + +static const std::string METADATA = "metadata"; +static const std::string DC_METADATA = "dc-metadata"; +static const std::string META = "meta"; +static const std::string AUTHOR_ROLE = "aut"; + +void OEBMetaInfoReader::characterDataHandler(const char *text, std::size_t len) { + switch (myReadState) { + case READ_NONE: + case READ_METADATA: + break; + case READ_AUTHOR: + case READ_AUTHOR2: + case READ_SUBJECT: + case READ_LANGUAGE: + case READ_TITLE: + myBuffer.append(text, len); + break; + } +} + +bool OEBMetaInfoReader::testDCTag(const std::string &name, const std::string &tag) const { + return + testTag(ZLXMLNamespace::DublinCore, name, tag) || + testTag(ZLXMLNamespace::DublinCoreLegacy, name, tag); +} + +bool OEBMetaInfoReader::isNSName(const std::string &fullName, const std::string &shortName, const std::string &fullNSId) const { + const int prefixLength = fullName.length() - shortName.length() - 1; + if (prefixLength <= 0 || + fullName[prefixLength] != ':' || + !ZLStringUtil::stringEndsWith(fullName, shortName)) { + return false; + } + const std::map<std::string,std::string> &namespaceMap = namespaces(); + std::map<std::string,std::string>::const_iterator iter = + namespaceMap.find(fullName.substr(0, prefixLength)); + return iter != namespaceMap.end() && iter->second == fullNSId; +} + +void 
OEBMetaInfoReader::startElementHandler(const char *tag, const char **attributes) { + const std::string tagString = ZLUnicodeUtil::toLower(tag); + switch (myReadState) { + default: + break; + case READ_NONE: + if (testTag(ZLXMLNamespace::OpenPackagingFormat, METADATA, tagString) || + DC_METADATA == tagString) { + myReadState = READ_METADATA; + } + break; + case READ_METADATA: + if (testDCTag("title", tagString)) { + myReadState = READ_TITLE; + } else if (testDCTag("creator", tagString)) { + const char *role = attributeValue(attributes, "role"); + if (role == 0) { + myReadState = READ_AUTHOR2; + } else if (AUTHOR_ROLE == role) { + myReadState = READ_AUTHOR; + } + } else if (testDCTag("subject", tagString)) { + myReadState = READ_SUBJECT; + } else if (testDCTag("language", tagString)) { + myReadState = READ_LANGUAGE; + } else if (testTag(ZLXMLNamespace::OpenPackagingFormat, META, tagString)) { + const char *name = attributeValue(attributes, "name"); + const char *content = attributeValue(attributes, "content"); + if (name != 0 && content != 0) { + std::string sName = name; + if (sName == "calibre:series" || isNSName(sName, "series", ZLXMLNamespace::CalibreMetadata)) { + myBook.setSeries(content, myBook.indexInSeries()); + } else if (sName == "calibre:series_index" || isNSName(sName, "series_index", ZLXMLNamespace::CalibreMetadata)) { + myBook.setSeries(myBook.seriesTitle(), std::string(content)); + } + } + } + break; + } +} + +void OEBMetaInfoReader::endElementHandler(const char *tag) { + const std::string tagString = ZLUnicodeUtil::toLower(tag); + ZLUnicodeUtil::utf8Trim(myBuffer); + switch (myReadState) { + case READ_NONE: + break; + case READ_METADATA: + if (testTag(ZLXMLNamespace::OpenPackagingFormat, METADATA, tagString) || DC_METADATA == tagString) { + interrupt(); + myReadState = READ_NONE; + return; + } + break; + case READ_AUTHOR: + if (!myBuffer.empty()) { + myAuthorList.push_back(myBuffer); + } + break; + case READ_AUTHOR2: + if (!myBuffer.empty()) { + 
myAuthorList2.push_back(myBuffer); + } + break; + case READ_SUBJECT: + if (!myBuffer.empty()) { + myBook.addTag(myBuffer); + } + break; + case READ_TITLE: + if (!myBuffer.empty()) { + myBook.setTitle(myBuffer); + } + break; + case READ_LANGUAGE: + if (!myBuffer.empty()) { + int index = myBuffer.find('-'); + if (index >= 0) { + myBuffer = myBuffer.substr(0, index); + } + index = myBuffer.find('_'); + if (index >= 0) { + myBuffer = myBuffer.substr(0, index); + } + myBook.setLanguage(myBuffer); + } + break; + } + myBuffer.erase(); + myReadState = READ_METADATA; +} + +bool OEBMetaInfoReader::processNamespaces() const { + return true; +} + +bool OEBMetaInfoReader::readMetaInfo(const ZLFile &file) { + myReadState = READ_NONE; + if (!readDocument(file)) { + ZLLogger::Instance().println("epub", "Failure while reading info from " + file.path()); + return false; + } + + if (!myAuthorList.empty()) { + for (std::vector<std::string>::const_iterator it = myAuthorList.begin(); it != myAuthorList.end(); ++it) { + myBook.addAuthor(*it); + } + } else { + for (std::vector<std::string>::const_iterator it = myAuthorList2.begin(); it != myAuthorList2.end(); ++it) { + myBook.addAuthor(*it); + } + } + return true; +} + +const std::vector<std::string> &OEBMetaInfoReader::externalDTDs() const { + return EntityFilesCollector::Instance().externalDTDs("xhtml"); +} diff --git a/reader/src/formats/oeb/OEBMetaInfoReader.h b/reader/src/formats/oeb/OEBMetaInfoReader.h new file mode 100644 index 0000000..2337c50 --- /dev/null +++ b/reader/src/formats/oeb/OEBMetaInfoReader.h @@ -0,0 +1,63 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+
+#ifndef __OEBMETAINFOREADER_H__
+#define __OEBMETAINFOREADER_H__
+
+#include <vector>
+
+#include <ZLXMLReader.h>
+
+class Book;
+
+// XML reader that fills a Book with the title, authors, tags, language and
+// series information found in the metadata section of an OPF file.
+class OEBMetaInfoReader : public ZLXMLReader {
+
+public:
+	// Keeps a reference to `book`; the book must outlive the reader.
+	OEBMetaInfoReader(Book &book);
+	// Parses `file`; returns false if the document could not be read.
+	bool readMetaInfo(const ZLFile &file);
+
+	// ZLXMLReader callbacks implementing the state machine below.
+	void startElementHandler(const char *tag, const char **attributes);
+	void endElementHandler(const char *tag);
+	void characterDataHandler(const char *text, std::size_t len);
+	bool processNamespaces() const;
+	const std::vector<std::string> &externalDTDs() const;
+
+private:
+	// True if `tag` is the Dublin Core element `name`.
+	bool testDCTag(const std::string &name, const std::string &tag) const;
+	// True if `fullName` is "<prefix>:<shortName>" with <prefix> bound to `fullNSId`.
+	bool isNSName(const std::string &fullName, const std::string &shortName, const std::string &fullNSId) const;
+
+private:
+	Book &myBook;
+
+	// Which element's text is currently being collected.
+	enum {
+		READ_NONE,
+		READ_METADATA,
+		READ_AUTHOR,
+		READ_AUTHOR2,
+		READ_TITLE,
+		READ_SUBJECT,
+		READ_LANGUAGE,
+	} myReadState;
+
+	// Text of the element currently being read.
+	std::string myBuffer;
+	// Creators with role "aut".
+	std::vector<std::string> myAuthorList;
+	// Creators without a role attribute; used only if myAuthorList is empty.
+	std::vector<std::string> myAuthorList2;
+};
+
+#endif /* __OEBMETAINFOREADER_H__ */
diff --git a/reader/src/formats/oeb/OEBPlugin.cpp b/reader/src/formats/oeb/OEBPlugin.cpp
new file mode 100644
index 0000000..96970c1
--- /dev/null
+++ b/reader/src/formats/oeb/OEBPlugin.cpp
@@ -0,0 +1,149 @@
+/*
+ * Copyright (C) 2004-2012 Geometer Plus <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either
version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#include <ZLFile.h> +#include <ZLImage.h> +#include <ZLStringUtil.h> +#include <ZLUnicodeUtil.h> +#include <ZLDir.h> +#include <ZLInputStream.h> +#include <ZLLogger.h> +#include <ZLMimeType.h> + +#include "OEBPlugin.h" +#include "OEBMetaInfoReader.h" +#include "OEBBookReader.h" +#include "OEBCoverReader.h" +#include "OEBTextStream.h" +#include "../../bookmodel/BookModel.h" +#include "../../library/Book.h" + +static const std::string OPF = "opf"; +static const std::string OEBZIP = "oebzip"; +static const std::string EPUB = "epub"; + +class ContainerFileReader : public ZLXMLReader { + +public: + const std::string &rootPath() const; + +private: + void startElementHandler(const char *tag, const char **attributes); + +private: + std::string myRootPath; +}; + +const std::string &ContainerFileReader::rootPath() const { + return myRootPath; +} + +void ContainerFileReader::startElementHandler(const char *tag, const char **attributes) { + const std::string tagString = ZLUnicodeUtil::toLower(tag); + if (tagString == "rootfile") { + const char *path = attributeValue(attributes, "full-path"); + if (path != 0) { + myRootPath = path; + interrupt(); + } + } +} + +OEBPlugin::~OEBPlugin() { +} + +bool OEBPlugin::providesMetaInfo() const { + return true; +} + +bool OEBPlugin::acceptsFile(const ZLFile &file) const { + shared_ptr<ZLMimeType> mimeType = file.mimeType(); + const std::string &extension = file.extension(); + if 
(!mimeType.isNull() && mimeType != ZLMimeType::EMPTY) { + return + mimeType == ZLMimeType::APPLICATION_EPUB_ZIP || + (mimeType == ZLMimeType::APPLICATION_XML && extension == OPF) || + (mimeType == ZLMimeType::APPLICATION_ZIP && extension == OEBZIP); + } + return extension == OPF || extension == OEBZIP || extension == EPUB; +} + +ZLFile OEBPlugin::opfFile(const ZLFile &oebFile) { + //ZLLogger::Instance().registerClass("epub"); + + if (oebFile.extension() == OPF) { + return oebFile; + } + + ZLLogger::Instance().println("epub", "Looking for opf file in " + oebFile.path()); + + shared_ptr<ZLDir> oebDir = oebFile.directory(); + if (!oebDir.isNull()) { + const ZLFile containerInfoFile(oebDir->itemPath("META-INF/container.xml")); + if (containerInfoFile.exists()) { + ZLLogger::Instance().println("epub", "Found container file " + containerInfoFile.path()); + ContainerFileReader reader; + reader.readDocument(containerInfoFile); + const std::string &opfPath = reader.rootPath(); + ZLLogger::Instance().println("epub", "opf path = " + opfPath); + if (!opfPath.empty()) { + return ZLFile(oebDir->itemPath(opfPath)); + } + } + } + + oebFile.forceArchiveType(ZLFile::ZIP); + shared_ptr<ZLDir> zipDir = oebFile.directory(false); + if (zipDir.isNull()) { + ZLLogger::Instance().println("epub", "Couldn't open zip archive"); + return ZLFile::NO_FILE; + } + std::vector<std::string> fileNames; + zipDir->collectFiles(fileNames, false); + for (std::vector<std::string>::const_iterator it = fileNames.begin(); it != fileNames.end(); ++it) { + ZLLogger::Instance().println("epub", "Item: " + *it); + if (ZLStringUtil::stringEndsWith(*it, ".opf")) { + return ZLFile(zipDir->itemPath(*it)); + } + } + ZLLogger::Instance().println("epub", "Opf file not found"); + return ZLFile::NO_FILE; +} + +bool OEBPlugin::readMetaInfo(Book &book) const { + const ZLFile &file = book.file(); + return OEBMetaInfoReader(book).readMetaInfo(opfFile(file)); +} + +bool OEBPlugin::readModel(BookModel &model) const { + const 
ZLFile &file = model.book()->file(); + return OEBBookReader(model).readBook(opfFile(file)); +} + +shared_ptr<const ZLImage> OEBPlugin::coverImage(const ZLFile &file) const { + return OEBCoverReader().readCover(opfFile(file)); +} + +bool OEBPlugin::readLanguageAndEncoding(Book &book) const { + if (book.language().empty()) { + shared_ptr<ZLInputStream> oebStream = new OEBTextStream(opfFile(book.file())); + detectLanguage(book, *oebStream, book.encoding()); + } + return true; +} diff --git a/reader/src/formats/oeb/OEBPlugin.h b/reader/src/formats/oeb/OEBPlugin.h new file mode 100644 index 0000000..a515208 --- /dev/null +++ b/reader/src/formats/oeb/OEBPlugin.h @@ -0,0 +1,40 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ */
+
+#ifndef __OEBPLUGIN_H__
+#define __OEBPLUGIN_H__
+
+#include "../FormatPlugin.h"
+
+// Format plugin for OEB/EPUB books: bare .opf files as well as .oebzip and
+// .epub archives.
+class OEBPlugin : public FormatPlugin {
+
+public:
+	// Returns the OPF package file belonging to `oebFile`, or
+	// ZLFile::NO_FILE when none can be located.
+	static ZLFile opfFile(const ZLFile &oebFile);
+
+public:
+	~OEBPlugin();
+	bool providesMetaInfo() const;
+	bool acceptsFile(const ZLFile &file) const;
+	bool readMetaInfo(Book &book) const;
+	bool readLanguageAndEncoding(Book &book) const;
+	bool readModel(BookModel &model) const;
+	shared_ptr<const ZLImage> coverImage(const ZLFile &file) const;
+};
+
+#endif /* __OEBPLUGIN_H__ */
diff --git a/reader/src/formats/oeb/OEBTextStream.cpp b/reader/src/formats/oeb/OEBTextStream.cpp
new file mode 100644
index 0000000..4dbfa47
--- /dev/null
+++ b/reader/src/formats/oeb/OEBTextStream.cpp
@@ -0,0 +1,101 @@
+/*
+ * Copyright (C) 2008-2012 Geometer Plus <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */ + +#include <map> + +#include <ZLFile.h> +#include <ZLXMLReader.h> +#include <ZLUnicodeUtil.h> + +#include "OEBTextStream.h" +#include "../util/MiscUtil.h" +#include "../util/XMLTextStream.h" + +class XHTMLFilesCollector : public ZLXMLReader { + +public: + XHTMLFilesCollector(std::vector<std::string> &xhtmlFileNames); + +private: + void startElementHandler(const char *tag, const char **attributes); + void endElementHandler(const char *tag); + +private: + std::vector<std::string> &myXHTMLFileNames; + std::map<std::string,std::string> myIdToHref; + enum { + READ_NONE, + READ_MANIFEST, + READ_SPINE + } myState; +}; + +XHTMLFilesCollector::XHTMLFilesCollector(std::vector<std::string> &xhtmlFileNames) : myXHTMLFileNames(xhtmlFileNames), myState(READ_NONE) { +} + +static const std::string MANIFEST = "manifest"; +static const std::string SPINE = "spine"; +static const std::string ITEM = "item"; +static const std::string ITEMREF = "itemref"; + +void XHTMLFilesCollector::startElementHandler(const char *tag, const char **xmlattributes) { + const std::string tagString = ZLUnicodeUtil::toLower(tag); + if (MANIFEST == tagString) { + myState = READ_MANIFEST; + } else if (SPINE == tagString) { + myState = READ_SPINE; + } else if ((myState == READ_MANIFEST) && (ITEM == tagString)) { + const char *id = attributeValue(xmlattributes, "id"); + const char *href = attributeValue(xmlattributes, "href"); + if ((id != 0) && (href != 0)) { + myIdToHref[id] = href; + } + } else if ((myState == READ_SPINE) && (ITEMREF == tagString)) { + const char *id = attributeValue(xmlattributes, "idref"); + if (id != 0) { + const std::string &fileName = myIdToHref[id]; + if (!fileName.empty()) { + myXHTMLFileNames.push_back(fileName); + } + } + } +} + +void XHTMLFilesCollector::endElementHandler(const char *tag) { + if (SPINE == ZLUnicodeUtil::toLower(tag)) { + interrupt(); + } +} + +OEBTextStream::OEBTextStream(const ZLFile &opfFile) { + myFilePrefix = MiscUtil::htmlDirectoryPrefix(opfFile.path()); 
+ XHTMLFilesCollector(myXHTMLFileNames).readDocument(opfFile); +} + +void OEBTextStream::resetToStart() { + myIndex = 0; +} + +shared_ptr<ZLInputStream> OEBTextStream::nextStream() { + if (myIndex >= myXHTMLFileNames.size()) { + return 0; + } + ZLFile xhtmlFile(myFilePrefix + myXHTMLFileNames[myIndex++]); + return new XMLTextStream(xhtmlFile.inputStream(), "body"); +} diff --git a/reader/src/formats/oeb/OEBTextStream.h b/reader/src/formats/oeb/OEBTextStream.h new file mode 100644 index 0000000..6ddd2c9 --- /dev/null +++ b/reader/src/formats/oeb/OEBTextStream.h @@ -0,0 +1,43 @@ +/* + * Copyright (C) 2008-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ */
+
+#ifndef __OEBTEXTSTREAM_H__
+#define __OEBTEXTSTREAM_H__
+
+#include <vector>
+#include <string>
+
+#include "../util/MergedStream.h"
+
+// Merged stream over the text of all spine documents of an OPF package,
+// delivered one file at a time through nextStream().
+class OEBTextStream : public MergedStream {
+
+public:
+	OEBTextStream(const ZLFile &opfFile);
+
+private:
+	void resetToStart();
+	shared_ptr<ZLInputStream> nextStream();
+
+private:
+	// Prefix prepended to each file name to form its path.
+	std::string myFilePrefix;
+	// Spine documents, in reading order.
+	std::vector<std::string> myXHTMLFileNames;
+	// Index of the next entry of myXHTMLFileNames to hand out.
+	std::size_t myIndex;
+};
+
+#endif /* __OEBTEXTSTREAM_H__ */
diff --git a/reader/src/formats/oeb/XHTMLImageFinder.cpp b/reader/src/formats/oeb/XHTMLImageFinder.cpp
new file mode 100644
index 0000000..6a449c9
--- /dev/null
+++ b/reader/src/formats/oeb/XHTMLImageFinder.cpp
@@ -0,0 +1,54 @@
+/*
+ * Copyright (C) 2009-2012 Geometer Plus <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */ + +#include <ZLFile.h> +#include <ZLFileImage.h> +#include <ZLXMLNamespace.h> + +#include "XHTMLImageFinder.h" +#include "../util/MiscUtil.h" + +static const std::string TAG_IMG = "img"; +static const std::string TAG_IMAGE = "image"; + +shared_ptr<const ZLImage> XHTMLImageFinder::readImage(const ZLFile &file) { + myImage.reset(); + myPathPrefix = MiscUtil::htmlDirectoryPrefix(file.path()); + readDocument(file); + return myImage; +} + +bool XHTMLImageFinder::processNamespaces() const { + return true; +} + +void XHTMLImageFinder::startElementHandler(const char *tag, const char **attributes) { + const char *reference = 0; + if (TAG_IMG == tag) { + reference = attributeValue(attributes, "src"); + } else if (TAG_IMAGE == tag) { + reference = attributeValue( + attributes, NamespaceAttributeNamePredicate(ZLXMLNamespace::XLink, "href") + ); + } + if (reference != 0) { + myImage = new ZLFileImage(ZLFile(myPathPrefix + reference), 0); + interrupt(); + } +} diff --git a/reader/src/formats/oeb/XHTMLImageFinder.h b/reader/src/formats/oeb/XHTMLImageFinder.h new file mode 100644 index 0000000..28e53f2 --- /dev/null +++ b/reader/src/formats/oeb/XHTMLImageFinder.h @@ -0,0 +1,43 @@ +/* + * Copyright (C) 2009-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ */
+
+#ifndef __XHTMLIMAGEFINDER_H__
+#define __XHTMLIMAGEFINDER_H__
+
+#include <shared_ptr.h>
+#include <ZLXMLReader.h>
+
+class ZLFile;
+class ZLImage;
+
+// XML reader that finds the first <img>/<image> reference in a document.
+class XHTMLImageFinder : public ZLXMLReader {
+
+public:
+	// Returns the first referenced image, or a null pointer if none is found.
+	shared_ptr<const ZLImage> readImage(const ZLFile &file);
+
+private:
+	bool processNamespaces() const;
+	void startElementHandler(const char *tag, const char **attributes);
+
+private:
+	// Prefix prepended to the image reference to form its path.
+	std::string myPathPrefix;
+	// Result of the current search; reset at the start of readImage().
+	shared_ptr<const ZLImage> myImage;
+};
+
+#endif /* __XHTMLIMAGEFINDER_H__ */
diff --git a/reader/src/formats/openreader/ORBookReader.cpp b/reader/src/formats/openreader/ORBookReader.cpp
new file mode 100644
index 0000000..d494b7f
--- /dev/null
+++ b/reader/src/formats/openreader/ORBookReader.cpp
@@ -0,0 +1,185 @@
+/*
+ * Copyright (C) 2004-2012 Geometer Plus <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+
+#include <cstring>
+#include <cstdlib>
+#include <algorithm>
+
+#include <ZLUnicodeUtil.h>
+#include <ZLFileImage.h>
+
+#include "ORBookReader.h"
+#include "../xhtml/XHTMLReader.h"
+#include "../util/MiscUtil.h"
+#include "../../bookmodel/BookModel.h"
+#include "../../library/Book.h"
+
+// NOTE(review): myState and myTOCLevel are not initialised here; within this
+// file myState is first assigned in readBook() before parsing starts.
+ORBookReader::ORBookReader(BookModel &model) : myModelReader(model) {
+}
+
+// Only the text of a table-of-contents <title> is collected.
+void ORBookReader::characterDataHandler(const char *data, std::size_t len) {
+	if (myState == READ_TOCTITLE) {
+		myTOCTitle.append(data, len);
+	}
+}
+
+static const std::string TAG_RESOURCES = "resources";
+static const std::string TAG_USERSET = "userset";
+static const std::string TAG_NAVIGATION = "primarynav";
+
+static const std::string TAG_SPINE = "spine";
+static const std::string TAG_COVER = "cover";
+
+static const std::string TAG_ITEM = "item";
+static const std::string TAG_ITEMREF = "itemref";
+static const std::string TAG_POINTER = "pointer";
+static const std::string TAG_TITLE = "title";
+
+// NOTE(review): not referenced anywhere in the visible part of this file;
+// the handler below compares against ZLMimeType::APPLICATION_OR_XML instead.
+static const std::string xhtmlMediaType = "application/x-orp-bcd1+xml";
+
+// State machine over the package file:
+//  - <resources>/<item>: records resid -> resource and classifies the item
+//    as an HTML file or an image by its media type;
+//  - <userset>/<spine>: splits the space-separated "residrefs" into the
+//    reading order; <userset>/<cover>: remembers the cover reference;
+//  - <primarynav>/<pointer>/<title>: collects table-of-contents entries.
+void ORBookReader::startElementHandler(const char *tag, const char **xmlattributes) {
+	const std::string tagString = ZLUnicodeUtil::toLower(tag);
+	if (TAG_RESOURCES == tagString) {
+		myState = READ_RESOURCES;
+	} else if (TAG_USERSET == tagString) {
+		myState = READ_USERSET;
+	} else if ((myState == READ_RESOURCES) && (TAG_ITEM == tagString)) {
+		const char *resid = attributeValue(xmlattributes, "resid");
+		const char *resource = attributeValue(xmlattributes, "resource");
+		shared_ptr<ZLMimeType> mediaType = ZLMimeType::get(attributeValue(xmlattributes, "media-type"));
+		if ((resid != 0) && (resource != 0)) {
+			myResources[resid] = resource;
+			if (!mediaType.isNull() && mediaType != ZLMimeType::EMPTY) {
+				if (ZLMimeType::APPLICATION_OR_XML == mediaType) {
+					myHtmlFileIDs.insert(resid);
+				} else if (ZLMimeType::isImage(mediaType)) {
+					myImageIDs[resid] = mediaType;
+				}
+			}
+		}
+	} else if (myState == READ_USERSET) {
+		if (TAG_NAVIGATION == tagString) {
+			myState = READ_NAVIGATION;
+		} else if (TAG_SPINE == tagString) {
+			const char *residrefs = attributeValue(xmlattributes, "residrefs");
+			if (residrefs != 0) {
+				// Split on single spaces, skipping empty tokens.
+				while (1) {
+					const char *nextSpace = std::strchr(residrefs, ' ');
+					if (nextSpace == 0) {
+						if (*residrefs != '\0') {
+							myHtmlFilesOrder.push_back(residrefs);
+						}
+						break;
+					}
+					if (nextSpace != residrefs) {
+						myHtmlFilesOrder.push_back(std::string(residrefs, nextSpace - residrefs));
+					}
+					residrefs = nextSpace + 1;
+				}
+			}
+		} else if (TAG_COVER == tagString) {
+			const char *residrefs = attributeValue(xmlattributes, "residrefs");
+			if (residrefs != 0) {
+				myCoverReference = residrefs;
+			}
+		}
+	} else if (myState == READ_NAVIGATION && TAG_POINTER == tagString) {
+		const char *ref = attributeValue(xmlattributes, "elemrefs");
+		const char *level = attributeValue(xmlattributes, "level");
+		if (ref != 0 && level != 0) {
+			myTOCReference = ref;
+			myTOCLevel = std::atoi(level);
+			myState = READ_POINTER;
+		}
+	} else if (myState == READ_POINTER && TAG_TITLE == tagString) {
+		myState = READ_TOCTITLE;
+	}
+}
+
+// Unwinds the state machine one level per closing tag; a closed TOC <title>
+// emits one TOCItem built from the current pointer and the collected text.
+void ORBookReader::endElementHandler(const char *tag) {
+	const std::string tagString = ZLUnicodeUtil::toLower(tag);
+	if (TAG_RESOURCES == tagString || TAG_USERSET == tagString) {
+		myState = READ_NONE;
+	} else if (myState == READ_NAVIGATION && TAG_NAVIGATION == tagString) {
+		myState = READ_USERSET;
+	} else if (myState == READ_POINTER && TAG_POINTER == tagString) {
+		myState = READ_NAVIGATION;
+	} else if (myState == READ_TOCTITLE && TAG_TITLE == tagString) {
+		myTOC.push_back(TOCItem(myTOCReference, myTOCTitle, myTOCLevel));
+		myTOCTitle.erase();
+		myState = READ_POINTER;
+	}
+}
+
+// Parses the book's package file and builds the model: cover reference,
+// spine documents as main text, table of contents, remaining HTML resources
+// as footnote models, and images. Returns false if the package file cannot
+// be read.
+bool ORBookReader::readBook() {
+	const ZLFile &file = myModelReader.model().book()->file();
+	myFilePrefix = MiscUtil::htmlDirectoryPrefix(file.path());
+
+	myResources.clear();
+	myCoverReference.erase();
+	myHtmlFileIDs.clear();
+	myImageIDs.clear();
+	myHtmlFilesOrder.clear();
+	myTOC.clear();
+	myState = READ_NONE;
+
+	if (!readDocument(file)) {
+		return false;
+	}
+
+	myModelReader.setMainTextModel();
+	myModelReader.pushKind(REGULAR);
+
+	if (!myCoverReference.empty()) {
+		myModelReader.addImageReference(myCoverReference);
+	}
+
+	// Spine documents go into the main text; removing their ids leaves only
+	// the non-spine HTML resources in myHtmlFileIDs for the footnote pass.
+	for (std::vector<std::string>::const_iterator it = myHtmlFilesOrder.begin(); it != myHtmlFilesOrder.end(); ++it) {
+		myHtmlFileIDs.erase(*it);
+		XHTMLReader(myModelReader).readFile(ZLFile(myFilePrefix + myResources[*it]), *it);
+	}
+
+	// Entries whose reference has no label in the model (paragraph number
+	// -1) are skipped. `level` is one more than the number of contents
+	// paragraphs currently open.
+	int level = 1;
+	for (std::vector<TOCItem>::const_iterator it = myTOC.begin(); it != myTOC.end(); ++it) {
+		int index = myModelReader.model().label(it->Reference).ParagraphNumber;
+		if (index != -1) {
+			for (; level > it->Level; --level) {
+				myModelReader.endContentsParagraph();
+			}
+			++level;
+			myModelReader.beginContentsParagraph(index);
+			myModelReader.addContentsData(it->Text);
+		}
+	}
+	for (; level > 1; --level) {
+		myModelReader.endContentsParagraph();
+	}
+
+	// HTML resources outside the spine become footnote text models.
+	for (std::set<std::string>::const_iterator it = myHtmlFileIDs.begin(); it != myHtmlFileIDs.end(); ++it) {
+		myModelReader.setFootnoteTextModel(*it);
+		myModelReader.pushKind(REGULAR);
+		XHTMLReader(myModelReader).readFile(ZLFile(myFilePrefix + myResources[*it]), *it);
+	}
+
+	for (std::map<std::string,shared_ptr<ZLMimeType> >::const_iterator it = myImageIDs.begin(); it != myImageIDs.end(); ++it) {
+		myModelReader.addImage(it->first, new ZLFileImage(ZLFile(myFilePrefix + myResources[it->first], it->second), 0));
+	}
+
+	return true;
+}
diff --git a/reader/src/formats/openreader/ORBookReader.h b/reader/src/formats/openreader/ORBookReader.h
new file mode 100644
index 0000000..160c9f1
--- /dev/null
+++ b/reader/src/formats/openreader/ORBookReader.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (C) 2004-2012 Geometer Plus <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either
version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+
+#ifndef __ORBOOKREADER_H__
+#define __ORBOOKREADER_H__
+
+#include <map>
+#include <set>
+#include <vector>
+#include <string>
+
+#include <ZLXMLReader.h>
+
+#include "../../bookmodel/BookReader.h"
+
+// XML reader for OpenReader package files that builds a BookModel from the
+// resources, spine and navigation data it finds.
+class ORBookReader : public ZLXMLReader {
+
+public:
+	ORBookReader(BookModel &model);
+	// Parses the model's book file; returns false if it cannot be read.
+	bool readBook();
+
+	void startElementHandler(const char *tag, const char **attributes);
+	void endElementHandler(const char *tag);
+	void characterDataHandler(const char *text, std::size_t len);
+
+private:
+	// Position of the parser within the package file.
+	enum ReaderState {
+		READ_NONE,
+		READ_RESOURCES,
+		READ_USERSET,
+		READ_NAVIGATION,
+		READ_POINTER,
+		READ_TOCTITLE
+	};
+
+	BookReader myModelReader;
+	ReaderState myState;
+
+	// Prefix prepended to resource names to form file paths.
+	std::string myFilePrefix;
+	// resid -> resource name.
+	std::map<std::string,std::string> myResources;
+	std::string myCoverReference;
+	// resids of HTML resources; spine entries are removed during readBook().
+	std::set<std::string> myHtmlFileIDs;
+	// resid -> media type for image resources.
+	std::map<std::string,shared_ptr<ZLMimeType> > myImageIDs;
+	// Spine resids in reading order.
+	std::vector<std::string> myHtmlFilesOrder;
+
+	// One table-of-contents entry.
+	struct TOCItem {
+		TOCItem(const std::string &reference, const std::string &text, int level) : Reference(reference), Text(text), Level(level) {
+		}
+
+		std::string Reference;
+		std::string Text;
+		int Level;
+	};
+	std::vector<TOCItem> myTOC;
+
+	// Fields of the TOC entry currently being parsed.
+	std::string myTOCReference;
+	int myTOCLevel;
+	std::string myTOCTitle;
+};
+
+#endif /* __ORBOOKREADER_H__ */
diff --git a/reader/src/formats/openreader/ORDescriptionReader.cpp b/reader/src/formats/openreader/ORDescriptionReader.cpp
new file mode 100644
index
0000000..8c80dfa --- /dev/null +++ b/reader/src/formats/openreader/ORDescriptionReader.cpp @@ -0,0 +1,88 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#include <ZLUnicodeUtil.h> + +#include "ORDescriptionReader.h" + +#include "../util/EntityFilesCollector.h" +#include "../../library/Book.h" + +ORDescriptionReader::ORDescriptionReader(Book &book) : myBook(book) { + myBook.removeAllAuthors(); + myBook.setTitle(""); +} + +// TODO: replace "dc" by real DC scheme name +static const std::string METADATA = "metadata"; +static const std::string TITLE = "dc:title"; +static const std::string AUTHOR_TAG = "dc:creator"; +static const std::string AUTHOR_ROLE = "aut"; + +void ORDescriptionReader::characterDataHandler(const char *text, std::size_t len) { + switch (myReadState) { + case READ_NONE: + break; + case READ_AUTHOR: + myCurrentAuthor.append(text, len); + break; + case READ_TITLE: + myBook.setTitle(myBook.title() + std::string(text, len)); + break; + } +} + +void ORDescriptionReader::startElementHandler(const char *tag, const char **attributes) { + const std::string tagString = ZLUnicodeUtil::toLower(tag); + if (METADATA == tagString) { + myReadMetaData = true; + } else if (myReadMetaData) { + if (TITLE == tagString) 
{ + myReadState = READ_TITLE; + } else if (AUTHOR_TAG == tagString) { + const char *role = attributeValue(attributes, "role"); + if ((role != 0) && (AUTHOR_ROLE == role)) { + myReadState = READ_AUTHOR; + } + } + } +} + +void ORDescriptionReader::endElementHandler(const char *tag) { + const std::string tagString = ZLUnicodeUtil::toLower(tag); + if (METADATA == tagString) { + interrupt(); + } else { + if (!myCurrentAuthor.empty()) { + myBook.addAuthor(myCurrentAuthor); + myCurrentAuthor.erase(); + } + myReadState = READ_NONE; + } +} + +bool ORDescriptionReader::readMetaInfo() { + myReadMetaData = false; + myReadState = READ_NONE; + return readDocument(myBook.file()); +} + +const std::vector<std::string> &ORDescriptionReader::externalDTDs() const { + return EntityFilesCollector::Instance().externalDTDs("xhtml"); +} diff --git a/reader/src/formats/openreader/ORDescriptionReader.h b/reader/src/formats/openreader/ORDescriptionReader.h new file mode 100644 index 0000000..a4f6b2a --- /dev/null +++ b/reader/src/formats/openreader/ORDescriptionReader.h @@ -0,0 +1,53 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ */
+
+#ifndef __ORDESCRIPTIONREADER_H__
+#define __ORDESCRIPTIONREADER_H__
+
+#include <ZLXMLReader.h>
+
+class Book;
+
+// XML reader that extracts the title and the authors (creators with role
+// "aut") from the <metadata> section of an OpenReader book file.
+class ORDescriptionReader : public ZLXMLReader {
+
+public:
+	// Keeps a reference to `book`; the book must outlive the reader.
+	ORDescriptionReader(Book &book);
+	// Parses the book's own file; returns the result of readDocument().
+	bool readMetaInfo();
+
+private:
+	void startElementHandler(const char *tag, const char **attributes);
+	void endElementHandler(const char *tag);
+	void characterDataHandler(const char *text, std::size_t len);
+
+	const std::vector<std::string> &externalDTDs() const;
+
+private:
+	Book &myBook;
+
+	// Set once the <metadata> start tag has been seen.
+	bool myReadMetaData;
+	// Which element's text is currently being collected.
+	enum {
+		READ_NONE,
+		READ_AUTHOR,
+		READ_TITLE
+	} myReadState;
+
+	// Author name accumulated for the element being read.
+	std::string myCurrentAuthor;
+};
+
+#endif /* __ORDESCRIPTIONREADER_H__ */
diff --git a/reader/src/formats/openreader/OpenReaderPlugin.cpp b/reader/src/formats/openreader/OpenReaderPlugin.cpp
new file mode 100644
index 0000000..545f83b
--- /dev/null
+++ b/reader/src/formats/openreader/OpenReaderPlugin.cpp
@@ -0,0 +1,52 @@
+/*
+ * Copyright (C) 2004-2012 Geometer Plus <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */ + +#include <ZLFile.h> +#include <ZLStringUtil.h> +#include <ZLDir.h> + +#include "OpenReaderPlugin.h" +#include "ORDescriptionReader.h" +#include "ORBookReader.h" + +#include "../../library/Book.h" + +OpenReaderPlugin::~OpenReaderPlugin() { +} + +bool OpenReaderPlugin::providesMetaInfo() const { + return true; +} + +bool OpenReaderPlugin::acceptsFile(const ZLFile &file) const { + return file.extension() == "orb"; +} + +bool OpenReaderPlugin::readMetaInfo(Book &book) const { + return ORDescriptionReader(book).readMetaInfo(); +} + +bool OpenReaderPlugin::readLanguageAndEncoding(Book &book) const { + (void)book; + return true; +} + +bool OpenReaderPlugin::readModel(BookModel &model) const { + return ORBookReader(model).readBook(); +} diff --git a/reader/src/formats/openreader/OpenReaderPlugin.h b/reader/src/formats/openreader/OpenReaderPlugin.h new file mode 100644 index 0000000..fcfaa11 --- /dev/null +++ b/reader/src/formats/openreader/OpenReaderPlugin.h @@ -0,0 +1,36 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ */ + +#ifndef __OPENREADERPLUGIN_H__ +#define __OPENREADERPLUGIN_H__ + +#include "../FormatPlugin.h" + +class OpenReaderPlugin : public FormatPlugin { + +public: + ~OpenReaderPlugin(); + bool providesMetaInfo() const; + bool acceptsFile(const ZLFile &file) const; + bool readMetaInfo(Book &book) const; + bool readLanguageAndEncoding(Book &book) const; + bool readModel(BookModel &model) const; +}; + +#endif /* __OPENREADERPLUGIN_H__ */ diff --git a/reader/src/formats/pdb/BitReader.cpp b/reader/src/formats/pdb/BitReader.cpp new file mode 100644 index 0000000..551aaf3 --- /dev/null +++ b/reader/src/formats/pdb/BitReader.cpp @@ -0,0 +1,57 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
// reader/src/formats/pdb/BitReader.h

#ifndef __BITREADER_H__
#define __BITREADER_H__

#include <cstddef>

/*
 * Most-significant-bit-first cursor over a private copy of a byte buffer.
 *
 * The copy is followed by four zero bytes, so that peeking up to 32 bits at
 * any position inside the data never leaves the allocation. Bits requested
 * beyond that padding read as zero.
 */
class BitReader {

public:
	// Copies `size` bytes from `data`; `data` may be null when `size` is 0.
	BitReader(const unsigned char* data, std::size_t size);
	// Deep copies: every BitReader owns its own buffer.
	BitReader(const BitReader &other);
	BitReader &operator = (const BitReader &other);
	~BitReader();

	// Returns the next `n` bits (n <= 32) without consuming them, right
	// aligned in the result. Returns 0 when n > 32.
	unsigned long long peek(std::size_t n);
	// Consumes `n` bits. Returns false once the cursor has moved past the
	// end of the data (the cursor is still advanced in that case).
	bool eat(std::size_t n);
	// Number of unread data bits; 0 when the cursor is at or past the end.
	std::size_t left() const;

private:
	unsigned char* myData;  // owned; myLength / 8 data bytes + 4 zero bytes
	std::size_t myOffset;   // cursor position, in bits
	std::size_t myLength;   // data length, in bits
};

#endif //__BITREADER_H__

// reader/src/formats/pdb/BitReader.cpp
// (in the source tree this part sees the declaration above through BitReader.h)

#include <cstring>
#include <string>

// Zero bytes appended after the data so that a 32-bit peek at the last data
// bit stays inside the buffer.
static const std::size_t BIT_READER_PADDING = 4;

BitReader::BitReader(const unsigned char* data, std::size_t size) : myData(0), myOffset(0), myLength(size * 8) {
	myData = new unsigned char[size + BIT_READER_PADDING];
	if (size > 0) {
		// memcpy with a null source is undefined even for a zero length
		std::memcpy(myData, data, size);
	}
	std::memset(myData + size, 0x00, BIT_READER_PADDING);
}

BitReader::BitReader(const BitReader &other) : myData(0), myOffset(other.myOffset), myLength(other.myLength) {
	const std::size_t bytes = other.myLength / 8 + BIT_READER_PADDING;
	myData = new unsigned char[bytes];
	std::memcpy(myData, other.myData, bytes);
}

BitReader &BitReader::operator = (const BitReader &other) {
	if (this != &other) {
		const std::size_t bytes = other.myLength / 8 + BIT_READER_PADDING;
		// allocate first, so that *this stays intact if the allocation throws
		unsigned char *copy = new unsigned char[bytes];
		std::memcpy(copy, other.myData, bytes);
		delete[] myData;
		myData = copy;
		myOffset = other.myOffset;
		myLength = other.myLength;
	}
	return *this;
}

BitReader::~BitReader() {
	delete[] myData;
}

unsigned long long BitReader::peek(std::size_t n) {
	if (n > 32) {
		return 0;
	}
	const std::size_t bufferSize = myLength / 8 + BIT_READER_PADDING;
	unsigned long long r = 0;
	std::size_t g = 0; // number of bits gathered so far, counted from myOffset
	while (g < n) {
		const std::size_t bitPosition = myOffset + g;
		const std::size_t byteIndex = bitPosition >> 3;
		// eat() may have moved the cursor past the end; never read outside
		// the allocation, treat everything beyond it as zero bits
		const unsigned char byte = (byteIndex < bufferSize) ? myData[byteIndex] : 0;
		r = (r << 8) | byte;
		// advance to the next byte boundary
		g = g + 8 - (bitPosition & 7);
	}
	unsigned long long mask = 1;
	mask = (mask << n) - 1;
	// drop the (g - n) surplus low bits, then the bits that precede the cursor
	return (r >> (g - n)) & mask;
}

bool BitReader::eat(std::size_t n) {
	myOffset += n;
	return myOffset <= myLength;
}

std::size_t BitReader::left() const {
	// the cursor can be past the end after a failed eat(); do not wrap around
	return (myOffset < myLength) ? (myLength - myOffset) : 0;
}
+ */ + +#include <cstring> + +#include <ZLInputStream.h> + +#include "DocDecompressor.h" + +static unsigned char TOKEN_CODE[256] = { + 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, +}; + +std::size_t DocDecompressor::decompress(ZLInputStream &stream, char *targetBuffer, std::size_t compressedSize, std::size_t maxUncompressedSize) { + const unsigned char *sourceBuffer = new unsigned char[compressedSize]; + const unsigned char *sourceBufferEnd = sourceBuffer + compressedSize; + const unsigned char *sourcePtr = sourceBuffer; + + unsigned char *targetBufferEnd = (unsigned char*)targetBuffer + maxUncompressedSize; + unsigned char *targetPtr = (unsigned char*)targetBuffer; + + if (stream.read((char*)sourceBuffer, compressedSize) == compressedSize) { + unsigned char token; + unsigned short copyLength, N, shift; + unsigned char *shifted; + + while ((sourcePtr < sourceBufferEnd) && (targetPtr < targetBufferEnd)) { + token = *(sourcePtr++); + switch (TOKEN_CODE[token]) { + case 0: + *(targetPtr++) = token; + break; + case 1: + if ((sourcePtr + token > sourceBufferEnd) || (targetPtr + token > targetBufferEnd)) { + goto endOfLoop; + } + std::memcpy(targetPtr, sourcePtr, token); + sourcePtr += token; + targetPtr += token; + break; + case 2: 
+ if (targetPtr + 2 > targetBufferEnd) { + goto endOfLoop; + } + *(targetPtr++) = ' '; + *(targetPtr++) = token ^ 0x80; + break; + case 3: + if (sourcePtr + 1 > sourceBufferEnd) { + goto endOfLoop; + } + N = 256 * token + *(sourcePtr++); + copyLength = (N & 7) + 3; + if (targetPtr + copyLength > targetBufferEnd) { + goto endOfLoop; + } + shift = (N & 0x3fff) / 8; + shifted = targetPtr - shift; + if ((char*)shifted >= targetBuffer) { + for (short i = 0; i < copyLength; i++) { + *(targetPtr++) = *(shifted++); + } + } + break; + } + } + } +endOfLoop: + + delete[] sourceBuffer; + return targetPtr - (unsigned char*)targetBuffer; +} diff --git a/reader/src/formats/pdb/DocDecompressor.h b/reader/src/formats/pdb/DocDecompressor.h new file mode 100644 index 0000000..820bb0a --- /dev/null +++ b/reader/src/formats/pdb/DocDecompressor.h @@ -0,0 +1,36 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
// reader/src/formats/pdb/DocDecompressor.h

#ifndef __DOCDECOMPRESSOR_H__
#define __DOCDECOMPRESSOR_H__

#include <string>

class ZLInputStream;

/*
 * Stateless decoder for the token-based record compression used by
 * PDB-based formats (selected by compression version 2 in EReaderStream).
 */
class DocDecompressor {

public:
	DocDecompressor() {}
	~DocDecompressor() {}

	// Reads `compressedSize` bytes from `stream` and writes the decoded data
	// to `buffer`, never more than `maxUncompressedSize` bytes.
	// Returns the number of bytes written to `buffer`.
	std::size_t decompress(ZLInputStream &stream, char *buffer, std::size_t compressedSize, std::size_t maxUncompressedSize);
};

#endif /* __DOCDECOMPRESSOR_H__ */
+ */ + +#include <ZLFile.h> +#include <ZLInputStream.h> +#include <ZLEncodingConverter.h> +#include <ZLStringUtil.h> +#include <ZLLanguageUtil.h> +#include <ZLFileImage.h> + +#include "PdbPlugin.h" +#include "EReaderStream.h" +#include "PmlBookReader.h" + +#include "../../library/Book.h" + +bool EReaderPlugin::providesMetaInfo() const { + return true; +} + +bool EReaderPlugin::acceptsFile(const ZLFile &file) const { + return PdbPlugin::fileType(file) == "PNRdPPrs"; +} + +void EReaderPlugin::readDocumentInternal(const ZLFile &file, BookModel &model, const PlainTextFormat &format, const std::string &encoding, ZLInputStream &stream) const { + if (!stream.open()) { + //TODO maybe anything else opens stream + return; + } + BookReader bookReader(model); + PmlBookReader pmlBookReader(bookReader, format, encoding); + bookReader.setMainTextModel(); + pmlBookReader.readDocument(stream); + EReaderStream &estream = (EReaderStream&)stream; + const std::map<std::string, EReaderStream::ImageInfo>& imageIds = estream.images(); + for(std::map<std::string, EReaderStream::ImageInfo>::const_iterator it = imageIds.begin(); it != imageIds.end(); ++it) { + const std::string id = it->first; + bookReader.addImage(id, new ZLFileImage(ZLFile(file.path(), it->second.Type), it->second.Offset, it->second.Size)); + } + const std::map<std::string, unsigned short>& footnoteIds = estream.footnotes(); + for(std::map<std::string, unsigned short>::const_iterator it = footnoteIds.begin(); it != footnoteIds.end(); ++it) { + const std::string id = it->first; + if (estream.switchStreamDestination(EReaderStream::FOOTNOTE, id)) { + bookReader.setFootnoteTextModel(id); + bookReader.addHyperlinkLabel(id); + pmlBookReader.readDocument(estream); + } + } + stream.close(); +} + +shared_ptr<ZLInputStream> EReaderPlugin::createStream(const ZLFile &file) const { + return new EReaderStream(file); +} + +const std::string &EReaderPlugin::tryOpen(const ZLFile &file) const { + EReaderStream stream(file); + stream.open(); 
+ return stream.error(); +} + +bool EReaderPlugin::readMetaInfo(Book &book) const { + shared_ptr<ZLInputStream> stream = book.file().inputStream(); + if (stream.isNull() || ! stream->open()) { + return false; + } + PdbHeader header; + if (!header.read(stream)) { + return false; + } + stream->seek(header.Offsets[0] + 46, true); + unsigned short metaInfoOffset; + PdbUtil::readUnsignedShort(*stream, metaInfoOffset); + if (metaInfoOffset == 0 || metaInfoOffset >= header.Offsets.size()) { + return false; + } + std::size_t currentOffset = header.Offsets[metaInfoOffset]; + std::size_t nextOffset = + (metaInfoOffset + 1 < (unsigned short)header.Offsets.size()) ? + header.Offsets[metaInfoOffset + 1] : stream->sizeOfOpened(); + if (nextOffset <= currentOffset) { + return false; + } + std::size_t length = nextOffset - currentOffset; + + char* metaInfoBuffer = new char[length]; + stream->seek(currentOffset, true); + stream->read(metaInfoBuffer, length); + std::string metaInfoStr(metaInfoBuffer, length); + delete[] metaInfoBuffer; + + std::string metaInfoData[5]; // Title; Author; Rights; Publisher; isbn; + for (std::size_t i = 0; i < 5; ++i) { + const std::size_t index = metaInfoStr.find('\0'); + metaInfoData[i] = metaInfoStr.substr(0,index); + metaInfoStr = metaInfoStr.substr(index + 1); + } + + if (!metaInfoData[0].empty()) { + book.setTitle(metaInfoData[0]); + } + + if (!metaInfoData[1].empty()) { + book.addAuthor(metaInfoData[1]); + } + + stream->close(); + return SimplePdbPlugin::readMetaInfo(book); +} diff --git a/reader/src/formats/pdb/EReaderStream.cpp b/reader/src/formats/pdb/EReaderStream.cpp new file mode 100644 index 0000000..9775773 --- /dev/null +++ b/reader/src/formats/pdb/EReaderStream.cpp @@ -0,0 +1,289 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either 
version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#include <algorithm> +#include <cctype> + +#include <ZLFile.h> +#include <ZLResource.h> +#include <ZLZDecompressor.h> + +#include "EReaderStream.h" +#include "DocDecompressor.h" + + +EReaderStream::EReaderStream(const ZLFile &file) : PalmDocLikeStream(file) { + myDestination = TEXT; +} + +EReaderStream::~EReaderStream() { + close(); +} + +bool EReaderStream::switchStreamDestination(StreamDestination destination, const std::string& id) { + bool result = true; + switch(destination) { + case TEXT: + myDestination = TEXT; + myRecordIndex = 1; + break; + case FOOTNOTE: + std::map<std::string, unsigned short>::const_iterator footnoteIt = myFootnotes.find(id); + if (footnoteIt != myFootnotes.end()) { + myDestination = FOOTNOTE; + myRecordIndex = footnoteIt->second; + } else { + result = false; + } + break; + } + return result; +} + +bool EReaderStream::fillBuffer() { + if (myDestination == TEXT) { + return PalmDocLikeStream::fillBuffer(); + } else { + while (myBufferOffset == myBufferLength) { + if (!processRecord()) { + return false; + } + } + return true; + } +} + +bool EReaderStream::processRecord() { + const std::size_t currentOffset = recordOffset(myRecordIndex); + if (currentOffset < myBase->offset()) { + return false; + } + myBase->seek(currentOffset, true); + const std::size_t nextOffset = recordOffset(myRecordIndex + 1); + if (nextOffset < currentOffset) { + return false; + } + + unsigned short myCompressedSize = 
nextOffset - currentOffset; + + switch (myCompressionVersion) { + case 10: // Inflate compression + myBase->seek(2, false); + myBufferLength = ZLZDecompressor(myCompressedSize - 2).decompress(*myBase, myBuffer, myMaxRecordSize); + break; + case 2: // PalmDoc compression + myBufferLength = DocDecompressor().decompress(*myBase, myBuffer, myCompressedSize, myMaxRecordSize); + break; + } + clearBuffer('\0'); + myBufferOffset = 0; + return true; +} + +bool EReaderStream::processZeroRecord() { + // Use it with offset presetting to zero record offset value + PdbUtil::readUnsignedShort(*myBase, myCompressionVersion); // myBase offset: ^ + 2 + if (myCompressionVersion > 255) { + myErrorCode = ERROR_ENCRYPTION; + return false; + } else { + switch (myCompressionVersion) { + case 2: + case 10: + break; + default: + myErrorCode = ERROR_COMPRESSION; + return false; + } + } + myBase->seek(10, false); // myBase offset: ^ + 12 + PdbUtil::readUnsignedShort(*myBase, myNonTextOffset); // myBase offset: ^ + 14 + PdbUtil::readUnsignedShort(*myBase, myNonTextOffsetReserved); // myBase offset: ^ + 16 + myBase->seek(12, false); // myBase offset: ^ + 28 + PdbUtil::readUnsignedShort(*myBase, myFootnoteRecords); // myBase offset: ^ + 30 + PdbUtil::readUnsignedShort(*myBase, mySidebarRecords); // myBase offset: ^ + 32 + PdbUtil::readUnsignedShort(*myBase, myBookmarksOffset); // myBase offset: ^ + 34 + myBase->seek(2, false); // myBase offset: ^ + 36 + PdbUtil::readUnsignedShort(*myBase, myNonTextOffsetExtraReserved); // myBase offset: ^ + 38 + myBase->seek(2, false); // myBase offset: ^ + 40 + PdbUtil::readUnsignedShort(*myBase, myImagedataOffset); // myBase offset: ^ + 42 + PdbUtil::readUnsignedShort(*myBase, myImagedataOffsetReserved); // myBase offset: ^ + 44 + PdbUtil::readUnsignedShort(*myBase, myMetadataOffset); // myBase offset: ^ + 46 + PdbUtil::readUnsignedShort(*myBase, myMetadataOffsetReserved); // myBase offset: ^ + 48 + PdbUtil::readUnsignedShort(*myBase, myFootnoteOffset); // 
myBase offset: ^ + 50 + PdbUtil::readUnsignedShort(*myBase, mySidebarOffset); // myBase offset: ^ + 52 + PdbUtil::readUnsignedShort(*myBase, myLastdataOffset); // myBase offset: ^ + 54 + + unsigned short endSectionIndex = header().Offsets.size(); + myMaxRecordIndex = std::min((unsigned short) (myNonTextOffset - 1), (unsigned short) (endSectionIndex - 1)); + + myMaxRecordSize = 65535; // Maximum size of addressable space in PalmOS + // not more than 8192 bytes happens in the tested examples + + if (myFootnoteRecords) { + bool isSuccess = processFootnoteIdsRecord(); + if (!isSuccess) { + //TODO take in account returned bool value + //false if wrong footnotes amount anounced in zero record + //or corrupted or wrong footnote ids record + } + } + + if (myImagedataOffset != myMetadataOffset) { + bool isSuccess = processImageHeaders(); + if (!isSuccess) { + //TODO take in account returned bool value + //false if one of image record is corrupted + } + } + + myBase->seek(header().Offsets[1], true); + + /* + std::cerr << "EReaderStream::processZeroRecord():\n"; + std::cerr << "PDB header indentificator : " << header().Id << "\n"; + std::cerr << "PDB file system: sizeof opened : " << myBaseSize << "\n"; + std::cerr << "PDB header/record[0] max index : " << myMaxRecordIndex << "\n"; + std::cerr << "PDB record[0][0..2] compression : " << myCompressionVersion << "\n"; + std::cerr << "EReader record[0] myNonTextOffset : " << myNonTextOffset << std::endl; + std::cerr << "EReader record[0] myNonTextOffset2 : " << myNonTextOffsetReserved << std::endl; + std::cerr << "EReader record[0] myFootnoteRecords : " << myFootnoteRecords << std::endl; + std::cerr << "EReader record[0] mySidebarRecords : " << mySidebarRecords << std::endl; + std::cerr << "EReader record[0] myBookmarksOffset : " << myBookmarksOffset << std::endl; + std::cerr << "EReader record[0] myNonTextOffset3 : " << myNonTextOffsetExtraReserved << std::endl; + std::cerr << "EReader record[0] myImagedataOffset : " << 
myImagedataOffset << std::endl; + std::cerr << "EReader record[0] myImagedataOffset2 : " << myImagedataOffsetReserved << std::endl; + std::cerr << "EReader record[0] myMetadataOffset : " << myMetadataOffset << std::endl; + std::cerr << "EReader record[0] myMetadataOffset2 : " << myMetadataOffsetReserved << std::endl; + std::cerr << "EReader record[0] myFootnoteOffset : " << myFootnoteOffset << std::endl; + std::cerr << "EReader record[0] mySidebarOffset : " << mySidebarOffset << std::endl; + std::cerr << "EReader record[0] myLastdataOffset : " << myLastdataOffset << std::endl; + std::cerr << "PDB header lastSectionIndex : " << endSectionIndex - 1 << "\n"; + */ + return true; +} + +void EReaderStream::clearBuffer(unsigned char symbol) { + myBufferLength = std::remove(myBuffer, myBuffer + myBufferLength, symbol) - myBuffer; +} + +bool EReaderStream::processFootnoteIdsRecord() { + char* footnoteIdBuffer = new char[myMaxRecordSize]; + myBase->seek(header().Offsets[myFootnoteOffset], true); + const std::size_t currentOffset = recordOffset(myFootnoteOffset); + const std::size_t nextOffset = recordOffset(myFootnoteOffset + 1); + const std::size_t length = nextOffset - currentOffset; + myBase->read(footnoteIdBuffer, length); + std::string footnoteIdStr(footnoteIdBuffer, length); + unsigned short footnoteIndex = myFootnoteOffset + 1; + while (!footnoteIdStr.empty() && (footnoteIndex < myLastdataOffset)) { + std::string id = findFootnoteId(footnoteIdStr); + if (!id.empty()) { + myFootnotes[id] = footnoteIndex; + ++footnoteIndex; + } + } + delete[] footnoteIdBuffer; + return (myFootnoteRecords - 1 == (unsigned short)myFootnotes.size()); +} + +std::string EReaderStream::findFootnoteId(std::string &footnoteIdStr) const { + std::string resultStr; + if (!footnoteIdStr.empty()) { + std::size_t counter = 0; + for (; counter < footnoteIdStr.length(); ++counter) { + if (std::isalnum(footnoteIdStr[counter])) { + break; + } + } + const std::size_t startIdIndex = counter; + for (; 
counter < footnoteIdStr.length(); ++counter) { + if (footnoteIdStr[counter] == '\0') { + break; + } + } + const std::size_t endIdIndex = counter; + resultStr = footnoteIdStr.substr(startIdIndex, endIdIndex - startIdIndex); + footnoteIdStr = footnoteIdStr.substr(endIdIndex); + } + return resultStr; +} + +const std::map<std::string, unsigned short>& EReaderStream::footnotes() const { + return myFootnotes; +} + +bool EReaderStream::processImageHeaders() { + unsigned short recordIndex = myImagedataOffset; + bool result = true; + myBase->seek(header().Offsets[recordIndex], true); + while (recordIndex < myMetadataOffset && recordIndex < myLastdataOffset) { + result = result && addImageInfo(recordIndex); + ++recordIndex; + } + return result; +} + +bool EReaderStream::addImageInfo(const unsigned short recordIndex) { + const std::size_t bufferLength = 128; + char *buffer = new char[bufferLength]; //TODO may be it's needed here more bytes + ImageInfo image; + const std::size_t currentOffset = recordOffset(recordIndex); + const std::size_t nextOffset = recordOffset(recordIndex + 1); + + myBase->read(buffer, bufferLength); + std::string header(buffer, bufferLength); + delete[] buffer; + + image.Offset = currentOffset + header.find("\x89PNG"); //TODO treat situation when there isn't PNG in first 128 bytes + image.Size = nextOffset - image.Offset; + const int endType = header.find(' '); + image.Type = ZLMimeType::get(header.substr(0, endType)); + header = header.substr(endType + 1); + const int endId = header.find('\0'); + const std::string id = header.substr(0, endId); + myBase->seek(nextOffset - currentOffset - bufferLength, false); + if (id.empty()) { + return false; + } + myImages[id] = image; + return true; +} + + +/*bool EReaderStream::hasExtraSections() const { + return false; + //return myMaxRecordIndex < header().Offsets.size() - 1; +}*/ + +EReaderStream::ImageInfo EReaderStream::imageLocation(const std::string& id) { + if (myImagedataOffset != myMetadataOffset && 
myImages.empty()) { + processImageHeaders(); + } + const std::map<std::string, ImageInfo>::const_iterator it = myImages.find(id); + if (it != myImages.end()) { + return it->second; + } else { + return ImageInfo(); + } +} + +const std::map<std::string, EReaderStream::ImageInfo>& EReaderStream::images() const { + return myImages; +} diff --git a/reader/src/formats/pdb/EReaderStream.h b/reader/src/formats/pdb/EReaderStream.h new file mode 100644 index 0000000..990c6ba --- /dev/null +++ b/reader/src/formats/pdb/EReaderStream.h @@ -0,0 +1,88 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ */ + +#ifndef __EREADERSTREAM_H__ +#define __EREADERSTREAM_H__ + +#include <map> + +#include "PalmDocLikeStream.h" +#include <ZLMimeType.h> + +class ZLFile; + +class EReaderStream : public PalmDocLikeStream { + +public: + EReaderStream(const ZLFile &file); + ~EReaderStream(); + + enum StreamDestination { + TEXT, + FOOTNOTE, + }; + + struct ImageInfo { + unsigned long Offset; + unsigned short Size; + shared_ptr<ZLMimeType> Type; + }; + + ImageInfo imageLocation(const std::string& id); + //bool hasExtraSections() const; + bool switchStreamDestination(StreamDestination destination, const std::string &footnoteId); + const std::map<std::string, unsigned short>& footnotes() const; + const std::map<std::string, ImageInfo>& images() const; + +private: + bool processRecord(); + bool processZeroRecord(); + bool processFootnoteIdsRecord(); + bool processImageHeaders(); + + void clearBuffer(unsigned char symbol); + std::string findFootnoteId(std::string &footnoteIdStr) const; + bool addImageInfo(const unsigned short recordIndex); + + bool fillBuffer(); + +private: + unsigned short myCompressionVersion; + unsigned short myNonTextOffset; + unsigned short myNonTextOffsetReserved; //TODO: Warning: isn't used + unsigned short myFootnoteRecords; + unsigned short mySidebarRecords; + unsigned short myBookmarksOffset; + unsigned short myNonTextOffsetExtraReserved; //TODO: Warning: isn't used + unsigned short myImagedataOffset; + unsigned short myImagedataOffsetReserved; //TODO: Warning: isn't used + unsigned short myMetadataOffset; + unsigned short myMetadataOffsetReserved; //TODO: Warning: isn't used + unsigned short myFootnoteOffset; + unsigned short mySidebarOffset; + unsigned short myLastdataOffset; + + + StreamDestination myDestination; + std::map<std::string, unsigned short> myFootnotes; + std::map<std::string, ImageInfo> myImages; + +}; + +#endif /* __EREADERSTREAM_H__ */ diff --git a/reader/src/formats/pdb/HtmlMetainfoReader.cpp b/reader/src/formats/pdb/HtmlMetainfoReader.cpp 
new file mode 100644 index 0000000..8829591 --- /dev/null +++ b/reader/src/formats/pdb/HtmlMetainfoReader.cpp @@ -0,0 +1,89 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#include <ZLUnicodeUtil.h> + +#include "HtmlMetainfoReader.h" + +#include "../../library/Book.h" + +HtmlMetainfoReader::HtmlMetainfoReader(Book &book, ReadType readType) : + HtmlReader(book.encoding()), myBook(book), myReadType(readType) { +} + +bool HtmlMetainfoReader::tagHandler(const HtmlReader::HtmlTag &tag) { + if (tag.Name == "BODY") { + return false; + } else if (((myReadType & TAGS) == TAGS) && (tag.Name == "DC:SUBJECT")) { + myReadTags = tag.Start; + if (!tag.Start && !myBuffer.empty()) { + myBook.addTag(myBuffer); + myBuffer.erase(); + } + } else if (((myReadType & TITLE) == TITLE) && (tag.Name == "DC:TITLE")) { + myReadTitle = tag.Start; + if (!tag.Start && !myBuffer.empty()) { + myBook.setTitle(myBuffer); + myBuffer.erase(); + } + } else if (((myReadType & AUTHOR) == AUTHOR) && (tag.Name == "DC:CREATOR")) { + if (tag.Start) { + bool flag = false; + for (std::size_t i = 0; i < tag.Attributes.size(); ++i) { + if (tag.Attributes[i].Name == "ROLE") { + flag = ZLUnicodeUtil::toUpper(tag.Attributes[i].Value) == "AUT"; + break; + } + } + 
if (flag) { + if (!myBuffer.empty()) { + myBuffer += ", "; + } + myReadAuthor = true; + } + } else { + myReadAuthor = false; + if (!myBuffer.empty()) { + myBook.addAuthor(myBuffer); + } + myBuffer.erase(); + } + } + return true; +} + +void HtmlMetainfoReader::startDocumentHandler() { + myReadAuthor = false; + myReadTitle = false; + myReadTags = false; +} + +void HtmlMetainfoReader::endDocumentHandler() { +} + +bool HtmlMetainfoReader::characterDataHandler(const char *text, std::size_t len, bool convert) { + if (myReadTitle || myReadAuthor || myReadTags) { + if (convert) { + myConverter->convert(myBuffer, text, text + len); + } else { + myBuffer.append(text, len); + } + } + return true; +} diff --git a/reader/src/formats/pdb/HtmlMetainfoReader.h b/reader/src/formats/pdb/HtmlMetainfoReader.h new file mode 100644 index 0000000..119c72e --- /dev/null +++ b/reader/src/formats/pdb/HtmlMetainfoReader.h @@ -0,0 +1,60 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ */ + +#ifndef __HTMLMETAINFOREADER_H__ +#define __HTMLMETAINFOREADER_H__ + +#include "../html/HtmlReader.h" + +class Book; + +class HtmlMetainfoReader : public HtmlReader { + +public: + enum ReadType { + NONE = 0, + TITLE = 1, + AUTHOR = 2, + TITLE_AND_AUTHOR = TITLE | AUTHOR, + TAGS = 4, + ALL = TITLE | AUTHOR | TAGS + }; + +public: + HtmlMetainfoReader(Book &book, ReadType readType); + +private: + void startDocumentHandler(); + void endDocumentHandler(); + + bool tagHandler(const HtmlTag &tag); + bool characterDataHandler(const char *text, std::size_t len, bool convert); + +private: + Book &myBook; + const ReadType myReadType; + + bool myReadTitle; + bool myReadAuthor; + bool myReadTags; + + std::string myBuffer; +}; + +#endif /* __HTMLMETAINFOREADER_H__ */ diff --git a/reader/src/formats/pdb/HuffDecompressor.cpp b/reader/src/formats/pdb/HuffDecompressor.cpp new file mode 100644 index 0000000..9b6f285 --- /dev/null +++ b/reader/src/formats/pdb/HuffDecompressor.cpp @@ -0,0 +1,192 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ */ + +#include <cstring> + +#include <ZLInputStream.h> + +#include "PdbReader.h" +#include "BitReader.h" +#include "HuffDecompressor.h" + +HuffDecompressor::HuffDecompressor(ZLInputStream& stream, + const std::vector<unsigned long>::const_iterator beginIt, + const std::vector<unsigned long>::const_iterator endIt, + const unsigned long endHuffDataOffset, const unsigned long extraFlags) : myExtraFlags(extraFlags), myErrorCode(ERROR_NONE) { + + + const unsigned long huffHeaderOffset = *beginIt; + const unsigned long huffRecordsNumber = endIt - beginIt; + const unsigned long huffDataOffset = *(beginIt + 1); + + stream.seek(huffHeaderOffset, true); + stream.seek(16, false); + unsigned long cacheTableOffset, baseTableOffset; + PdbUtil::readUnsignedLongBE(stream, cacheTableOffset); + PdbUtil::readUnsignedLongBE(stream, baseTableOffset); + + + myCacheTable = new unsigned long[256]; + stream.seek(huffHeaderOffset + cacheTableOffset, true); + for (std::size_t i = 0; i < 256; ++i) { + PdbUtil::readUnsignedLongLE(stream, myCacheTable[i]); //LE + } + + myBaseTable = new unsigned long[64]; + stream.seek(huffHeaderOffset + baseTableOffset, true); + for (std::size_t i = 0; i < 64; ++i) { + PdbUtil::readUnsignedLongLE(stream, myBaseTable[i]); //LE + } + + stream.seek(huffDataOffset + 12, true); + PdbUtil::readUnsignedLongBE(stream, myEntryBits); + + std::size_t huffDataSize = endHuffDataOffset - huffDataOffset; + myData = new unsigned char[huffDataSize]; + stream.seek(huffDataOffset, true); + if (huffDataSize == stream.read((char*)myData, huffDataSize)) { + myDicts = new unsigned char* [huffRecordsNumber - 1]; + for(std::size_t i = 0; i < huffRecordsNumber - 1; ++i) { + std::size_t shift = *(beginIt + i + 1) - huffDataOffset; + myDicts[i] = myData + shift; + } + } else { + myErrorCode = ERROR_CORRUPTED_FILE; + } + + myTargetBuffer = 0; + myTargetBufferEnd = 0; + myTargetBufferPtr = 0; +} + +HuffDecompressor::~HuffDecompressor() { + delete[] myCacheTable; + delete[] myBaseTable; + 
delete[] myData; + delete[] myDicts; +} + +bool HuffDecompressor::error() const { + return myErrorCode == ERROR_CORRUPTED_FILE; +} + +std::size_t HuffDecompressor::decompress(ZLInputStream &stream, char *targetBuffer, std::size_t compressedSize, std::size_t maxUncompressedSize) { + if ((compressedSize == 0) || (myErrorCode == ERROR_CORRUPTED_FILE)) { + return 0; + } + if (targetBuffer != 0) { + unsigned char *sourceBuffer = new unsigned char[compressedSize]; + myTargetBuffer = targetBuffer; + myTargetBufferEnd = targetBuffer + maxUncompressedSize; + myTargetBufferPtr = targetBuffer; + if (stream.read((char*)sourceBuffer, compressedSize) == compressedSize) { + std::size_t trailSize = sizeOfTrailingEntries(sourceBuffer, compressedSize); + if (trailSize < compressedSize) { + bitsDecompress(BitReader(sourceBuffer, compressedSize - trailSize)); + } else { + myErrorCode = ERROR_CORRUPTED_FILE; + } + } + delete[] sourceBuffer; + } else { + myTargetBuffer = 0; + myTargetBufferEnd = 0; + myTargetBufferPtr = 0; + } + + return myTargetBufferPtr - myTargetBuffer; +} + +void HuffDecompressor::bitsDecompress(BitReader bits, std::size_t depth) { + if (depth > 32) { + myErrorCode = ERROR_CORRUPTED_FILE; + return; + } + + while (bits.left()) { + const unsigned long dw = (unsigned long)bits.peek(32); + const unsigned long v = myCacheTable[dw >> 24]; + unsigned long codelen = v & 0x1F; + //if ((codelen == 0) || (codelen > 32)) { + // return false; + //} + unsigned long code = dw >> (32 - codelen); + unsigned long r = (v >> 8); + if (!(v & 0x80)) { + while (code < myBaseTable[(codelen - 1) * 2]) { + codelen += 1; + code = dw >> (32 - codelen); + } + r = myBaseTable[(codelen - 1) * 2 + 1]; + } + r -= code; + //if (codelen == 0) { + // return false; + //} + if (!bits.eat(codelen)) { + return; + } + const unsigned long dicno = r >> myEntryBits; + const unsigned long off1 = 16 + (r - (dicno << myEntryBits)) * 2; + const unsigned char* dict = myDicts[dicno]; //TODO need index check + const 
unsigned long off2 = 16 + dict[off1] * 256 + dict[off1 + 1]; //TODO need index check + const unsigned long blen = dict[off2] * 256 + dict[off2 + 1]; //TODO need index check + const unsigned char* slice = dict + off2 + 2; + const unsigned long sliceSize = blen & 0x7fff; + if (blen & 0x8000) { + if (myTargetBufferPtr + sliceSize < myTargetBufferEnd) { + std::memcpy(myTargetBufferPtr, slice, sliceSize); + myTargetBufferPtr += sliceSize; + } else { + return; + } + } else { + bitsDecompress(BitReader(slice, sliceSize), depth + 1); + } + } +} + +std::size_t HuffDecompressor::sizeOfTrailingEntries(unsigned char* data, std::size_t size) const { + std::size_t num = 0; + std::size_t flags = myExtraFlags >> 1; + while (flags) { + if (flags & 1) { + if (num < size) { + num += readVariableWidthIntegerBE(data, size - num); + } + } + flags >>= 1; + } + return num; +} + + +std::size_t HuffDecompressor::readVariableWidthIntegerBE(unsigned char* ptr, std::size_t psize) const { + unsigned char bitsSaved = 0; + std::size_t result = 0; + while (true) { + const unsigned char oneByte = ptr[psize - 1]; + result |= (oneByte & 0x7F) << bitsSaved; + bitsSaved += 7; + psize -= 1; + if (((oneByte & 0x80) != 0) || (bitsSaved >= 28) || (psize == 0)) { + return result; + } + } +} diff --git a/reader/src/formats/pdb/HuffDecompressor.h b/reader/src/formats/pdb/HuffDecompressor.h new file mode 100644 index 0000000..76539e9 --- /dev/null +++ b/reader/src/formats/pdb/HuffDecompressor.h @@ -0,0 +1,63 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#ifndef __HUFFDECOMPRESSOR_H__ +#define __HUFFDECOMPRESSOR_H__ + +#include <string> + +class ZLInputStream; +class BitReader; + +class HuffDecompressor { + +public: + HuffDecompressor(ZLInputStream& stream, + const std::vector<unsigned long>::const_iterator beginHuffRecordOffsetIt, + const std::vector<unsigned long>::const_iterator endHuffRecordOffsetIt, + const unsigned long endHuffDataOffset, const unsigned long extraFlags); + ~HuffDecompressor(); + + std::size_t decompress(ZLInputStream &stream, char *buffer, std::size_t compressedSize, std::size_t maxUncompressedSize); + bool error() const; +private: + std::size_t sizeOfTrailingEntries(unsigned char* data, std::size_t size) const; + std::size_t readVariableWidthIntegerBE(unsigned char* ptr, std::size_t psize) const; + void bitsDecompress(BitReader bits, std::size_t depth = 0); + +private: + unsigned long myEntryBits; + unsigned long myExtraFlags; + + unsigned long* myCacheTable; + unsigned long* myBaseTable; + unsigned char* myData; + unsigned char** myDicts; + + char* myTargetBuffer; + char* myTargetBufferEnd; + char* myTargetBufferPtr; + + enum { + ERROR_NONE, + ERROR_CORRUPTED_FILE + } myErrorCode; +}; + +#endif /* __HUFFDECOMPRESSOR_H__ */ diff --git a/reader/src/formats/pdb/MobipocketHtmlBookReader.cpp b/reader/src/formats/pdb/MobipocketHtmlBookReader.cpp new file mode 100644 index 0000000..cecbfbc --- /dev/null +++ b/reader/src/formats/pdb/MobipocketHtmlBookReader.cpp @@ -0,0 +1,356 @@ +/* + * Copyright (C) 2004-2012 Geometer 
Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#include <cstdlib> +#include <algorithm> + +#include <ZLFile.h> +#include <ZLFileImage.h> +#include <ZLStringUtil.h> +#include <ZLUnicodeUtil.h> + +#include "MobipocketHtmlBookReader.h" +#include "PalmDocStream.h" +#include "../html/HtmlTagActions.h" +#include "../../bookmodel/BookModel.h" + +class MobipocketHtmlImageTagAction : public HtmlTagAction { + +public: + MobipocketHtmlImageTagAction(HtmlBookReader &reader); + void run(const HtmlReader::HtmlTag &tag); +}; + +class MobipocketHtmlHrTagAction : public HtmlTagAction { + +public: + MobipocketHtmlHrTagAction(HtmlBookReader &reader); + void run(const HtmlReader::HtmlTag &tag); +}; + +class MobipocketHtmlHrefTagAction : public HtmlHrefTagAction { + +public: + MobipocketHtmlHrefTagAction(HtmlBookReader &reader); + void run(const HtmlReader::HtmlTag &tag); +}; + +class MobipocketHtmlGuideTagAction : public HtmlTagAction { + +public: + MobipocketHtmlGuideTagAction(HtmlBookReader &reader); + void run(const HtmlReader::HtmlTag &tag); +}; + +class MobipocketHtmlReferenceTagAction : public HtmlTagAction { + +public: + MobipocketHtmlReferenceTagAction(HtmlBookReader &reader); + void run(const HtmlReader::HtmlTag &tag); +}; + +class MobipocketHtmlPagebreakTagAction : 
public HtmlTagAction { + +public: + MobipocketHtmlPagebreakTagAction(HtmlBookReader &reader); + void run(const HtmlReader::HtmlTag &tag); +}; + +MobipocketHtmlImageTagAction::MobipocketHtmlImageTagAction(HtmlBookReader &reader) : HtmlTagAction(reader) { +} + +void MobipocketHtmlImageTagAction::run(const HtmlReader::HtmlTag &tag) { + if (tag.Start) { + for (unsigned int i = 0; i < tag.Attributes.size(); ++i) { + if (tag.Attributes[i].Name == "RECINDEX") { + int index = std::atoi(tag.Attributes[i].Value.c_str()); + if (index > 0) { + int &imageCounter = ((MobipocketHtmlBookReader&)myReader).myImageCounter; + imageCounter = std::max(imageCounter, index); + bool stopParagraph = bookReader().paragraphIsOpen(); + if (stopParagraph) { + bookReader().endParagraph(); + } + std::string id; + ZLStringUtil::appendNumber(id, index); + bookReader().addImageReference(id); + if (stopParagraph) { + bookReader().beginParagraph(); + } + } + break; + } + } + } +} + +MobipocketHtmlHrTagAction::MobipocketHtmlHrTagAction(HtmlBookReader &reader) : HtmlTagAction(reader) { +} + +void MobipocketHtmlHrTagAction::run(const HtmlReader::HtmlTag &tag) { + if (tag.Start) { + if (bookReader().contentsParagraphIsOpen()) { + bookReader().endContentsParagraph(); + bookReader().exitTitle(); + } + bookReader().insertEndOfSectionParagraph(); + } +} + +MobipocketHtmlHrefTagAction::MobipocketHtmlHrefTagAction(HtmlBookReader &reader) : HtmlHrefTagAction(reader) { +} + +MobipocketHtmlPagebreakTagAction::MobipocketHtmlPagebreakTagAction(HtmlBookReader &reader) : HtmlTagAction(reader) { +} + +void MobipocketHtmlPagebreakTagAction::run(const HtmlReader::HtmlTag &tag) { + if (tag.Start) { + if (bookReader().contentsParagraphIsOpen()) { + bookReader().endContentsParagraph(); + bookReader().exitTitle(); + } + bookReader().insertEndOfSectionParagraph(); + } +} + +MobipocketHtmlBookReader::TOCReader::TOCReader(MobipocketHtmlBookReader &reader) : myReader(reader) { + reset(); +} + +void 
MobipocketHtmlBookReader::TOCReader::reset() { + myEntries.clear(); + + myIsActive = false; + myStartOffset = (std::size_t)-1; + myEndOffset = (std::size_t)-1; + myCurrentEntryText.erase(); +} + +bool MobipocketHtmlBookReader::TOCReader::rangeContainsPosition(std::size_t position) { + return (myStartOffset <= position) && (myEndOffset > position); +} + +void MobipocketHtmlBookReader::TOCReader::startReadEntry(std::size_t position) { + myCurrentReference = position; + myIsActive = true; +} + +void MobipocketHtmlBookReader::TOCReader::endReadEntry() { + if (myIsActive && !myCurrentEntryText.empty()) { + std::string converted; + myReader.myConverter->convert(converted, myCurrentEntryText); + myReader.myConverter->reset(); + myEntries[myCurrentReference] = converted; + myCurrentEntryText.erase(); + } + myIsActive = false; +} + +void MobipocketHtmlBookReader::TOCReader::appendText(const char *text, std::size_t len) { + if (myIsActive) { + myCurrentEntryText.append(text, len); + } +} + +void MobipocketHtmlBookReader::TOCReader::addReference(std::size_t position, const std::string &text) { + myEntries[position] = text; + if (rangeContainsPosition(position)) { + setEndOffset(position); + } +} + +void MobipocketHtmlBookReader::TOCReader::setStartOffset(std::size_t position) { + myStartOffset = position; + std::map<std::size_t,std::string>::const_iterator it = myEntries.lower_bound(position); + if (it != myEntries.end()) { + ++it; + if (it != myEntries.end()) { + myEndOffset = it->first; + } + } +} + +void MobipocketHtmlBookReader::TOCReader::setEndOffset(std::size_t position) { + myEndOffset = position; +} + +const std::map<std::size_t,std::string> &MobipocketHtmlBookReader::TOCReader::entries() const { + return myEntries; +} + +void MobipocketHtmlHrefTagAction::run(const HtmlReader::HtmlTag &tag) { + MobipocketHtmlBookReader &reader = (MobipocketHtmlBookReader&)myReader; + if (tag.Start) { + for (unsigned int i = 0; i < tag.Attributes.size(); ++i) { + if 
(tag.Attributes[i].Name == "FILEPOS") { + const std::string &value = tag.Attributes[i].Value; + if (!value.empty()) { + std::string label = "&"; + int intValue = std::atoi(value.c_str()); + if (intValue > 0) { + if (reader.myTocReader.rangeContainsPosition(tag.Offset)) { + reader.myTocReader.startReadEntry(intValue); + if (reader.myTocReader.rangeContainsPosition(intValue)) { + reader.myTocReader.setEndOffset(intValue); + } + } + reader.myFileposReferences.insert(intValue); + ZLStringUtil::appendNumber(label, intValue); + setHyperlinkType(INTERNAL_HYPERLINK); + bookReader().addHyperlinkControl(INTERNAL_HYPERLINK, label); + return; + } + } + } + } + } else { + reader.myTocReader.endReadEntry(); + } + HtmlHrefTagAction::run(tag); +} + +MobipocketHtmlGuideTagAction::MobipocketHtmlGuideTagAction(HtmlBookReader &reader) : HtmlTagAction(reader) { +} + +void MobipocketHtmlGuideTagAction::run(const HtmlReader::HtmlTag &tag) { + MobipocketHtmlBookReader &reader = (MobipocketHtmlBookReader&)myReader; + reader.myInsideGuide = tag.Start; +} + +MobipocketHtmlReferenceTagAction::MobipocketHtmlReferenceTagAction(HtmlBookReader &reader) : HtmlTagAction(reader) { +} + +void MobipocketHtmlReferenceTagAction::run(const HtmlReader::HtmlTag &tag) { + MobipocketHtmlBookReader &reader = (MobipocketHtmlBookReader&)myReader; + if (reader.myInsideGuide) { + std::string title; + std::string filepos; + bool isTocReference = false; + for (std::size_t i = 0; i < tag.Attributes.size(); ++i) { + const std::string &name = tag.Attributes[i].Name; + const std::string &value = tag.Attributes[i].Value; + if (name == "TITLE") { + title = value; + } else if (name == "FILEPOS") { + filepos = value; + } else if ((name == "TYPE") && (ZLUnicodeUtil::toUpper(value) == "TOC")) { + isTocReference = true; + } + } + if (!title.empty() && !filepos.empty()) { + int position = std::atoi(filepos.c_str()); + if (position > 0) { + reader.myTocReader.addReference(position, title); + if (isTocReference) { + 
reader.myTocReader.setStartOffset(position); + } + } + } + } +} + +shared_ptr<HtmlTagAction> MobipocketHtmlBookReader::createAction(const std::string &tag) { + if (tag == "IMG") { + return new MobipocketHtmlImageTagAction(*this); + } else if (tag == "HR") { + return new MobipocketHtmlHrTagAction(*this); + } else if (tag == "A") { + return new MobipocketHtmlHrefTagAction(*this); + } else if (tag == "GUIDE") { + return new MobipocketHtmlGuideTagAction(*this); + } else if (tag == "REFERENCE") { + return new MobipocketHtmlReferenceTagAction(*this); + } else if (tag == "MBP:PAGEBREAK") { + return new MobipocketHtmlPagebreakTagAction(*this); + } + return HtmlBookReader::createAction(tag); +} + +void MobipocketHtmlBookReader::startDocumentHandler() { + HtmlBookReader::startDocumentHandler(); + myImageCounter = 0; + myInsideGuide = false; + myFileposReferences.clear(); + myPositionToParagraphMap.clear(); + myTocReader.reset(); +} + +bool MobipocketHtmlBookReader::tagHandler(const HtmlTag &tag) { + std::size_t paragraphNumber = myBookReader.model().bookTextModel()->paragraphsNumber(); + if (myBookReader.paragraphIsOpen()) { + --paragraphNumber; + } + myPositionToParagraphMap.push_back(std::make_pair(tag.Offset, paragraphNumber)); + return HtmlBookReader::tagHandler(tag); +} + +MobipocketHtmlBookReader::MobipocketHtmlBookReader(const ZLFile &file, BookModel &model, const PlainTextFormat &format, const std::string &encoding) : HtmlBookReader("", model, format, encoding), myFileName(file.path()), myTocReader(*this) { + setBuildTableOfContent(false); + setProcessPreTag(false); +} + +bool MobipocketHtmlBookReader::characterDataHandler(const char *text, std::size_t len, bool convert) { + myTocReader.appendText(text, len); + return HtmlBookReader::characterDataHandler(text, len, convert); +} + +void MobipocketHtmlBookReader::readDocument(ZLInputStream &stream) { + HtmlBookReader::readDocument(stream); + + PalmDocStream &pdStream = (PalmDocStream&)stream; + int index = 
pdStream.firstImageLocationIndex(myFileName); + + if (index >= 0) { + for (int i = 0; i < myImageCounter; i++) { + std::pair<int,int> imageLocation = pdStream.imageLocation(pdStream.header(), i + index); + if ((imageLocation.first > 0) && (imageLocation.second > 0)) { + std::string id; + ZLStringUtil::appendNumber(id, i + 1); + myBookReader.addImage(id, new ZLFileImage(ZLFile(myFileName), imageLocation.first, imageLocation.second)); + } + } + } + + std::vector<std::pair<std::size_t,std::size_t> >::const_iterator jt = myPositionToParagraphMap.begin(); + for (std::set<std::size_t>::const_iterator it = myFileposReferences.begin(); it != myFileposReferences.end(); ++it) { + while (jt != myPositionToParagraphMap.end() && jt->first < *it) { + ++jt; + } + if (jt == myPositionToParagraphMap.end()) { + break; + } + std::string label = "&"; + ZLStringUtil::appendNumber(label, *it); + myBookReader.addHyperlinkLabel(label, jt->second); + } + + jt = myPositionToParagraphMap.begin(); + const std::map<std::size_t,std::string> &entries = myTocReader.entries(); + for (std::map<std::size_t,std::string>::const_iterator it = entries.begin(); it != entries.end(); ++it) { + while (jt != myPositionToParagraphMap.end() && jt->first < it->first) { + ++jt; + } + if (jt == myPositionToParagraphMap.end()) { + break; + } + myBookReader.beginContentsParagraph(jt->second); + myBookReader.addContentsData(it->second); + myBookReader.endContentsParagraph(); + } +} diff --git a/reader/src/formats/pdb/MobipocketHtmlBookReader.h b/reader/src/formats/pdb/MobipocketHtmlBookReader.h new file mode 100644 index 0000000..7a35523 --- /dev/null +++ b/reader/src/formats/pdb/MobipocketHtmlBookReader.h @@ -0,0 +1,89 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your 
option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#ifndef __MOBIPOCKETHTMLBOOKREADER_H__ +#define __MOBIPOCKETHTMLBOOKREADER_H__ + +#include <set> + +#include "../html/HtmlBookReader.h" + +class MobipocketHtmlBookReader : public HtmlBookReader { + +public: + MobipocketHtmlBookReader(const ZLFile &file, BookModel &model, const PlainTextFormat &format, const std::string &encoding); + void readDocument(ZLInputStream &stream); + +private: + void startDocumentHandler(); + bool tagHandler(const HtmlTag &tag); + bool characterDataHandler(const char *text, std::size_t len, bool convert); + shared_ptr<HtmlTagAction> createAction(const std::string &tag); + +public: + class TOCReader { + + public: + TOCReader(MobipocketHtmlBookReader &reader); + void reset(); + + void addReference(std::size_t position, const std::string &text); + + void setStartOffset(std::size_t position); + void setEndOffset(std::size_t position); + + bool rangeContainsPosition(std::size_t position); + + void startReadEntry(std::size_t position); + void endReadEntry(); + void appendText(const char *text, std::size_t len); + + const std::map<std::size_t,std::string> &entries() const; + + private: + MobipocketHtmlBookReader &myReader; + + std::map<std::size_t,std::string> myEntries; + + bool myIsActive; + std::size_t myStartOffset; + std::size_t myEndOffset; + + std::size_t myCurrentReference; + std::string myCurrentEntryText; + }; + +private: + int myImageCounter; + const std::string myFileName; + + std::vector<std::pair<std::size_t,std::size_t> > 
myPositionToParagraphMap; + std::set<std::size_t> myFileposReferences; + bool myInsideGuide; + TOCReader myTocReader; + +friend class MobipocketHtmlImageTagAction; +friend class MobipocketHtmlHrefTagAction; +friend class MobipocketHtmlGuideTagAction; +friend class MobipocketHtmlReferenceTagAction; +friend class MobipocketHtmlPagebreakTagAction; +friend class TOCReader; +}; + +#endif /* __MOBIPOCKETHTMLBOOKREADER_H__ */ diff --git a/reader/src/formats/pdb/MobipocketPlugin.cpp b/reader/src/formats/pdb/MobipocketPlugin.cpp new file mode 100644 index 0000000..4832b43 --- /dev/null +++ b/reader/src/formats/pdb/MobipocketPlugin.cpp @@ -0,0 +1,229 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ */ + +#include <ZLFile.h> +#include <ZLInputStream.h> +#include <ZLEncodingConverter.h> +#include <ZLUnicodeUtil.h> +#include <ZLLanguageUtil.h> +#include <ZLImage.h> +#include <ZLFileImage.h> + +#include "PdbPlugin.h" +#include "PalmDocStream.h" +#include "MobipocketHtmlBookReader.h" + +#include "../../library/Book.h" + +bool MobipocketPlugin::acceptsFile(const ZLFile &file) const { + return PdbPlugin::fileType(file) == "BOOKMOBI"; +} + +void MobipocketPlugin::readDocumentInternal(const ZLFile &file, BookModel &model, const PlainTextFormat &format, const std::string &encoding, ZLInputStream &stream) const { + MobipocketHtmlBookReader(file, model, format, encoding).readDocument(stream); +} + +bool MobipocketPlugin::readMetaInfo(Book &book) const { + shared_ptr<ZLInputStream> stream = book.file().inputStream(); + if (stream.isNull() || ! stream->open()) { + return false; + } + PdbHeader header; + if (!header.read(stream)) { + return false; + } + stream->seek(header.Offsets[0] + 16, true); + char test[5]; + test[4] = '\0'; + stream->read(test, 4); + static const std::string MOBI = "MOBI"; + if (MOBI != test) { + return PalmDocLikePlugin::readMetaInfo(book); + } + + unsigned long length; + PdbUtil::readUnsignedLongBE(*stream, length); + + stream->seek(4, false); + + unsigned long encodingCode; + PdbUtil::readUnsignedLongBE(*stream, encodingCode); + if (book.encoding().empty()) { + ZLEncodingConverterInfoPtr info = ZLEncodingCollection::Instance().info(encodingCode); + if (!info.isNull()) { + book.setEncoding(info->name()); + } + } + + stream->seek(52, false); + + unsigned long fullNameOffset; + PdbUtil::readUnsignedLongBE(*stream, fullNameOffset); + unsigned long fullNameLength; + PdbUtil::readUnsignedLongBE(*stream, fullNameLength); + + unsigned long languageCode; + PdbUtil::readUnsignedLongBE(*stream, languageCode); + book.setLanguage(ZLLanguageUtil::languageByCode(languageCode & 0xFF, (languageCode >> 8) & 0xFF)); + + stream->seek(32, false); + + unsigned long 
exthFlags; + PdbUtil::readUnsignedLongBE(*stream, exthFlags); + if (exthFlags & 0x40) { + stream->seek(header.Offsets[0] + 16 + length, true); + + stream->read(test, 4); + static const std::string EXTH = "EXTH"; + if (EXTH == test) { + stream->seek(4, false); + unsigned long recordsNum; + PdbUtil::readUnsignedLongBE(*stream, recordsNum); + for (unsigned long i = 0; i < recordsNum; ++i) { + unsigned long type; + PdbUtil::readUnsignedLongBE(*stream, type); + unsigned long size; + PdbUtil::readUnsignedLongBE(*stream, size); + if (size > 8) { + std::string value(size - 8, '\0'); + stream->read((char*)value.data(), size - 8); + switch (type) { + case 100: // author + { + int index = value.find(','); + if (index != -1) { + std::string part0 = value.substr(0, index); + std::string part1 = value.substr(index + 1); + ZLUnicodeUtil::utf8Trim(part0); + ZLUnicodeUtil::utf8Trim(part1); + value = part1 + ' ' + part0; + } else { + ZLUnicodeUtil::utf8Trim(value); + } + book.addAuthor(value); + break; + } + case 105: // subject + book.addTag(value); + break; + } + } + } + } + } + + stream->seek(header.Offsets[0] + fullNameOffset, true); + std::string title(fullNameLength, '\0'); + stream->read((char*)title.data(), fullNameLength); + book.setTitle(title); + + stream->close(); + return PalmDocLikePlugin::readMetaInfo(book); +} + +shared_ptr<const ZLImage> MobipocketPlugin::coverImage(const ZLFile &file) const { + shared_ptr<ZLInputStream> stream = file.inputStream(); + if (stream.isNull() || ! 
stream->open()) { + return 0; + } + PdbHeader header; + if (!header.read(stream)) { + return 0; + } + stream->seek(header.Offsets[0] + 16, true); + char test[5]; + test[4] = '\0'; + stream->read(test, 4); + static const std::string MOBI = "MOBI"; + if (MOBI != test) { + return 0; + } + + unsigned long length; + PdbUtil::readUnsignedLongBE(*stream, length); + + stream->seek(104, false); + + unsigned long exthFlags; + unsigned long coverIndex = (unsigned long)-1; + unsigned long thumbIndex = (unsigned long)-1; + PdbUtil::readUnsignedLongBE(*stream, exthFlags); + if (exthFlags & 0x40) { + stream->seek(header.Offsets[0] + 16 + length, true); + + stream->read(test, 4); + static const std::string EXTH = "EXTH"; + if (EXTH != test) { + return 0; + } + stream->seek(4, false); + unsigned long recordsNum; + PdbUtil::readUnsignedLongBE(*stream, recordsNum); + for (unsigned long i = 0; i < recordsNum; ++i) { + unsigned long type; + PdbUtil::readUnsignedLongBE(*stream, type); + unsigned long size; + PdbUtil::readUnsignedLongBE(*stream, size); + switch (type) { + case 201: // coveroffset + if (size == 12) { + PdbUtil::readUnsignedLongBE(*stream, coverIndex); + } else { + stream->seek(size - 8, false); + } + break; + case 202: // thumboffset + if (size == 12) { + PdbUtil::readUnsignedLongBE(*stream, thumbIndex); + } else { + stream->seek(size - 8, false); + } + break; + default: + stream->seek(size - 8, false); + break; + } + } + } + stream->close(); + + if (coverIndex == (unsigned long)-1) { + if (thumbIndex == (unsigned long)-1) { + return 0; + } + coverIndex = thumbIndex; + } + + PalmDocStream pbStream(file); + if (!pbStream.open()) { + return 0; + } + int index = pbStream.firstImageLocationIndex(file.path()); + if (index >= 0) { + std::pair<int,int> imageLocation = pbStream.imageLocation(pbStream.header(), index + coverIndex); + if ((imageLocation.first > 0) && (imageLocation.second > 0)) { + return new ZLFileImage( + file, + imageLocation.first, + imageLocation.second + ); + 
} + } + return 0; +} diff --git a/reader/src/formats/pdb/PalmDocLikePlugin.cpp b/reader/src/formats/pdb/PalmDocLikePlugin.cpp new file mode 100644 index 0000000..27c03a1 --- /dev/null +++ b/reader/src/formats/pdb/PalmDocLikePlugin.cpp @@ -0,0 +1,40 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ */ + +#include <ZLFile.h> + +#include "PdbPlugin.h" +#include "PalmDocStream.h" +#include "PalmDocLikeStream.h" + +#include "../../library/Book.h" + +bool PalmDocLikePlugin::providesMetaInfo() const { + return true; +} + +shared_ptr<ZLInputStream> PalmDocLikePlugin::createStream(const ZLFile &file) const { + return new PalmDocStream(file); +} + +const std::string &PalmDocLikePlugin::tryOpen(const ZLFile &file) const { + PalmDocStream stream(file); + stream.open(); + return stream.error(); +} diff --git a/reader/src/formats/pdb/PalmDocLikeStream.cpp b/reader/src/formats/pdb/PalmDocLikeStream.cpp new file mode 100644 index 0000000..8b99d4d --- /dev/null +++ b/reader/src/formats/pdb/PalmDocLikeStream.cpp @@ -0,0 +1,78 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ */ + +#include <ZLFile.h> +#include <ZLResource.h> + +#include "PalmDocLikeStream.h" + + +PalmDocLikeStream::PalmDocLikeStream(const ZLFile &file) : PdbStream(file) { +} + +PalmDocLikeStream::~PalmDocLikeStream() { + close(); +} + +bool PalmDocLikeStream::open() { + myErrorCode = ERROR_NONE; + if (!PdbStream::open()) { + myErrorCode = ERROR_UNKNOWN; + return false; + } + + if (!processZeroRecord()) { + return false; + } + + myBuffer = new char[myMaxRecordSize]; + myRecordIndex = 0; + return true; +} + +bool PalmDocLikeStream::fillBuffer() { + while (myBufferOffset == myBufferLength) { + if (myRecordIndex + 1 > myMaxRecordIndex) { + return false; + } + ++myRecordIndex; + if (!processRecord()) { + return false; + } + } + //myBufferOffset = 0; + return true; +} + +const std::string &PalmDocLikeStream::error() const { + static const ZLResource &resource = ZLResource::resource("mobipocketPlugin"); + switch (myErrorCode) { + default: + { + static const std::string EMPTY; + return EMPTY; + } + case ERROR_UNKNOWN: + return resource["unknown"].value(); + case ERROR_COMPRESSION: + return resource["unsupportedCompressionMethod"].value(); + case ERROR_ENCRYPTION: + return resource["encryptedFile"].value(); + } +} diff --git a/reader/src/formats/pdb/PalmDocLikeStream.h b/reader/src/formats/pdb/PalmDocLikeStream.h new file mode 100644 index 0000000..623a493 --- /dev/null +++ b/reader/src/formats/pdb/PalmDocLikeStream.h @@ -0,0 +1,58 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA.
 */

#ifndef __PALMDOCLIKESTREAM_H__
#define __PALMDOCLIKESTREAM_H__

#include "PdbStream.h"

class ZLFile;

// Base class for PDB streams whose text lives in numbered records described
// by a "record 0" header. Subclasses supply the parsing of record 0 and the
// decoding of each following record.
class PalmDocLikeStream : public PdbStream {

public:
	PalmDocLikeStream(const ZLFile &file);
	~PalmDocLikeStream();
	// Opens the PDB container, parses record 0 and allocates the decode buffer.
	bool open();

	// Message for the current myErrorCode; empty string for ERROR_NONE.
	const std::string &error() const;
	//std::pair<int,int> imageLocation(int index);
	//bool hasExtraSections() const;

protected:
	// Decodes further records until unread bytes are available.
	bool fillBuffer();

private:
	// Decodes record myRecordIndex into myBuffer.
	virtual bool processRecord() = 0;
	// Parses record 0; expected to set myMaxRecordSize and myMaxRecordIndex
	// (open() sizes the buffer from myMaxRecordSize right after this call).
	virtual bool processZeroRecord() = 0;

protected:
	unsigned short myMaxRecordSize;	// size of myBuffer, in bytes
	std::size_t myRecordIndex;	// record decoded most recently (0 = none yet)
	std::size_t myMaxRecordIndex;	// last record index fillBuffer() will decode

	enum {
		ERROR_NONE,
		ERROR_UNKNOWN,
		ERROR_COMPRESSION,
		ERROR_ENCRYPTION,
	} myErrorCode;
};

#endif /* __PALMDOCLIKESTREAM_H__ */
diff --git a/reader/src/formats/pdb/PalmDocPlugin.cpp b/reader/src/formats/pdb/PalmDocPlugin.cpp
new file mode 100644
index 0000000..c23f11c
--- /dev/null
+++ b/reader/src/formats/pdb/PalmDocPlugin.cpp
@@ -0,0 +1,54 @@
/*
 * Copyright (C) 2004-2012 Geometer Plus <[email protected]>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#include <ZLFile.h> +#include <ZLInputStream.h> + +#include "PdbPlugin.h" +#include "PalmDocStream.h" +#include "MobipocketHtmlBookReader.h" +#include "../txt/PlainTextFormat.h" +#include "../util/TextFormatDetector.h" + +bool PalmDocPlugin::acceptsFile(const ZLFile &file) const { + return PdbPlugin::fileType(file) == "TEXtREAd"; +} + +void PalmDocPlugin::readDocumentInternal(const ZLFile &file, BookModel &model, const PlainTextFormat &format, const std::string &encoding, ZLInputStream &stream) const { + stream.open(); + bool readAsPalmDoc = ((PalmDocStream&)stream).hasExtraSections(); + stream.close(); + if (readAsPalmDoc) { + MobipocketHtmlBookReader(file, model, format, encoding).readDocument(stream); + } else { + SimplePdbPlugin::readDocumentInternal(file, model, format, encoding, stream); + } +} + +FormatInfoPage *PalmDocPlugin::createInfoPage(ZLOptionsDialog &dialog, const ZLFile &file) { + shared_ptr<ZLInputStream> stream = createStream(file); + stream->open(); + bool readAsPalmDoc = ((PalmDocStream&)*stream).hasExtraSections(); + stream->close(); + if (!readAsPalmDoc) { + return new PlainTextInfoPage(dialog, file, ZLResourceKey("Text"), !TextFormatDetector().isHtml(*stream)); + } else { + return 0; + } +} diff --git a/reader/src/formats/pdb/PalmDocStream.cpp b/reader/src/formats/pdb/PalmDocStream.cpp new file mode 100644 index 0000000..e699d47 --- /dev/null +++ b/reader/src/formats/pdb/PalmDocStream.cpp @@ -0,0 +1,209 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any 
later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#include <cstring> +#include <algorithm> + +#include <ZLFile.h> +#include <ZLResource.h> +#include <ZLZDecompressor.h> + +#include "PalmDocStream.h" +#include "DocDecompressor.h" +#include "HuffDecompressor.h" + +PalmDocStream::PalmDocStream(const ZLFile &file) : PalmDocLikeStream(file) { +} + +PalmDocStream::~PalmDocStream() { + close(); +} + +bool PalmDocStream::processRecord() { + const std::size_t currentOffset = recordOffset(myRecordIndex); + if (currentOffset < myBase->offset()) { + return false; + } + myBase->seek(currentOffset, true); + const std::size_t nextOffset = recordOffset(myRecordIndex + 1); + if (nextOffset < currentOffset) { + return false; + } + const unsigned short recordSize = nextOffset - currentOffset; + switch(myCompressionVersion) { + case 17480://'DH' // HuffCDic compression + myBufferLength = myHuffDecompressorPtr->decompress(*myBase, myBuffer, recordSize, myMaxRecordSize); + //if (myHuffDecompressorPtr->error()) { + // myErrorCode = ERROR_UNKNOWN; + //} + break; + case 2: // PalmDoc compression + myBufferLength = DocDecompressor().decompress(*myBase, myBuffer, recordSize, myMaxRecordSize); + break; + case 1: // No compression + myBufferLength = myBase->read(myBuffer, std::min(recordSize, myMaxRecordSize)); + break; + } + myBufferOffset = 0; + return true; +} + +bool PalmDocStream::processZeroRecord() { + // Uses with offset presetting to zero record offset value + PdbUtil::readUnsignedShort(*myBase, myCompressionVersion); // myBase offset: ^ + 2 
+ switch (myCompressionVersion) { + case 1: + case 2: + case 17480: + break; + default: + myErrorCode = ERROR_COMPRESSION; + return false; + } + myBase->seek(2, false); // myBase offset: ^ + 4 + PdbUtil::readUnsignedLongBE(*myBase, myTextLength); // myBase offset: ^ + 8 + PdbUtil::readUnsignedShort(*myBase, myTextRecordNumber); // myBase offset: ^ + 10 + + unsigned short endSectionIndex = header().Offsets.size(); + myMaxRecordIndex = std::min(myTextRecordNumber, (unsigned short)(endSectionIndex - 1)); + //TODO Insert in this point error message about uncompatible records and numRecords from Header + + PdbUtil::readUnsignedShort(*myBase, myMaxRecordSize); // myBase offset: ^ + 12 + if (myMaxRecordSize == 0) { + myErrorCode = ERROR_UNKNOWN; + return false; + } + + /* + std::cerr << "PalmDocStream::processRecord0():\n"; + std::cerr << "PDB header indentificator : " << header().Id << "\n"; + std::cerr << "PDB file system: sizeof opened : " << myBaseSize << "\n"; + std::cerr << "PDB header/record[0] max index : " << myMaxRecordIndex << "\n"; + std::cerr << "PDB record[0][0..2] compression : " << myCompressionVersion << "\n"; + std::cerr << "PDB record[0][2..4] spare : " << mySpare << "\n"; + std::cerr << "PDB record[0][4..8] text length : " << myTextLength << "\n"; + std::cerr << "PDB record[0][8..10] text records : " << myTextRecords << "\n"; + std::cerr << "PDB record[0][10..12] max record size: " << myMaxRecordSize << "\n"; + */ + + if (header().Id == "BOOKMOBI") { + unsigned short encrypted = 0; + PdbUtil::readUnsignedShort(*myBase, encrypted); // myBase offset: ^ + 14 + if (encrypted) { //Always = 2, if encrypted + myErrorCode = ERROR_ENCRYPTION; + return false; + } + } else { + myBase->seek(2, false); + } + + + if (myCompressionVersion == 17480) { + unsigned long mobiHeaderLength; + unsigned long huffSectionIndex; + unsigned long huffSectionNumber; + unsigned short extraFlags; + unsigned long initialOffset = header().Offsets[0]; // myBase offset: ^ + + 
myBase->seek(6, false); // myBase offset: ^ + 20 + PdbUtil::readUnsignedLongBE(*myBase, mobiHeaderLength); // myBase offset: ^ + 24 + + myBase->seek(0x70 - 24, false); // myBase offset: ^ + 102 (0x70) + PdbUtil::readUnsignedLongBE(*myBase, huffSectionIndex); // myBase offset: ^ + 106 (0x74) + PdbUtil::readUnsignedLongBE(*myBase, huffSectionNumber); // myBase offset: ^ + 110 (0x78) + + if (mobiHeaderLength >= 244) { + myBase->seek(0xF2 - 0x78, false); // myBase offset: ^ + 242 (0xF2) + PdbUtil::readUnsignedShort(*myBase, extraFlags); // myBase offset: ^ + 244 (0xF4) + } else { + extraFlags = 0; + } + /* + std::cerr << "mobi header length: " << mobiHeaderLength << "\n"; + std::cerr << "Huff's start record : " << huffSectionIndex << " from " << endSectionIndex - 1 << "\n"; + std::cerr << "Huff's records number: " << huffSectionNumber << "\n"; + std::cerr << "Huff's extraFlags : " << extraFlags << "\n"; + */ + const unsigned long endHuffSectionIndex = huffSectionIndex + huffSectionNumber; + if (endHuffSectionIndex > endSectionIndex || huffSectionNumber <= 1) { + myErrorCode = ERROR_COMPRESSION; + return false; + } + const unsigned long endHuffDataOffset = recordOffset(endHuffSectionIndex); + std::vector<unsigned long>::const_iterator beginHuffSectionOffsetIt = header().Offsets.begin() + huffSectionIndex; + // point to first Huff section + std::vector<unsigned long>::const_iterator endHuffSectionOffsetIt = header().Offsets.begin() + endHuffSectionIndex; + // point behind last Huff section + + + myHuffDecompressorPtr = new HuffDecompressor(*myBase, beginHuffSectionOffsetIt, endHuffSectionOffsetIt, endHuffDataOffset, extraFlags); + myBase->seek(initialOffset, true); // myBase offset: ^ + 14 + } + return true; +} + +bool PalmDocStream::hasExtraSections() const { + return myMaxRecordIndex < header().Offsets.size() - 1; +} + +std::pair<int,int> PalmDocStream::imageLocation(const PdbHeader &header, int index) const { + index += myMaxRecordIndex + 1; + int recordNumber = 
header.Offsets.size(); + if (index > recordNumber - 1) { + return std::make_pair(-1, -1); + } else { + int start = header.Offsets[index]; + int end = (index < recordNumber - 1) ? + header.Offsets[index + 1] : myBase->offset(); + return std::make_pair(start, end - start); + } +} + +int PalmDocStream::firstImageLocationIndex(const std::string &fileName) { + shared_ptr<ZLInputStream> fileStream = ZLFile(fileName).inputStream(); + if (fileStream.isNull() || !fileStream->open()) { + return -1; + } + + bool found = false; + int index = 0; + char bu[5] = { 0 }; + std::pair<int,int> firstImageLocation = imageLocation(header(), 0); + fileStream->seek(firstImageLocation.first, false); + while ((firstImageLocation.first > 0) && (firstImageLocation.second > 0)) { + if (firstImageLocation.second > 4) { + fileStream->read(bu, 4); + static const char jpegStart[2] = { (char)0xFF, (char)0xd8 }; + if (std::strncmp(bu, "BM", 2) == 0 || + std::strncmp(bu, "GIF8", 4) == 0 || + std::strncmp(bu, jpegStart, 2) == 0) { + found = true; + break; + } + fileStream->seek(firstImageLocation.second - 4, false); + } else { + fileStream->seek(firstImageLocation.second, false); + } + index++; + firstImageLocation = imageLocation(header(), index); + } + + fileStream->close(); + return found ? index : -1; +} diff --git a/reader/src/formats/pdb/PalmDocStream.h b/reader/src/formats/pdb/PalmDocStream.h new file mode 100644 index 0000000..4782a7b --- /dev/null +++ b/reader/src/formats/pdb/PalmDocStream.h @@ -0,0 +1,50 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA.
 */

#ifndef __PALMDOCSTREAM_H__
#define __PALMDOCSTREAM_H__

#include "PalmDocLikeStream.h"

class ZLFile;
class HuffDecompressor;

// Stream over the text records of a PalmDoc / Mobipocket PDB file.
// Supports compression codes 1 (none), 2 (PalmDoc) and 17480 ('DH', HuffCDic).
class PalmDocStream : public PalmDocLikeStream {

public:
	PalmDocStream(const ZLFile &file);
	~PalmDocStream();

	// (offset, size) of the index-th record after the text records,
	// or (-1, -1) if there is no such record.
	std::pair<int,int> imageLocation(const PdbHeader &header, int index) const;
	// True when the file has records beyond the text records.
	bool hasExtraSections() const;
	// Index of the first post-text record that starts with a BMP/GIF/JPEG
	// signature, or -1.
	int firstImageLocationIndex(const std::string &fileName);

private:
	bool processRecord();
	bool processZeroRecord();

private:
	unsigned short myCompressionVersion;	// record 0, bytes [0..2]
	unsigned long myTextLength;	//TODO: Warning: isn't used
	unsigned short myTextRecordNumber;	// record 0, bytes [8..10]

	// Created by processZeroRecord() only for compression code 17480.
	shared_ptr<HuffDecompressor> myHuffDecompressorPtr;
};

#endif /* __PALMDOCSTREAM_H__ */
diff --git a/reader/src/formats/pdb/PdbPlugin.cpp b/reader/src/formats/pdb/PdbPlugin.cpp
new file mode 100644
index 0000000..69ef233
--- /dev/null
+++ b/reader/src/formats/pdb/PdbPlugin.cpp
@@ -0,0 +1,69 @@
/*
 * Copyright (C) 2004-2012 Geometer Plus <[email protected]>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#include <ZLFile.h> +#include <ZLInputStream.h> +#include <ZLOptions.h> + +#include "PdbPlugin.h" +#include "../../options/FBCategoryKey.h" + +#include "../../database/booksdb/BooksDBUtil.h" +#include "../../database/booksdb/BooksDB.h" + +PdbPlugin::~PdbPlugin() { +} + +std::string PdbPlugin::fileType(const ZLFile &file) { + const std::string &extension = file.extension(); + if ((extension != "prc") && (extension != "pdb") && (extension != "mobi")) { + return ""; + } + + const std::string &fileName = file.path(); + //int index = fileName.find(':'); + //ZLFile baseFile = (index == -1) ? file : ZLFile(fileName.substr(0, index)); + ZLFile baseFile(file.physicalFilePath()); + bool upToDate = BooksDBUtil::checkInfo(baseFile); + + //ZLStringOption palmTypeOption(FBCategoryKey::BOOKS, file.path(), "PalmType", ""); + std::string palmType = BooksDB::Instance().getPalmType(fileName); + if ((palmType.length() != 8) || !upToDate) { + shared_ptr<ZLInputStream> stream = file.inputStream(); + if (stream.isNull() || !stream->open()) { + return ""; + } + stream->seek(60, false); + char id[8]; + stream->read(id, 8); + stream->close(); + palmType = std::string(id, 8); + if (!upToDate) { + BooksDBUtil::saveInfo(baseFile); + } + //palmTypeOption.setValue(palmType); + BooksDB::Instance().setPalmType(fileName, palmType); + } + return palmType; +} + +bool PdbPlugin::readLanguageAndEncoding(Book &book) const { + (void)book; + return true; +} diff --git a/reader/src/formats/pdb/PdbPlugin.h b/reader/src/formats/pdb/PdbPlugin.h new file mode 100644 index 0000000..9f8600b --- /dev/null +++ b/reader/src/formats/pdb/PdbPlugin.h @@ -0,0 +1,119 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> 
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA.
 */

#ifndef __PDBPLUGIN_H__
#define __PDBPLUGIN_H__

#include <shared_ptr.h>

#include "../FormatPlugin.h"

// Common base of all plugins for Palm database (.prc/.pdb/.mobi) formats.
class PdbPlugin : public FormatPlugin {

public:
	// 8-byte type/creator id of a PDB file, or "" if it cannot be determined.
	static std::string fileType(const ZLFile &file);
	// Always returns true without touching `book`.
	bool readLanguageAndEncoding(Book &book) const;

protected:
	PdbPlugin();

public:
	virtual ~PdbPlugin();
};

class PluckerPlugin : public PdbPlugin {

public:
	bool providesMetaInfo() const;
	bool acceptsFile(const ZLFile &file) const;
	bool readMetaInfo(Book &book) const;
	bool readModel(BookModel &model) const;
};

// Base for PDB formats that are read through a decoding ZLInputStream
// supplied by createStream().
class SimplePdbPlugin : public PdbPlugin {

public:
	bool readMetaInfo(Book &book) const;
	bool readModel(BookModel &model) const;

protected:
	virtual shared_ptr<ZLInputStream> createStream(const ZLFile &file) const = 0;
	virtual void readDocumentInternal(const ZLFile &file, BookModel &model, const class PlainTextFormat &format, const std::string &encoding, ZLInputStream &stream) const;
};

// Shared behaviour of the PalmDoc and Mobipocket plugins; both read their
// text through a PalmDocStream.
class PalmDocLikePlugin : public SimplePdbPlugin {

public:
	bool providesMetaInfo() const;
	// Opens the file with a PalmDocStream and returns its error message
	// (empty if no error was recorded).
	const std::string &tryOpen(const ZLFile &file) const;

protected:
	shared_ptr<ZLInputStream> createStream(const ZLFile &file) const;
};

// Plugin for PDB files with the "TEXtREAd" type/creator id.
class PalmDocPlugin : public PalmDocLikePlugin {

public:
	bool acceptsFile(const ZLFile &file) const;

	void readDocumentInternal(const ZLFile &file, BookModel &model, const class PlainTextFormat &format, const std::string &encoding, ZLInputStream &stream) const;

private:
	FormatInfoPage *createInfoPage(ZLOptionsDialog &dialog, const ZLFile &file);
};

class MobipocketPlugin : public PalmDocLikePlugin {

private:
	bool acceptsFile(const ZLFile &file) const;
	bool readMetaInfo(Book &book) const;

	void readDocumentInternal(const ZLFile &file, BookModel &model, const class PlainTextFormat &format, const std::string &encoding, ZLInputStream &stream) const;
	shared_ptr<const ZLImage> coverImage(const ZLFile &file) const;
};

class EReaderPlugin : public SimplePdbPlugin {

public:
	bool providesMetaInfo() const;
	bool acceptsFile(const ZLFile &file) const;
	bool readMetaInfo(Book &book) const;
	const std::string &tryOpen(const ZLFile &file) const;

	void readDocumentInternal(const ZLFile &file, BookModel &model, const class PlainTextFormat &format, const std::string &encoding, ZLInputStream &stream) const;
protected:
	shared_ptr<ZLInputStream> createStream(const ZLFile &file) const;
};

class ZTXTPlugin : public SimplePdbPlugin {

public:
	bool providesMetaInfo() const;
	bool acceptsFile(const ZLFile &file) const;

protected:
	shared_ptr<ZLInputStream> createStream(const ZLFile &file) const;

private:
	FormatInfoPage *createInfoPage(ZLOptionsDialog &dialog, const ZLFile &file);
};

inline PdbPlugin::PdbPlugin() {}

#endif /* __PDBPLUGIN_H__ */
diff --git a/reader/src/formats/pdb/PdbReader.cpp b/reader/src/formats/pdb/PdbReader.cpp
new file mode 100644
index 0000000..54dc654
--- /dev/null
+++ b/reader/src/formats/pdb/PdbReader.cpp
@@ -0,0 +1,108 @@
/*
 * Copyright (C) 2004-2012 Geometer Plus <[email protected]>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the
Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#include <ZLFile.h> + +#include "PdbReader.h" + +void PdbUtil::readUnsignedShort(ZLInputStream &stream, unsigned short &N) { + unsigned char data[2]; + stream.read((char*)data, 2); + N = (((unsigned short)data[0]) << 8) + data[1]; +} + +void PdbUtil::readUnsignedLongBE(ZLInputStream &stream, unsigned long &N) { + unsigned char data[4]; + stream.read((char*)data, 4); + N = (((unsigned long)data[0]) << 24) + + (((unsigned long)data[1]) << 16) + + (((unsigned long)data[2]) << 8) + + (unsigned long)data[3]; +} + +void PdbUtil::readUnsignedLongLE(ZLInputStream &stream, unsigned long &N) { + unsigned char data[4]; + stream.read((char*)data, 4); + N = (((unsigned long)data[3]) << 24) + + (((unsigned long)data[2]) << 16) + + (((unsigned long)data[1]) << 8) + + (unsigned long)data[0]; +} + +bool PdbHeader::read(shared_ptr<ZLInputStream> stream) { + const std::size_t startOffset = stream->offset(); + DocName.erase(); + DocName.append(32, '\0'); + stream->read((char*)DocName.data(), 32); // stream offset: +32 + + PdbUtil::readUnsignedShort(*stream, Flags); // stream offset: +34 + + stream->seek(26, false); // stream offset: +60 + + Id.erase(); + Id.append(8, '\0'); + stream->read((char*)Id.data(), 8); // stream offset: +68 + + stream->seek(8, false); // stream offset: +76 + Offsets.clear(); + unsigned short numRecords; + PdbUtil::readUnsignedShort(*stream, numRecords); // stream offset: +78 + 
Offsets.reserve(numRecords); + + for (int i = 0; i < numRecords; ++i) { // stream offset: +78 + 8 * records number + unsigned long recordOffset; + PdbUtil::readUnsignedLongBE(*stream, recordOffset); + Offsets.push_back(recordOffset); + stream->seek(4, false); + } + return stream->offset() == startOffset + 78 + 8 * numRecords; +} + +/*bool PdbRecord0::read(shared_ptr<ZLInputStream> stream) { + std::size_t startOffset = stream->offset(); + + PdbUtil::readUnsignedShort(*stream, CompressionType); + PdbUtil::readUnsignedShort(*stream, Spare); + PdbUtil::readUnsignedLongBE(*stream, TextLength); + PdbUtil::readUnsignedShort(*stream, TextRecords); + PdbUtil::readUnsignedShort(*stream, MaxRecordSize); + PdbUtil::readUnsignedShort(*stream, NontextOffset); + PdbUtil::readUnsignedShort(*stream, NontextOffset2); + + PdbUtil::readUnsignedLongBE(*stream, MobipocketID); + PdbUtil::readUnsignedLongBE(*stream, MobipocketHeaderSize); + PdbUtil::readUnsignedLongBE(*stream, Unknown24); + PdbUtil::readUnsignedShort(*stream, FootnoteRecs); + PdbUtil::readUnsignedShort(*stream, SidebarRecs); + + PdbUtil::readUnsignedShort(*stream, BookmarkOffset); + PdbUtil::readUnsignedShort(*stream, Unknown34); + PdbUtil::readUnsignedShort(*stream, NontextOffset3); + PdbUtil::readUnsignedShort(*stream, Unknown38); + PdbUtil::readUnsignedShort(*stream, ImagedataOffset); + PdbUtil::readUnsignedShort(*stream, ImagedataOffset2); + PdbUtil::readUnsignedShort(*stream, MetadataOffset); + PdbUtil::readUnsignedShort(*stream, MetadataOffset2); + PdbUtil::readUnsignedShort(*stream, FootnoteOffset); + PdbUtil::readUnsignedShort(*stream, SidebarOffset); + PdbUtil::readUnsignedShort(*stream, LastDataOffset); + PdbUtil::readUnsignedShort(*stream, Unknown54); + + return stream->offset() == startOffset + 56; +}*/ diff --git a/reader/src/formats/pdb/PdbReader.h b/reader/src/formats/pdb/PdbReader.h new file mode 100644 index 0000000..f32ebf5 --- /dev/null +++ b/reader/src/formats/pdb/PdbReader.h @@ -0,0 +1,82 @@ +/* + * 
Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#ifndef __PDBREADER_H__ +#define __PDBREADER_H__ + +#include <vector> + +#include <shared_ptr.h> +#include <ZLInputStream.h> + +//class BookModel; + +class PdbUtil { + +public: + static void readUnsignedShort(ZLInputStream &stream, unsigned short &N); + static void readUnsignedLongBE(ZLInputStream &stream, unsigned long &N); + static void readUnsignedLongLE(ZLInputStream &stream, unsigned long &N); +}; + +struct PdbHeader { + std::string DocName; + unsigned short Flags; + std::string Id; + std::vector<unsigned long> Offsets; + + bool read(shared_ptr<ZLInputStream> stream); +}; + +struct PdbRecord0 { + unsigned short CompressionType; //[0..2] PalmDoc, Mobipocket, Ereader:version + unsigned short Spare; //[2..4] PalmDoc, Mobipocket + unsigned long TextLength; //[4..8] PalmDoc, Mobipocket + unsigned short TextRecords; //[8..10] PalmDoc, Mobipocket + unsigned short MaxRecordSize; //[10..12] PalmDoc, Mobipocket + unsigned short NontextOffset; //[12..14] Ereader + unsigned short NontextOffset2; //[14..16] Ereader //PalmDoc, Mobipocket: encrypted - there is conflict !!!! 
+ + unsigned long MobipocketID; //[16..20] Mobipocket + unsigned long MobipocketHeaderSize;//[20..24] Mobipocket + unsigned long Unknown24; //[24..28] + unsigned short FootnoteRecs; //[28..30] Ereader + unsigned short SidebarRecs; //[30..32] Ereader + +// Following fields are specific for EReader pdb document specification + + unsigned short BookmarkOffset; //[32..34] + unsigned short Unknown34; //[34..36] + unsigned short NontextOffset3; //[36..38] + unsigned short Unknown38; //[38..40] + unsigned short ImagedataOffset; //[40..42] + unsigned short ImagedataOffset2; //[42..44] + unsigned short MetadataOffset; //[44..46] + unsigned short MetadataOffset2; //[46..48] + unsigned short FootnoteOffset; //[48..50] + unsigned short SidebarOffset; //[50..52] + unsigned short LastDataOffset; //[52..54] + unsigned short Unknown54; //[54..56] + + bool read(shared_ptr<ZLInputStream> stream); +//private: +// static bool readNumberBE(unsigned char* buffer, std::size_t offset, std::size_t size); +}; + +#endif /* __PDBREADER_H__ */ diff --git a/reader/src/formats/pdb/PdbStream.cpp b/reader/src/formats/pdb/PdbStream.cpp new file mode 100644 index 0000000..219a0de --- /dev/null +++ b/reader/src/formats/pdb/PdbStream.cpp @@ -0,0 +1,109 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#include <cstring> + +#include <ZLFile.h> + +#include "PdbStream.h" + +PdbStream::PdbStream(const ZLFile &file) : myBase(file.inputStream()) { + myBuffer = 0; +} + +PdbStream::~PdbStream() { +} + +bool PdbStream::open() { + close(); + if (myBase.isNull() || !myBase->open() || !myHeader.read(myBase)) { + return false; + } + // myBase offset: startOffset + 78 + 8 * records number ( myHeader.Offsets.size() ) + + myBase->seek(myHeader.Offsets[0], true); + // myBase offset: Offset[0] - zero record + + myBufferLength = 0; + myBufferOffset = 0; + + myOffset = 0; + + return true; +} + +std::size_t PdbStream::read(char *buffer, std::size_t maxSize) { + std::size_t realSize = 0; + while (realSize < maxSize) { + if (!fillBuffer()) { + break; + } + std::size_t size = std::min((std::size_t)(maxSize - realSize), (std::size_t)(myBufferLength - myBufferOffset)); + + if (size > 0) { + if (buffer != 0) { + std::memcpy(buffer + realSize, myBuffer + myBufferOffset, size); + } + realSize += size; + myBufferOffset += size; + } + } + + myOffset += realSize; + return realSize; +} + +void PdbStream::close() { + if (!myBase.isNull()) { + myBase->close(); + } + if (myBuffer != 0) { + delete[] myBuffer; + myBuffer = 0; + } +} + +void PdbStream::seek(int offset, bool absoluteOffset) { + if (absoluteOffset) { + offset -= this->offset(); + } + if (offset > 0) { + read(0, offset); + } else if (offset < 0) { + offset += this->offset(); + open(); + if (offset >= 0) { + read(0, offset); + } + } +} + +std::size_t PdbStream::offset() const { + return myOffset; +} + +std::size_t PdbStream::sizeOfOpened() { + // TODO: implement + return 0; +} + +std::size_t PdbStream::recordOffset(std::size_t index) const { + return index < myHeader.Offsets.size() ? 
+ myHeader.Offsets[index] : myBase->sizeOfOpened(); +} diff --git a/reader/src/formats/pdb/PdbStream.h b/reader/src/formats/pdb/PdbStream.h new file mode 100644 index 0000000..f2c58f1 --- /dev/null +++ b/reader/src/formats/pdb/PdbStream.h @@ -0,0 +1,72 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ */
+
+#ifndef __PDBSTREAM_H__
+#define __PDBSTREAM_H__
+
+#include <ZLInputStream.h>
+
+#include "PdbReader.h"
+
+class ZLFile;
+/*
+ * Input stream over the records of a PDB file. The base class reads the
+ * PDB header and serves bytes out of myBuffer; a subclass supplies
+ * fillBuffer() to put the next chunk of decoded data into that buffer.
+ */
+class PdbStream : public ZLInputStream {
+
+public:
+	PdbStream(const ZLFile &file); // the underlying stream is file.inputStream()
+	virtual ~PdbStream();
+
+protected:
+	virtual bool open(); // opens the base stream, reads the header, seeks to record 0, resets offsets
+	virtual void close(); // closes the base stream and frees myBuffer
+
+private:
+	std::size_t read(char *buffer, std::size_t maxSize); // copies decoded bytes; a null buffer skips them
+
+	void seek(int offset, bool absoluteOffset); // forward: skip by reading; backward: reopen, then skip
+	std::size_t offset() const; // decoded bytes consumed since open()
+	std::size_t sizeOfOpened(); // not implemented, always 0
+
+protected:
+	virtual bool fillBuffer() = 0; // called by read() before each copy; false stops the read
+
+protected:
+	std::size_t recordOffset(std::size_t index) const; // offset of record `index`, or base stream size past the last one
+
+public:
+	const PdbHeader &header() const;
+
+protected:
+	shared_ptr<ZLInputStream> myBase; // raw stream of the PDB file
+	std::size_t myOffset; // logical position in the decoded data
+
+private:
+	PdbHeader myHeader;
+
+protected:
+	char *myBuffer; // decoded data; released with delete[] in close()
+	unsigned short myBufferLength; // number of valid bytes in myBuffer
+	unsigned short myBufferOffset; // next unread byte in myBuffer
+};
+
+inline const PdbHeader &PdbStream::header() const {
+	return myHeader;
+}
+
+#endif /* __PDBSTREAM_H__ */
diff --git a/reader/src/formats/pdb/PluckerBookReader.cpp b/reader/src/formats/pdb/PluckerBookReader.cpp
new file mode 100644
index 0000000..61bc311
--- /dev/null
+++ b/reader/src/formats/pdb/PluckerBookReader.cpp
@@ -0,0 +1,528 @@
+/*
+ * Copyright (C) 2004-2012 Geometer Plus <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */ + +#include <algorithm> +#include <vector> +#include <cctype> + +#include <ZLZDecompressor.h> +#include <ZLStringUtil.h> +#include <ZLUnicodeUtil.h> +#include <ZLImage.h> +#include <ZLFileImage.h> +#include <ZLFile.h> +#include <ZLTextStyleEntry.h> + +#include "PdbReader.h" +#include "PluckerBookReader.h" +#include "DocDecompressor.h" +#include "PluckerImages.h" +#include "../../bookmodel/BookModel.h" +#include "../../library/Book.h" + +PluckerBookReader::PluckerBookReader(BookModel &model) : BookReader(model), EncodedTextReader(model.book()->encoding()), myFile(model.book()->file()), myFont(FT_REGULAR) { + myCharBuffer = new char[65535]; + myForcedEntry = 0; +} + +PluckerBookReader::~PluckerBookReader() { + delete[] myCharBuffer; +} + +void PluckerBookReader::safeAddControl(FBTextKind kind, bool start) { + if (myParagraphStarted) { + addControl(kind, start); + } else { + myDelayedControls.push_back(std::make_pair(kind, start)); + } +} + +void PluckerBookReader::safeAddHyperlinkControl(const std::string &id) { + if (myParagraphStarted) { + addHyperlinkControl(INTERNAL_HYPERLINK, id); + } else { + myDelayedHyperlinks.push_back(id); + } +} + +void PluckerBookReader::safeBeginParagraph() { + if (!myParagraphStarted) { + myParagraphStarted = true; + myBufferIsEmpty = true; + beginParagraph(); + if (!myParagraphStored) { + myParagraphVector->push_back(model().bookTextModel()->paragraphsNumber() - 1); + myParagraphStored = true; + } + for (std::vector<std::pair<FBTextKind,bool> >::const_iterator it = myDelayedControls.begin(); it != myDelayedControls.end(); ++it) { + addControl(it->first, it->second); + } + if (myForcedEntry != 0) { + addStyleEntry(*myForcedEntry); + } else { + addControl(REGULAR, true); + } + for (std::vector<std::string>::const_iterator it = myDelayedHyperlinks.begin(); it != myDelayedHyperlinks.end(); ++it) { + addHyperlinkControl(INTERNAL_HYPERLINK, *it); + } + myDelayedHyperlinks.clear(); + } +} + + +void PluckerBookReader::safeEndParagraph() { 
+ if (myParagraphStarted) { + if (myBufferIsEmpty) { + static const std::string SPACE = " "; + addData(SPACE); + } + endParagraph(); + myParagraphStarted = false; + } +} + +void PluckerBookReader::processHeader(FontType font, bool start) { + if (start) { + enterTitle(); + FBTextKind kind; + switch (font) { + case FT_H1: + kind = H1; + break; + case FT_H2: + kind = H2; + break; + case FT_H3: + kind = H3; + break; + case FT_H4: + kind = H4; + break; + case FT_H5: + kind = H5; + break; + case FT_H6: + default: + kind = H6; + break; + } + pushKind(kind); + } else { + popKind(); + exitTitle(); + } +}; + +void PluckerBookReader::setFont(FontType font, bool start) { + switch (font) { + case FT_REGULAR: + break; + case FT_H1: + case FT_H2: + case FT_H3: + case FT_H4: + case FT_H5: + case FT_H6: + processHeader(font, start); + break; + case FT_BOLD: + safeAddControl(BOLD, start); + break; + case FT_TT: + safeAddControl(CODE, start); + break; + case FT_SMALL: + break; + case FT_SUB: + safeAddControl(SUB, start); + break; + case FT_SUP: + safeAddControl(SUP, start); + break; + } +} + +void PluckerBookReader::changeFont(FontType font) { + if (myFont == font) { + return; + } + setFont(myFont, false); + myFont = font; + setFont(myFont, true); +} + +/* +static void listParameters(char *ptr) { + int argc = ((unsigned char)*ptr) % 8; + std::cerr << (int)(unsigned char)*ptr << "("; + for (int i = 0; i < argc - 1; ++i) { + ++ptr; + std::cerr << (int)*ptr << ", "; + } + if (argc > 0) { + ++ptr; + std::cerr << (int)*ptr; + } + std::cerr << ")\n"; +} +*/ + +static unsigned int twoBytes(char *ptr) { + return 256 * (unsigned char)*ptr + (unsigned char)*(ptr + 1); +} + +static unsigned int fourBytes(char *ptr) { + return 65536 * twoBytes(ptr) + twoBytes(ptr + 2); +} + +static std::string fromNumber(unsigned int num) { + std::string str; + ZLStringUtil::appendNumber(str, num); + return str; +} + +void PluckerBookReader::processTextFunction(char *ptr) { + switch ((unsigned char)*ptr) { + 
case 0x08: + safeAddControl(INTERNAL_HYPERLINK, false); + break; + case 0x0A: + safeAddHyperlinkControl(fromNumber(twoBytes(ptr + 1))); + break; + case 0x0C: + { + int sectionNum = twoBytes(ptr + 1); + int paragraphNum = twoBytes(ptr + 3); + safeAddHyperlinkControl(fromNumber(sectionNum) + '#' + fromNumber(paragraphNum)); + myReferencedParagraphs.insert(std::make_pair(sectionNum, paragraphNum)); + break; + } + case 0x11: + changeFont((FontType)*(ptr + 1)); + break; + case 0x1A: + safeBeginParagraph(); + addImageReference(fromNumber(twoBytes(ptr + 1))); + break; + case 0x22: + if (!myParagraphStarted) { + if (myForcedEntry == 0) { + myForcedEntry = new ZLTextStyleEntry(ZLTextStyleEntry::STYLE_OTHER_ENTRY); + } + myForcedEntry->setLength( + ZLTextStyleEntry::LENGTH_LEFT_INDENT, + *(ptr + 1), ZLTextStyleEntry::SIZE_UNIT_PIXEL + ); + myForcedEntry->setLength( + ZLTextStyleEntry::LENGTH_RIGHT_INDENT, + *(ptr + 2), ZLTextStyleEntry::SIZE_UNIT_PIXEL + ); + } + break; + case 0x29: + if (!myParagraphStarted) { + if (myForcedEntry == 0) { + myForcedEntry = new ZLTextStyleEntry(ZLTextStyleEntry::STYLE_OTHER_ENTRY); + } + switch (*(ptr + 1)) { + case 0: myForcedEntry->setAlignmentType(ALIGN_LEFT); break; + case 1: myForcedEntry->setAlignmentType(ALIGN_RIGHT); break; + case 2: myForcedEntry->setAlignmentType(ALIGN_CENTER); break; + case 3: myForcedEntry->setAlignmentType(ALIGN_JUSTIFY); break; + } + } + break; + case 0x33: // just break line instead of horizontal rule (TODO: draw horizontal rule?) 
+ safeEndParagraph(); + break; + case 0x38: + safeEndParagraph(); + break; + case 0x40: + safeAddControl(EMPHASIS, true); + break; + case 0x48: + safeAddControl(EMPHASIS, false); + break; + case 0x53: // color setting is ignored + break; + case 0x5C: + addImageReference(fromNumber(twoBytes(ptr + 3))); + break; + case 0x60: // underlined text is ignored + break; + case 0x68: // underlined text is ignored + break; + case 0x70: // strike-through text is ignored + break; + case 0x78: // strike-through text is ignored + break; + case 0x83: + case 0x85: + { + ZLUnicodeUtil::Ucs4Char symbol = + (((unsigned char)*ptr) == 0x83) ? twoBytes(ptr + 2) : fourBytes(ptr + 2); + char utf8[6]; + int len = ZLUnicodeUtil::ucs4ToUtf8(utf8, symbol); + safeBeginParagraph(); + addData(std::string(utf8, len)); + myBufferIsEmpty = false; + myBytesToSkip = *(ptr + 1); + break; + } + case 0x8E: // custom font operations are ignored + case 0x8C: + case 0x8A: + case 0x88: + break; + case 0x90: // TODO: add table processing + case 0x92: // TODO: process table + case 0x97: // TODO: process table + break; + default: // this should be impossible + //std::cerr << "Oops... 
function #" << (int)(unsigned char)*ptr << "\n"; + break; + } +} + +void PluckerBookReader::processTextParagraph(char *start, char *end) { + changeFont(FT_REGULAR); + while (popKind()) {} + + myParagraphStarted = false; + myBytesToSkip = 0; + + char *textStart = start; + bool functionFlag = false; + for (char *ptr = start; ptr < end; ++ptr) { + if (*ptr == 0) { + functionFlag = true; + if (ptr > textStart) { + safeBeginParagraph(); + myConvertedTextBuffer.erase(); + myConverter->convert(myConvertedTextBuffer, textStart, ptr); + addData(myConvertedTextBuffer); + myBufferIsEmpty = false; + } + } else if (functionFlag) { + int paramCounter = ((unsigned char)*ptr) % 8; + if (end - ptr > paramCounter) { + processTextFunction(ptr); + ptr += paramCounter; + } else { + ptr = end - 1; + } + functionFlag = false; + if (myBytesToSkip > 0) { + ptr += myBytesToSkip; + myBytesToSkip = 0; + } + textStart = ptr + 1; + } else { + if ((unsigned char)*ptr == 0xA0) { + *ptr = 0x20; + } + if (!myParagraphStarted && textStart == ptr && std::isspace((unsigned char)*ptr)) { + ++textStart; + } + } + } + if (end > textStart) { + safeBeginParagraph(); + myConvertedTextBuffer.erase(); + myConverter->convert(myConvertedTextBuffer, textStart, end); + addData(myConvertedTextBuffer); + myBufferIsEmpty = false; + } + safeEndParagraph(); + if (myForcedEntry != 0) { + delete myForcedEntry; + myForcedEntry = 0; + } + myDelayedControls.clear(); +} + +void PluckerBookReader::processTextRecord(std::size_t size, const std::vector<int> &pars) { + char *start = myCharBuffer; + char *end = myCharBuffer; + + for (std::vector<int>::const_iterator it = pars.begin(); it != pars.end(); ++it) { + start = end; + end = start + *it; + if (end > myCharBuffer + size) { + return; + } + myParagraphStored = false; + processTextParagraph(start, end); + if (!myParagraphStored) { + myParagraphVector->push_back(-1); + } + } +} + +void PluckerBookReader::readRecord(std::size_t recordSize) { + unsigned short uid; + 
PdbUtil::readUnsignedShort(*myStream, uid); + if (uid == 1) { + PdbUtil::readUnsignedShort(*myStream, myCompressionVersion); + } else { + unsigned short paragraphs; + PdbUtil::readUnsignedShort(*myStream, paragraphs); + + unsigned short size; + PdbUtil::readUnsignedShort(*myStream, size); + + unsigned char type; + myStream->read((char*)&type, 1); + + unsigned char flags; + myStream->read((char*)&flags, 1); + + switch (type) { + case 0: // text (TODO: found sample file and test this code) + case 1: // compressed text + { + std::vector<int> pars; + for (int i = 0; i < paragraphs; ++i) { + unsigned short pSize; + PdbUtil::readUnsignedShort(*myStream, pSize); + pars.push_back(pSize); + myStream->seek(2, false); + } + + bool doProcess = false; + if (type == 0) { + doProcess = myStream->read(myCharBuffer, size) == size; + } else if (myCompressionVersion == 1) { + doProcess = + DocDecompressor().decompress(*myStream, myCharBuffer, recordSize - 8 - 4 * paragraphs, size) == size; + } else if (myCompressionVersion == 2) { + myStream->seek(2, false); + doProcess = + ZLZDecompressor(recordSize - 10 - 4 * paragraphs). 
+ decompress(*myStream, myCharBuffer, size) == size; + } + if (doProcess) { + addHyperlinkLabel(fromNumber(uid)); + myParagraphVector = &myParagraphMap[uid]; + processTextRecord(size, pars); + if ((flags & 0x1) == 0) { + insertEndOfTextParagraph(); + } + } + break; + } + case 2: // image + case 3: // compressed image + { + ZLImage *image = 0; + const ZLFile imageFile(myFile.path(), ZLMimeType::IMAGE_PALM); + if (type == 2) { + image = new ZLFileImage(imageFile, myStream->offset(), recordSize - 8); + } else if (myCompressionVersion == 1) { + image = new DocCompressedFileImage(imageFile, myStream->offset(), recordSize - 8); + } else if (myCompressionVersion == 2) { + image = new ZCompressedFileImage(imageFile, myStream->offset() + 2, recordSize - 10); + } + if (image != 0) { + addImage(fromNumber(uid), image); + } + break; + } + case 9: // category record is ignored + break; + case 10: + unsigned short typeCode; + PdbUtil::readUnsignedShort(*myStream, typeCode); + //std::cerr << "type = " << (int)type << "; "; + //std::cerr << "typeCode = " << typeCode << "\n"; + break; + case 11: // style sheet record is ignored + break; + case 12: // font page record is ignored + break; + case 13: // TODO: process tables + case 14: // TODO: process tables + break; + case 15: // multiimage + { + unsigned short columns; + unsigned short rows; + PdbUtil::readUnsignedShort(*myStream, columns); + PdbUtil::readUnsignedShort(*myStream, rows); + PluckerMultiImage *image = new PluckerMultiImage(rows, columns, model().imageMap()); + for (int i = 0; i < size / 2 - 2; ++i) { + unsigned short us; + PdbUtil::readUnsignedShort(*myStream, us); + image->addId(fromNumber(us)); + } + addImage(fromNumber(uid), image); + break; + } + default: + //std::cerr << "type = " << (int)type << "\n"; + break; + } + } +} + +bool PluckerBookReader::readDocument() { + myStream = myFile.inputStream(); + if (myStream.isNull() || !myStream->open()) { + return false; + } + + PdbHeader header; + if 
(!header.read(myStream)) { + myStream->close(); + return false; + } + + setMainTextModel(); + myFont = FT_REGULAR; + + for (std::vector<unsigned long>::const_iterator it = header.Offsets.begin(); it != header.Offsets.end(); ++it) { + std::size_t currentOffset = myStream->offset(); + if (currentOffset > *it) { + break; + } + myStream->seek(*it - currentOffset, false); + if (myStream->offset() != *it) { + break; + } + std::size_t recordSize = ((it != header.Offsets.end() - 1) ? *(it + 1) : myStream->sizeOfOpened()) - *it; + readRecord(recordSize); + } + myStream->close(); + + for (std::set<std::pair<int,int> >::const_iterator it = myReferencedParagraphs.begin(); it != myReferencedParagraphs.end(); ++it) { + std::map<int,std::vector<int> >::const_iterator jt = myParagraphMap.find(it->first); + if (jt != myParagraphMap.end()) { + for (unsigned int k = it->second; k < jt->second.size(); ++k) { + if (jt->second[k] != -1) { + addHyperlinkLabel(fromNumber(it->first) + '#' + fromNumber(it->second), jt->second[k]); + break; + } + } + } + } + myReferencedParagraphs.clear(); + myParagraphMap.clear(); + return true; +} diff --git a/reader/src/formats/pdb/PluckerBookReader.h b/reader/src/formats/pdb/PluckerBookReader.h new file mode 100644 index 0000000..1078f37 --- /dev/null +++ b/reader/src/formats/pdb/PluckerBookReader.h @@ -0,0 +1,89 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+
+#ifndef __PLUCKERBOOKREADER_H__
+#define __PLUCKERBOOKREADER_H__
+
+#include <set>
+#include <map>
+
+#include <ZLEncodingConverter.h>
+
+#include "../../bookmodel/BookReader.h"
+#include "../EncodedTextReader.h"
+/*
+ * Reads a Plucker PDB file record by record and fills the book model
+ * with text paragraphs, style controls, hyperlinks and images.
+ */
+class PluckerBookReader : public BookReader, public EncodedTextReader {
+
+public:
+	PluckerBookReader(BookModel &model);
+	~PluckerBookReader();
+
+	bool readDocument(); // false if the file cannot be opened or the PDB header cannot be read
+
+private:
+	enum FontType { // numeric codes taken from the parameter byte of text function 0x11
+		FT_REGULAR = 0,
+		FT_H1 = 1,
+		FT_H2 = 2,
+		FT_H3 = 3,
+		FT_H4 = 4,
+		FT_H5 = 5,
+		FT_H6 = 6,
+		FT_BOLD = 7,
+		FT_TT = 8,
+		FT_SMALL = 9,
+		FT_SUB = 10,
+		FT_SUP = 11
+	};
+
+	void readRecord(std::size_t recordSize); // reads one record at the current position of myStream
+	void processTextRecord(std::size_t size, const std::vector<int> &pars); // splits myCharBuffer into paragraphs of the given lengths
+	void processTextParagraph(char *start, char *end); // parses text runs and inline functions in [start, end)
+	void processTextFunction(char *ptr); // ptr points at the function code byte
+	void setFont(FontType font, bool start);
+	void changeFont(FontType font); // closes the current font, then opens the new one
+
+	void safeAddControl(FBTextKind kind, bool start); // queued until a paragraph is open
+	void safeAddHyperlinkControl(const std::string &id); // queued until a paragraph is open
+	void safeBeginParagraph(); // opens a paragraph if needed and replays queued controls
+	void safeEndParagraph(); // closes the open paragraph, if any
+
+	void processHeader(FontType font, bool start);
+
+private:
+	const ZLFile myFile;
+	shared_ptr<ZLInputStream> myStream;
+	FontType myFont; // font currently in effect
+	char *myCharBuffer; // 65535-byte buffer holding the text of the current record
+	std::string myConvertedTextBuffer; // scratch output of the encoding converter
+	bool myParagraphStarted; // true while a model paragraph is open
+	bool myBufferIsEmpty; // true until data is added to the open paragraph
+	ZLTextStyleEntry *myForcedEntry; // margins/alignment set before the paragraph starts; 0 if none
+	std::vector<std::pair<FBTextKind,bool> > myDelayedControls;
+	std::vector<std::string> myDelayedHyperlinks;
+	unsigned short myCompressionVersion; // read from the record with uid 1; 1 selects DocDecompressor, 2 selects ZLZDecompressor
+	unsigned char myBytesToSkip; // set by functions 0x83/0x85; bytes to skip after the function
+
+	std::set<std::pair<int, int> > myReferencedParagraphs; // (record uid, paragraph number) link targets
+	std::map<int, std::vector<int> > myParagraphMap; // record uid -> model paragraph index per Plucker paragraph, -1 if none
+	std::vector<int> *myParagraphVector; // entry of myParagraphMap for the record being read
+	bool myParagraphStored; // whether the current Plucker paragraph already has an entry
+};
+
+#endif /* __PLUCKERBOOKREADER_H__ */
diff --git a/reader/src/formats/pdb/PluckerImages.cpp b/reader/src/formats/pdb/PluckerImages.cpp
new file mode 100644
index 0000000..db291ab
--- /dev/null
+++ b/reader/src/formats/pdb/PluckerImages.cpp
@@ -0,0 +1,80 @@
+/*
+ * Copyright (C) 2004-2012 Geometer Plus <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */ + +#include <ZLFile.h> +#include <ZLInputStream.h> +#include <ZLZDecompressor.h> +#include <ZLStringUtil.h> + +#include "PluckerImages.h" +#include "DocDecompressor.h" + +const shared_ptr<std::string> ZCompressedFileImage::stringData() const { + shared_ptr<ZLInputStream> stream = myFile.inputStream(); + + shared_ptr<std::string> imageData = new std::string(); + + if (!stream.isNull() && stream->open()) { + stream->seek(myOffset, false); + ZLZDecompressor decompressor(myCompressedSize); + + static const std::size_t charBufferSize = 2048; + char *charBuffer = new char[charBufferSize]; + std::vector<std::string> buffer; + + std::size_t s; + do { + s = decompressor.decompress(*stream, charBuffer, charBufferSize); + if (s != 0) { + buffer.push_back(std::string()); + buffer.back().append(charBuffer, s); + } + } while (s == charBufferSize); + ZLStringUtil::append(*imageData, buffer); + + delete[] charBuffer; + } + + return imageData; +} + +const shared_ptr<std::string> DocCompressedFileImage::stringData() const { + shared_ptr<ZLInputStream> stream = myFile.inputStream(); + + shared_ptr<std::string> imageData = new std::string(); + + if (!stream.isNull() && stream->open()) { + stream->seek(myOffset, false); + char *buffer = new char[65535]; + std::size_t uncompressedSize = DocDecompressor().decompress(*stream, buffer, myCompressedSize, 65535); + imageData->append(buffer, uncompressedSize); + delete[] buffer; + } + + return imageData; +} + +shared_ptr<const ZLImage> PluckerMultiImage::subImage(unsigned int row, unsigned int column) const { + unsigned int index = row * myColumns + column; + if (index >= myIds.size()) { + return 0; + } + ZLImageMap::const_iterator entry = myImageMap.find(myIds[index]); + return (entry != myImageMap.end()) ? 
entry->second : 0; +} diff --git a/reader/src/formats/pdb/PluckerImages.h b/reader/src/formats/pdb/PluckerImages.h new file mode 100644 index 0000000..3269a29 --- /dev/null +++ b/reader/src/formats/pdb/PluckerImages.h @@ -0,0 +1,79 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ */ + +#ifndef __PLUCKERIMAGES_H__ +#define __PLUCKERIMAGES_H__ + +#include <string> + +#include <ZLImage.h> +#include <ZLFile.h> +#include "../../bookmodel/BookModel.h" + +class ZCompressedFileImage : public ZLSingleImage { + +public: + ZCompressedFileImage(const ZLFile &file, std::size_t offset, std::size_t size); + const shared_ptr<std::string> stringData() const; + +private: + const ZLFile myFile; + const std::size_t myOffset; + const std::size_t myCompressedSize; +}; + +class DocCompressedFileImage : public ZLSingleImage { + +public: + DocCompressedFileImage(const ZLFile &file, std::size_t offset, std::size_t compressedSize); + const shared_ptr<std::string> stringData() const; + +private: + const ZLFile myFile; + const std::size_t myOffset; + const std::size_t myCompressedSize; +}; + +class PluckerMultiImage : public ZLMultiImage { + +public: + PluckerMultiImage(unsigned int rows, unsigned int columns, const ZLImageMap &imageMap); + + void addId(const std::string &id); + + unsigned int rows() const; + unsigned int columns() const; + shared_ptr<const ZLImage> subImage(unsigned int row, unsigned int column) const; + +private: + unsigned int myRows, myColumns; + const ZLImageMap &myImageMap; + std::vector<std::string> myIds; +}; + +inline ZCompressedFileImage::ZCompressedFileImage(const ZLFile &file, std::size_t offset, std::size_t compressedSize) : ZLSingleImage(file.mimeType()), myFile(file), myOffset(offset), myCompressedSize(compressedSize) {} + +inline DocCompressedFileImage::DocCompressedFileImage(const ZLFile &file, std::size_t offset, std::size_t compressedSize) : ZLSingleImage(file.mimeType()), myFile(file), myOffset(offset), myCompressedSize(compressedSize) {} + +inline PluckerMultiImage::PluckerMultiImage(unsigned int rows, unsigned int columns, const ZLImageMap &imageMap) : myRows(rows), myColumns(columns), myImageMap(imageMap) {} +inline void PluckerMultiImage::addId(const std::string &id) { myIds.push_back(id); } +inline unsigned int 
PluckerMultiImage::rows() const { return myRows; } +inline unsigned int PluckerMultiImage::columns() const { return myColumns; } + +#endif /* __PLUCKERIMAGES_H__ */ diff --git a/reader/src/formats/pdb/PluckerPlugin.cpp b/reader/src/formats/pdb/PluckerPlugin.cpp new file mode 100644 index 0000000..1ec89ba --- /dev/null +++ b/reader/src/formats/pdb/PluckerPlugin.cpp @@ -0,0 +1,48 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ */ + +#include <ZLFile.h> + +#include "PdbPlugin.h" +#include "PluckerBookReader.h" +#include "PluckerTextStream.h" + +#include "../../library/Book.h" + +bool PluckerPlugin::providesMetaInfo() const { + return false; +} + +bool PluckerPlugin::acceptsFile(const ZLFile &file) const { + return PdbPlugin::fileType(file) == "DataPlkr"; +} + +bool PluckerPlugin::readMetaInfo(Book &book) const { + shared_ptr<ZLInputStream> stream = new PluckerTextStream(book.file()); + detectEncodingAndLanguage(book, *stream); + if (book.encoding().empty()) { + return false; + } + + return true; +} + +bool PluckerPlugin::readModel(BookModel &model) const { + return PluckerBookReader(model).readDocument(); +} diff --git a/reader/src/formats/pdb/PluckerTextStream.cpp b/reader/src/formats/pdb/PluckerTextStream.cpp new file mode 100644 index 0000000..01291eb --- /dev/null +++ b/reader/src/formats/pdb/PluckerTextStream.cpp @@ -0,0 +1,159 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ */ + +#include <cstring> + +#include <ZLFile.h> +#include <ZLZDecompressor.h> + +#include "PluckerTextStream.h" +#include "PdbReader.h" +#include "DocDecompressor.h" + +PluckerTextStream::PluckerTextStream(const ZLFile &file) : PdbStream(file) { + myFullBuffer = 0; +} + +PluckerTextStream::~PluckerTextStream() { + close(); +} + +bool PluckerTextStream::open() { + if (!PdbStream::open()) { + return false; + } + + PdbUtil::readUnsignedShort(*myBase, myCompressionVersion); + + myBuffer = new char[65536]; + myFullBuffer = new char[65536]; + + myRecordIndex = 0; + + return true; +} + +bool PluckerTextStream::fillBuffer() { + while (myBufferOffset == myBufferLength) { + if (myRecordIndex + 1 > header().Offsets.size() - 1) { + return false; + } + ++myRecordIndex; + const std::size_t currentOffset = recordOffset(myRecordIndex); + if (currentOffset < myBase->offset()) { + return false; + } + myBase->seek(currentOffset, true); + const std::size_t nextOffset = recordOffset(myRecordIndex + 1); + if (nextOffset < currentOffset) { + return false; + } + processRecord(nextOffset - currentOffset); + } + return true; +} + +void PluckerTextStream::close() { + if (myFullBuffer != 0) { + delete[] myFullBuffer; + myFullBuffer = 0; + } + PdbStream::close(); +} + +void PluckerTextStream::processRecord(std::size_t recordSize) { + myBase->seek(2, false); + + unsigned short paragraphs; + PdbUtil::readUnsignedShort(*myBase, paragraphs); + + unsigned short size; + PdbUtil::readUnsignedShort(*myBase, size); + + unsigned char type; + myBase->read((char*)&type, 1); + if (type > 1) { // this record is not text record + return; + } + + myBase->seek(1, false); + + std::vector<int> pars; + for (int i = 0; i < paragraphs; ++i) { + unsigned short pSize; + PdbUtil::readUnsignedShort(*myBase, pSize); + pars.push_back(pSize); + myBase->seek(2, false); + } + + bool doProcess = false; + if (type == 0) { + doProcess = myBase->read(myFullBuffer, size) == size; + } else if (myCompressionVersion == 1) { + 
doProcess = + DocDecompressor().decompress(*myBase, myFullBuffer, recordSize - 8 - 4 * paragraphs, size) == size; + } else if (myCompressionVersion == 2) { + myBase->seek(2, false); + doProcess = + ZLZDecompressor(recordSize - 10 - 4 * paragraphs).decompress(*myBase, myFullBuffer, size) == size; + } + if (doProcess) { + myBufferLength = 0; + myBufferOffset = 0; + + char *start = myFullBuffer; + char *end = myFullBuffer; + + for (std::vector<int>::const_iterator it = pars.begin(); it != pars.end(); ++it) { + start = end; + end = start + *it; + if (end > myFullBuffer + size) { + break; + } + processTextParagraph(start, end); + } + } +} + +void PluckerTextStream::processTextParagraph(char *start, char *end) { + char *textStart = start; + bool functionFlag = false; + for (char *ptr = start; ptr < end; ++ptr) { + if (*ptr == 0) { + functionFlag = true; + if (ptr != textStart) { + std::memcpy(myBuffer + myBufferLength, textStart, ptr - textStart); + myBufferLength += ptr - textStart; + } + } else if (functionFlag) { + int paramCounter = ((unsigned char)*ptr) % 8; + if (end - ptr > paramCounter + 1) { + ptr += paramCounter; + } else { + ptr = end - 1; + } + functionFlag = false; + textStart = ptr + 1; + } + } + if (end != textStart) { + std::memcpy(myBuffer + myBufferLength, textStart, end - textStart); + myBufferLength += end - textStart; + } +} diff --git a/reader/src/formats/pdb/PluckerTextStream.h b/reader/src/formats/pdb/PluckerTextStream.h new file mode 100644 index 0000000..70c1182 --- /dev/null +++ b/reader/src/formats/pdb/PluckerTextStream.h @@ -0,0 +1,48 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+
+#ifndef __PLUCKERTEXTSTREAM_H__
+#define __PLUCKERTEXTSTREAM_H__
+
+#include "PdbStream.h"
+
+class ZLFile;
+/*
+ * PdbStream that yields the plain text of a Plucker document: text
+ * records are decoded and their inline functions are stripped.
+ */
+class PluckerTextStream : public PdbStream {
+
+public:
+	PluckerTextStream(const ZLFile &file);
+	~PluckerTextStream(); // calls close()
+	bool open(); // opens the PDB container, reads the compression version, allocates both buffers
+	void close(); // frees myFullBuffer, then PdbStream::close()
+
+private:
+	bool fillBuffer(); // advances through records until myBuffer holds unread text
+
+private:
+	void processRecord(std::size_t recordSize); // decodes one record; only types 0 and 1 produce text
+	void processTextParagraph(char *start, char *end); // appends the text runs of [start, end) to myBuffer
+
+private:
+	unsigned short myCompressionVersion; // 1 selects DocDecompressor, 2 selects ZLZDecompressor
+	char *myFullBuffer; // 65536-byte buffer for the decoded record
+	std::size_t myRecordIndex; // index of the record processed last
+};
+
+#endif /* __PLUCKERTEXTSTREAM_H__ */
diff --git a/reader/src/formats/pdb/PmlBookReader.cpp b/reader/src/formats/pdb/PmlBookReader.cpp
new file mode 100644
index 0000000..e365983
--- /dev/null
+++ b/reader/src/formats/pdb/PmlBookReader.cpp
@@ -0,0 +1,227 @@
+/*
+ * Copyright (C) 2004-2012 Geometer Plus <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#include <ZLTextParagraph.h> +#include <ZLUnicodeUtil.h> +#include <ZLStringUtil.h> +#include <ZLTextStyleEntry.h> + +#include "PmlBookReader.h" +#include "../../bookmodel/BookModel.h" + +PmlBookReader::PmlBookReader(BookReader &bookReader, const PlainTextFormat&, const std::string &encoding) : PmlReader(encoding), myBookReader(bookReader) { +} + +PmlBookReader::~PmlBookReader() { +} + +bool PmlBookReader::readDocument(ZLInputStream& stream) { + myBookReader.pushKind(REGULAR); + myBookReader.beginParagraph(); + myParagraphIsEmpty = true; + bool code = PmlReader::readDocument(stream); + myBookReader.endParagraph(); + return code; +} + +void PmlBookReader::addCharData(const char *data, std::size_t len, bool convert) { + if (!myBookReader.paragraphIsOpen()) { + myBookReader.beginParagraph(); + } + static std::string newString; + if (len != 0) { + if (!myConverter.isNull() && convert) { + myConverter->convert(newString, data, data + len); + } else { + newString.append(data, len); + } + if (myState.SmallCaps) { + myBookReader.addData(ZLUnicodeUtil::toUpper(newString)); + } else { + myBookReader.addData(newString); + } + newString.erase(); + if (myParagraphIsEmpty) { + myParagraphIsEmpty = false; + } + } +} + +void PmlBookReader::switchFontProperty(FontProperty property) { + if (!myBookReader.paragraphIsOpen()) { + myBookReader.beginParagraph(); + } + switch (property) { + case FONT_BOLD: + if (myState.Bold) { + myBookReader.pushKind(STRONG); + } else { + myBookReader.popKind(); + } + myBookReader.addControl(STRONG, myState.Bold); + break; + case FONT_ITALIC: + if (myState.Italic) { + if (!myState.Bold) { + myBookReader.pushKind(EMPHASIS); + myBookReader.addControl(EMPHASIS, true); + } else { + myBookReader.popKind(); + 
myBookReader.addControl(STRONG, false); + + myBookReader.pushKind(EMPHASIS); + myBookReader.addControl(EMPHASIS, true); + myBookReader.pushKind(STRONG); + myBookReader.addControl(STRONG, true); + } + } else { + if (!myState.Bold) { + myBookReader.addControl(EMPHASIS, false); + myBookReader.popKind(); + } else { + myBookReader.addControl(STRONG, false); + myBookReader.popKind(); + myBookReader.addControl(EMPHASIS, false); + myBookReader.popKind(); + + myBookReader.pushKind(STRONG); + myBookReader.addControl(STRONG, true); + } + } + break; + case FONT_UNDERLINED: + break; + case FONT_SUBSCRIPT: //don't have to be mixed with other style tags + if (myState.Subscript) { + myBookReader.pushKind(SUB); + } else { + myBookReader.popKind(); + } + myBookReader.addControl(SUB, myState.Subscript); + break; + case FONT_SUPERSCRIPT: //Should not be mixed with other style tags + if (myState.Superscript) { + myBookReader.pushKind(SUP); + } else { + myBookReader.popKind(); + } + myBookReader.addControl(SUP, myState.Superscript); + break; + } +} + +void PmlBookReader::newLine() { + if (myBookReader.paragraphIsOpen()) { + myBookReader.endParagraph(); + } + if (myParagraphIsEmpty) { + myBookReader.beginParagraph(ZLTextParagraph::EMPTY_LINE_PARAGRAPH); + myBookReader.endParagraph(); + } else { + myParagraphIsEmpty = true; + } + newParagraph(); +} + +void PmlBookReader::newPage() { + if (myBookReader.paragraphIsOpen()) { + myBookReader.endParagraph(); + } + //newLine(); + newParagraph(); +} + +void PmlBookReader::newParagraph() { + if (myBookReader.paragraphIsOpen()) { + myBookReader.endParagraph(); + } + myBookReader.beginParagraph(); + if (myState.Alignment != ALIGN_UNDEFINED) { + setAlignment(); + } + if (myState.FontSize != NORMAL) { + setFontSize(); + } + if (myState.IndentBlockOn && (myState.Indent != 0)) { + setIndent(); + } +} + +void PmlBookReader::setAlignment() { + ZLTextStyleEntry entry(ZLTextStyleEntry::STYLE_OTHER_ENTRY); + entry.setAlignmentType(myState.Alignment); + 
myBookReader.addStyleEntry(entry); +} + +void PmlBookReader::setIndent() { + ZLTextStyleEntry entry(ZLTextStyleEntry::STYLE_OTHER_ENTRY); + entry.setLength(ZLTextStyleEntry::LENGTH_FIRST_LINE_INDENT_DELTA, 0, ZLTextStyleEntry::SIZE_UNIT_PERCENT); + entry.setLength(ZLTextStyleEntry::LENGTH_LEFT_INDENT, (short)myState.Indent, ZLTextStyleEntry::SIZE_UNIT_PERCENT); + myBookReader.addStyleEntry(entry); +} + +void PmlBookReader::setFontSize() { + if (!myBookReader.paragraphIsOpen()) { + myBookReader.beginParagraph(); + } + ZLTextStyleEntry entry(ZLTextStyleEntry::STYLE_OTHER_ENTRY); + switch(myState.FontSize) { + case SMALLER: + entry.setFontModifier(ZLTextStyleEntry::FONT_MODIFIER_SMALLER, true); + break; + case LARGER: + entry.setFontModifier(ZLTextStyleEntry::FONT_MODIFIER_LARGER, true); + break; + default: + break; + } + myBookReader.addStyleEntry(entry); +} + +void PmlBookReader::addLink(FBTextKind kind, const std::string &id, bool on) { + switch (kind) { + case INTERNAL_HYPERLINK: + case FOOTNOTE: + //case EXTERNAL_HYPERLINK: + //case BOOK_HYPERLINK: + if (on) { + myBookReader.addHyperlinkControl(kind, id); + } else { + myBookReader.addControl(kind, false); + } + break; + default: + break; + } +} + +void PmlBookReader::addLinkLabel(const std::string &label) { + myBookReader.addHyperlinkLabel(label); +} + +void PmlBookReader::addImageReference(const std::string &id) { + const bool stopParagraph = myBookReader.paragraphIsOpen(); + if (stopParagraph) { + myBookReader.endParagraph(); + } + myBookReader.addImageReference(id); + if (stopParagraph) { + myBookReader.beginParagraph(); + } +} diff --git a/reader/src/formats/pdb/PmlBookReader.h b/reader/src/formats/pdb/PmlBookReader.h new file mode 100644 index 0000000..22944b4 --- /dev/null +++ b/reader/src/formats/pdb/PmlBookReader.h @@ -0,0 +1,73 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU 
// PmlReader implementation that forwards the parsed PML events (text, font
// switches, links, images, paragraph breaks) to a BookReader.
class PmlBookReader : public PmlReader {

public:
	// `format` is accepted but not used by the constructor.
	PmlBookReader(BookReader &bookReader, const PlainTextFormat &format, const std::string &encoding);
	~PmlBookReader();

	// Wraps PmlReader::readDocument() in a REGULAR-kind paragraph and returns
	// its result.
	bool readDocument(ZLInputStream &stream);

protected:
	void addCharData(const char *data, std::size_t len, bool convert);
	void addLink(FBTextKind kind, const std::string &id, bool on);
	void addLinkLabel(const std::string &label);
	void addImageReference(const std::string &id);
	void switchFontProperty(FontProperty property);
	void setFontSize();
	void newLine();
	void newPage();
	void newParagraph();

private:
	// Add a style entry built from myState.Alignment / myState.Indent.
	void setAlignment();
	void setIndent();

private:
	BookReader& myBookReader;
	// True until addCharData() writes text; set back to true by newLine(),
	// which uses it to decide whether to emit an empty-line paragraph.
	bool myParagraphIsEmpty;

	// Disabled members, kept as found.
	/*FontType myFont;
	char *myCharBuffer;
	std::string myConvertedTextBuffer;
	bool myParagraphStarted;
	bool myBufferIsEmpty;
	ZLTextStyleEntry *myForcedEntry;
	std::vector<std::pair<FBTextKind,bool> > myDelayedControls;
	std::vector<std::string> myDelayedHyperlinks;
	unsigned short myCompressionVersion;
	unsigned char myBytesToSkip;

	std::set<std::pair<int, int> > myReferencedParagraphs;
	std::map<int, std::vector<int> > myParagraphMap;
	std::vector<int> *myParagraphVector;
	bool myParagraphStored;*/
};
+ */ + +/* + * Information about Palm Markup Language was taken from: + * http://www.m.ereader.com/ereader/help/dropbook/pml.htm + * http://ccit205.wikispaces.com/Palm+Markup+Language+(PML) + */ + +#include <cstdlib> +#include <cctype> + +#include <ZLFile.h> +#include <ZLInputStream.h> + +#include "PmlReader.h" + +static const int pmlStreamBufferSize = 4096; + +const std::string PmlReader::ourDefaultParameter = ""; + +PmlReader::PmlReader(const std::string &encoding) : EncodedTextReader(encoding) { +} + +PmlReader::~PmlReader() { +} + +bool PmlReader::readDocument(ZLInputStream& stream) { + myStreamBuffer = new char[pmlStreamBufferSize]; + + myIsInterrupted = false; + + myState.Italic = false; + myState.Bold = false; + myState.Underlined = false; + myState.SmallCaps = false; + myState.Subscript = false; + myState.Superscript = false; + myState.Alignment = ALIGN_UNDEFINED; + myState.FontSize = NORMAL; + myState.Indent = 0; + myState.IndentBlockOn = false; + myState.BoldBlockOn = false; + myState.FootnoteLinkOn = false; + myState.InternalLinkOn = false; + myState.InvisibleText = false; + + bool code = parseDocument(stream); + + delete[] myStreamBuffer; + + return code; +} + +bool PmlReader::parseDocument(ZLInputStream &stream) { + enum { + READ_NORMAL_DATA, + READ_TAG, + READ_TAG_PARAMETER, + } parserState = READ_NORMAL_DATA; + + std::size_t tagNameLength = 0; + std::string tagName; + std::string parameterString; + + bool startParameterReading = false; + std::size_t tagCounter = 0; + static bool FLAG = true; + + while (!myIsInterrupted) { + const char *ptr = myStreamBuffer; + const char *end = myStreamBuffer + stream.read(myStreamBuffer, pmlStreamBufferSize); + if (ptr == end) { + break; + } + const char *dataStart = ptr; + bool readNextChar = true; + while (ptr != end) { + switch (parserState) { + case READ_NORMAL_DATA: + if (*ptr == '\n') { + if (ptr > dataStart) { + processCharData(dataStart, ptr - dataStart); + } + newLine(); + FLAG = true; + dataStart = ptr + 1; 
+ } else if (FLAG && std::isspace(*ptr)) { + } else { + FLAG = false; + if (*ptr == '\\') { + if (ptr > dataStart) { + processCharData(dataStart, ptr - dataStart); + } + dataStart = ptr + 1; + tagName.erase(); + parserState = READ_TAG; + } + } + break; + case READ_TAG: + if ((ptr == dataStart) && (tagName.empty())) { + if (*ptr == '\\') { + processCharData(ptr, 1); + dataStart = ptr + 1; + parserState = READ_NORMAL_DATA; + } else { + tagNameLength = findTagLength(ptr); + if (tagNameLength == 0) { + dataStart = ptr + 1; + parserState = READ_NORMAL_DATA; + ++tagCounter; + } else { + --tagNameLength; + } + } + } else { + if (tagNameLength == 0) { + tagName.append(dataStart, ptr - dataStart); + if (*ptr == '=') { + dataStart = ptr + 1; + parameterString.erase(); + parserState = READ_TAG_PARAMETER; + ++tagCounter; + } else { + readNextChar = false; + processTag(tagName); + dataStart = ptr; + parserState = READ_NORMAL_DATA; + ++tagCounter; + } + } else { + --tagNameLength; + } + } + break; + case READ_TAG_PARAMETER: + if (*ptr == '"') { + if (!startParameterReading) { + startParameterReading = true; + dataStart = ptr + 1; + } else { + parameterString.append(dataStart, ptr - dataStart); + processTag(tagName, parameterString); + parserState = READ_NORMAL_DATA; + dataStart = ptr + 1; + startParameterReading = false; + } + } + break; + } + if (readNextChar) { + ++ptr; + } else { + readNextChar = true; + } + } + if (dataStart < end) { + switch (parserState) { + case READ_NORMAL_DATA: + processCharData(dataStart, end - dataStart); + case READ_TAG: + tagName.append(dataStart, end - dataStart); + break; + case READ_TAG_PARAMETER: + parameterString.append(dataStart, end - dataStart); + break; + default: + break; + } + } + } + return myIsInterrupted; +} + +std::size_t PmlReader::findTagLength(const char* ptr) { + switch(*ptr) { // tag action description | close | support | + case 'p': // new page | - | + | + case 'x': // new chapter and new page | + | + | + case 'c': // center 
alignment block | + | + | + case 'r': // right alignment block | + | + | + case 'i': // italize block | + | + | + case 'u': // underlined block | + | + | + case 'o': // overstrike block | + | - | + case 'v': // invisible text block | + | + | + case 't': // indent block | + | + | + case 'T': // indent with value | - | + | + case 'w': // embed text width rule | - | - | + case 'n': // switch to normal font | - | + | + case 's': // switch to std font |+ or \n| + | + case 'b': // switch to bold font (deprecated) |+ or \n| - | + case 'l': // switch to large font |+ or \n| + | + case 'B': // mark text as bold | + | + | + case 'k': // smaller font size and uppercase | + | + | + case 'm': // insert named image | - | + | + case 'q': // reference to another spot | + | + | + case 'Q': // link anchor for \q reference | - | + | + case '-': // soft hyphen | - | - | + case 'I': // reference index item | - | - | + return 1; + case 'X': // XN - new chapter, n indent level | + | - | + case 'S': // Sp - mark text as superscript | + | + | + // Sb - mark text as subscript | + | + | + // Sd - link to a sidebar | + | - | + case 'C': // CN - chapter title + indent level| - | - | + case 'F': // Fn - link to a footnote | + | + | + return 2; + default: + return 0; + } +} + + +void PmlReader::interrupt() { + myIsInterrupted = true; +} + + +void PmlReader::processTag(std::string &tagName, const std::string ¶meter) { + const char tagDeterminant = *tagName.data(); + switch (tagDeterminant) { + case 'p': + newPage(); + break; + case 'x': + //TODO add close tag processing + newPage(); + break; + case 'B': + if (!myState.BoldBlockOn) { + processFontProperty(FONT_BOLD); + } + break; + case 'i': + processFontProperty(FONT_ITALIC); + break; + case 'u': + processFontProperty(FONT_UNDERLINED); + break; + case 'v': + myState.InvisibleText = !myState.InvisibleText;; + break; + case 'c': + processAlignment(ALIGN_CENTER); + break; + case 'r': + processAlignment(ALIGN_RIGHT); + break; + case 'n': + 
processFontSize(NORMAL); + break; + case 'b': + myState.BoldBlockOn = !myState.BoldBlockOn; + processFontProperty(FONT_BOLD); + break; + case 's': + processFontSize(SMALLER); + break; + case 'l': + processFontSize(LARGER); + break; + case 'k': + myState.SmallCaps = !myState.SmallCaps; + processFontSize(SMALLER); + break; + case 'S': + if (tagName == "Sb") { + processFontProperty(FONT_SUBSCRIPT); + } else if (tagName == "Sp") { + processFontProperty(FONT_SUPERSCRIPT); + } else if (tagName == "Sd") { + //processSidebarLink(); + } + break; + case 't': + processIndent(); + break; + case 'T': + processIndent(parameter); + myState.IndentBlockOn = false; + break; + case 'w': + //addHorizontalRule(parameter); + break; + case 'F': + processLink(FOOTNOTE, parameter); + break; + case 'q': + processLink(INTERNAL_HYPERLINK, parameter); + break; + case 'Q': + addLinkLabel(parameter); + break; + case 'm': + addImageReference(parameter); + break; + default: + //std::cerr << "PmlReader: unsupported tag: name: " << tagName << " parameter: " << parameter << "\n"; + break; + } +} + +void PmlReader::processCharData(const char* data, std::size_t len, bool convert) { + if(!myState.InvisibleText) { + addCharData(data, len, convert); + } +} + +void PmlReader::processFontProperty(PmlReader::FontProperty property) { + switch (property) { + case FONT_BOLD: + myState.Bold = !myState.Bold; + switchFontProperty(FONT_BOLD); + break; + case FONT_ITALIC: + myState.Italic = !myState.Italic; + switchFontProperty(FONT_ITALIC); + break; + case FONT_UNDERLINED: + myState.Underlined = !myState.Underlined; + switchFontProperty(FONT_UNDERLINED); + break; + case FONT_SUBSCRIPT: + myState.Subscript = !myState.Subscript; + switchFontProperty(FONT_SUBSCRIPT); + break; + case FONT_SUPERSCRIPT: + myState.Superscript = !myState.Superscript; + switchFontProperty(FONT_SUPERSCRIPT); + break; + } +} + +void PmlReader::processAlignment(ZLTextAlignmentType alignment) { + if (myState.Alignment != alignment) { + 
myState.Alignment = alignment; + } else { + myState.Alignment = ALIGN_UNDEFINED; + } + newParagraph(); +} + +void PmlReader::processFontSize(FontSizeType sizeType) { + if (myState.FontSize != sizeType) { + myState.FontSize = sizeType; + } else { + myState.FontSize = NORMAL; + } + setFontSize(); +} + +void PmlReader::processIndent(const std::string& parameter) { + int indentPercentSize = 5; + if (!parameter.empty()) { + const int index = parameter.find('%'); + if (index != -1) { + const std::string indentValueStr = parameter.substr(0, index); + indentPercentSize = std::atoi(indentValueStr.data()); + } else { + indentPercentSize = 5; + } + } + if (!myState.IndentBlockOn) { + myState.Indent = indentPercentSize; + } else { + myState.Indent = 0; + } + myState.IndentBlockOn = !myState.IndentBlockOn; + newParagraph(); +} + +void PmlReader::processLink(FBTextKind kind, const std::string ¶meter) { + switch(kind) { + case FOOTNOTE: + myState.FootnoteLinkOn = !myState.FootnoteLinkOn; + addLink(FOOTNOTE, parameter, myState.FootnoteLinkOn); + break; + case INTERNAL_HYPERLINK: + myState.InternalLinkOn = !myState.InternalLinkOn; + if (parameter.size() > 1) { + // '#' character has to stand before link label , so we should omit '#' for getting label + addLink(INTERNAL_HYPERLINK, parameter.substr(1), myState.InternalLinkOn); + } else { + // In case trailing or corrupted tag we use parameter entirely + addLink(INTERNAL_HYPERLINK, parameter, myState.InternalLinkOn); + } + break; + default: + break; + } +} diff --git a/reader/src/formats/pdb/PmlReader.h b/reader/src/formats/pdb/PmlReader.h new file mode 100644 index 0000000..496c8d9 --- /dev/null +++ b/reader/src/formats/pdb/PmlReader.h @@ -0,0 +1,117 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or 
+ * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +/* + * Information about Palm Markup Language was taken from next sources: + * http://www.m.ereader.com/ereader/help/dropbook/pml.htm + * http://ccit205.wikispaces.com/Palm+Markup+Language+(PML) + */ + +#ifndef __PMLREADER_H__ +#define __PMLREADER_H__ + +#include <string> + +#include <ZLEncodingConverter.h> +#include <ZLTextAlignmentType.h> + +#include "../EncodedTextReader.h" +#include "../../bookmodel/FBTextKind.h" + +class ZLInputStream; + +class PmlReader : public EncodedTextReader { + +public: + virtual bool readDocument(ZLInputStream &stream); + +protected: + PmlReader(const std::string &encoding); + virtual ~PmlReader(); + +protected: + enum FontProperty { + FONT_BOLD, + FONT_ITALIC, + FONT_UNDERLINED, + FONT_SUBSCRIPT, + FONT_SUPERSCRIPT + }; + + enum FontSizeType { + NORMAL, + SMALLER, + LARGER + }; + + + virtual void addCharData(const char *data, std::size_t len, bool convert) = 0; + virtual void addLink(FBTextKind kind, const std::string &id, bool on) = 0; + virtual void addLinkLabel(const std::string &label) = 0; + virtual void addImageReference(const std::string &id) = 0; + virtual void setFontSize() = 0; + virtual void switchFontProperty(FontProperty property) = 0; + virtual void newLine() = 0; + virtual void newPage() = 0; + virtual void newParagraph() = 0; + + void interrupt(); + +private: + bool parseDocument(ZLInputStream &stream); + void processTag(std::string &tagName, const std::string ¶meter = ourDefaultParameter); + void 
processCharData(const char* data, std::size_t len, bool convert = true); + void processFontProperty(FontProperty property); + void processAlignment(ZLTextAlignmentType alignment); + void processFontSize(FontSizeType sizeType); + void processIndent(const std::string ¶meter =ourDefaultParameter); + void processLink(FBTextKind kind, const std::string ¶meter); + + static std::size_t findTagLength(const char* ptr); + +protected: + struct PmlReaderState { + bool Bold; + bool Italic; + bool Underlined; + bool SmallCaps; + bool Subscript; + bool Superscript; + + ZLTextAlignmentType Alignment; + FontSizeType FontSize; + + unsigned short Indent; + bool IndentBlockOn; + bool BoldBlockOn; + bool FootnoteLinkOn; + bool InternalLinkOn; + bool InvisibleText; + }; + + PmlReaderState myState; + +private: + char* myStreamBuffer; + + bool myIsInterrupted; + const static std::string ourDefaultParameter; +}; + +#endif /* __PMLREADER_H__ */ diff --git a/reader/src/formats/pdb/SimplePdbPlugin.cpp b/reader/src/formats/pdb/SimplePdbPlugin.cpp new file mode 100644 index 0000000..f4b5c30 --- /dev/null +++ b/reader/src/formats/pdb/SimplePdbPlugin.cpp @@ -0,0 +1,75 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ */ + +#include <ZLFile.h> +#include <ZLInputStream.h> + +#include "PdbPlugin.h" +#include "../txt/TxtBookReader.h" +#include "../html/HtmlBookReader.h" +#include "HtmlMetainfoReader.h" +#include "../util/TextFormatDetector.h" + +#include "../../bookmodel/BookModel.h" +#include "../../library/Book.h" + +bool SimplePdbPlugin::readMetaInfo(Book &book) const { + const ZLFile &file = book.file(); + shared_ptr<ZLInputStream> stream = createStream(file); + detectEncodingAndLanguage(book, *stream); + if (book.encoding().empty()) { + return false; + } + int readType = HtmlMetainfoReader::NONE; + if (book.title().empty()) { + readType |= HtmlMetainfoReader::TITLE; + } + if (book.authors().empty()) { + readType |= HtmlMetainfoReader::AUTHOR; + } + if ((readType != HtmlMetainfoReader::NONE) && TextFormatDetector().isHtml(*stream)) { + readType |= HtmlMetainfoReader::TAGS; + HtmlMetainfoReader metainfoReader(book, (HtmlMetainfoReader::ReadType)readType); + metainfoReader.readDocument(*stream); + } + + return true; +} + +bool SimplePdbPlugin::readModel(BookModel &model) const { + const Book &book = *model.book(); + const ZLFile &file = book.file(); + shared_ptr<ZLInputStream> stream = createStream(file); + + PlainTextFormat format(file); + if (!format.initialized()) { + PlainTextFormatDetector detector; + detector.detect(*stream, format); + } + readDocumentInternal(file, model, format, book.encoding(), *stream); + return true; +} + +void SimplePdbPlugin::readDocumentInternal(const ZLFile&, BookModel &model, const PlainTextFormat &format, const std::string &encoding, ZLInputStream &stream) const { + if (TextFormatDetector().isHtml(stream)) { + HtmlBookReader("", model, format, encoding).readDocument(stream); + } else { + TxtBookReader(model, format, encoding).readDocument(stream); + } +} diff --git a/reader/src/formats/pdb/ZTXTPlugin.cpp b/reader/src/formats/pdb/ZTXTPlugin.cpp new file mode 100644 index 0000000..1465856 --- /dev/null +++ b/reader/src/formats/pdb/ZTXTPlugin.cpp 
@@ -0,0 +1,43 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#include <ZLFile.h> +#include <ZLInputStream.h> + +#include "PdbPlugin.h" +#include "ZTXTStream.h" +#include "../txt/PlainTextFormat.h" +#include "../util/TextFormatDetector.h" + +bool ZTXTPlugin::providesMetaInfo() const { + return false; +} + +bool ZTXTPlugin::acceptsFile(const ZLFile &file) const { + return PdbPlugin::fileType(file) == "zTXTGPlm"; +} + +shared_ptr<ZLInputStream> ZTXTPlugin::createStream(const ZLFile &file) const { + return new ZTXTStream(file); +} + +FormatInfoPage *ZTXTPlugin::createInfoPage(ZLOptionsDialog &dialog, const ZLFile &file) { + shared_ptr<ZLInputStream> stream = createStream(file); + return new PlainTextInfoPage(dialog, file, ZLResourceKey("Text"), !TextFormatDetector().isHtml(*stream)); +} diff --git a/reader/src/formats/pdb/ZTXTStream.cpp b/reader/src/formats/pdb/ZTXTStream.cpp new file mode 100644 index 0000000..2dc549c --- /dev/null +++ b/reader/src/formats/pdb/ZTXTStream.cpp @@ -0,0 +1,77 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the 
Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#include <ZLFile.h> +#include <ZLZDecompressor.h> + +#include "ZTXTStream.h" + +ZTXTStream::ZTXTStream(const ZLFile &file) : PdbStream(file) { +} + +ZTXTStream::~ZTXTStream() { + close(); +} + +bool ZTXTStream::open() { + if (!PdbStream::open()) { + return false; + } + + myBase->seek(2, false); + unsigned short recordNumber; + PdbUtil::readUnsignedShort(*myBase, recordNumber); + myMaxRecordIndex = std::min(recordNumber, (unsigned short)(header().Offsets.size() - 1)); + myBase->seek(4, false); + PdbUtil::readUnsignedShort(*myBase, myMaxRecordSize); + if (myMaxRecordSize == 0) { + return false; + } + myBuffer = new char[myMaxRecordSize]; + + myRecordIndex = 0; + + return true; +} + +bool ZTXTStream::fillBuffer() { + while (myBufferOffset == myBufferLength) { + if (myRecordIndex + 1 > myMaxRecordIndex) { + return false; + } + ++myRecordIndex; + std::size_t currentOffset = recordOffset(myRecordIndex); + // Hmm, this works on examples from manybooks.net, + // but I don't understand what this code means :(( + if (myRecordIndex == 1) { + currentOffset += 2; + } + if (currentOffset < myBase->offset()) { + return false; + } + myBase->seek(currentOffset, true); + const std::size_t nextOffset = recordOffset(myRecordIndex + 1); + if (nextOffset < currentOffset) { + return false; + } + myBufferLength = ZLZDecompressor(nextOffset - currentOffset).decompress(*myBase, myBuffer, myMaxRecordSize); + 
myBufferOffset = 0; + } + return true; +} diff --git a/reader/src/formats/pdb/ZTXTStream.h b/reader/src/formats/pdb/ZTXTStream.h new file mode 100644 index 0000000..f89d3a0 --- /dev/null +++ b/reader/src/formats/pdb/ZTXTStream.h @@ -0,0 +1,45 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
// PdbStream for zTXT documents: the text is produced by decompressing the
// database records one after another with ZLZDecompressor.
class ZTXTStream : public PdbStream {

public:
	ZTXTStream(const ZLFile &file);
	~ZTXTStream();
	// Reads the zTXT header and allocates the output buffer.
	bool open();

private:
	// Decompresses the next record into the buffer; returns false when no
	// further record can be read.
	bool fillBuffer();

private:
	// Index of the last record fillBuffer() will read; set in open() from the
	// header's record count, limited by the offset table.
	std::size_t myMaxRecordIndex;
	// Maximum record size from the header; size of the output buffer.
	unsigned short myMaxRecordSize;
	// Index of the record decompressed most recently; 0 right after open().
	std::size_t myRecordIndex;
};
+ */ + +#include <cstdlib> +#include <iostream> + +#include <ZLStringUtil.h> +#include <ZLInputStream.h> + +#include "PdfBookReader.h" +#include "PdfObject.h" +#include "../../bookmodel/BookModel.h" + +static void readLine(ZLInputStream &stream, std::string &buffer) { + buffer.clear(); + char ch; + while (1) { + if (stream.read(&ch, 1) != 1) { + return; + } + if ((ch == 10) || (ch == 13)) { + if (!buffer.empty()) { + return; + } + } else { + buffer += ch; + } + } +} + +PdfBookReader::PdfBookReader(BookModel &model) : myModelReader(model) { +} + +PdfBookReader::~PdfBookReader() { +} + +shared_ptr<PdfObject> PdfBookReader::readObjectFromLocation(ZLInputStream &stream, const std::pair<int,int> &address) { + std::map<std::pair<int,int>,int>::const_iterator jt = myObjectLocationMap.find(address); + if (jt == myObjectLocationMap.end()) { + return 0; + } + stream.seek(jt->second, true); + char ch = 0; + PdfObject::readToken(stream, myBuffer, ch); + if (address.first != atoi(myBuffer.c_str())) { + return 0; + } + PdfObject::readToken(stream, myBuffer, ch); + if (address.second != atoi(myBuffer.c_str())) { + return 0; + } + PdfObject::readToken(stream, myBuffer, ch); + if (myBuffer != "obj") { + return 0; + } + return PdfObject::readObject(stream, ch); +} + +shared_ptr<PdfObject> PdfBookReader::resolveReference(shared_ptr<PdfObject> ref, ZLInputStream &stream) { + if (ref.isNull() || (ref->type() != PdfObject::REFERENCE)) { + return ref; + } + const PdfObjectReference &reference = (const PdfObjectReference&)*ref; + const std::pair<int,int> address(reference.number(), reference.generation()); + std::map<std::pair<int,int>,shared_ptr<PdfObject> >::const_iterator it = myObjectMap.find(address); + if (it != myObjectMap.end()) { + return it->second; + } + std::map<std::pair<int,int>,int>::const_iterator jt = myObjectLocationMap.find(address); + shared_ptr<PdfObject> object = readObjectFromLocation(stream, address); + myObjectMap.insert(std::make_pair(address, object)); + return 
object; +} + +static void stripBuffer(std::string &buffer) { + int index = buffer.find('%'); + if (index >= 0) { + buffer.erase(index); + } + ZLStringUtil::stripWhiteSpaces(buffer); +} + +bool PdfBookReader::readReferenceTable(ZLInputStream &stream, int xrefOffset) { + while (true) { + stream.seek(xrefOffset, true); + readLine(stream, myBuffer); + stripBuffer(myBuffer); + if (myBuffer != "xref") { + return false; + } + + while (true) { + readLine(stream, myBuffer); + stripBuffer(myBuffer); + if (myBuffer == "trailer") { + break; + } + const int index = myBuffer.find(' '); + const int start = atoi(myBuffer.c_str()); + const int len = atoi(myBuffer.c_str() + index + 1); + for (int i = 0; i < len; ++i) { + readLine(stream, myBuffer); + stripBuffer(myBuffer); + if (myBuffer.length() != 18) { + return false; + } + const int objectOffset = atoi(myBuffer.c_str()); + const int objectGeneration = atoi(myBuffer.c_str() + 11); + const bool objectInUse = myBuffer[17] == 'n'; + if (objectInUse) { + myObjectLocationMap[std::make_pair(start + i, objectGeneration)] = objectOffset; + } + } + } + char ch = 0; + shared_ptr<PdfObject> trailer = PdfObject::readObject(stream, ch); + if (trailer.isNull() || (trailer->type() != PdfObject::DICTIONARY)) { + return false; + } + if (myTrailer.isNull()) { + myTrailer = trailer; + } + PdfDictionaryObject &trailerDictionary = (PdfDictionaryObject&)*trailer; + shared_ptr<PdfObject> previous = trailerDictionary["Prev"]; + if (previous.isNull()) { + return true; + } + + if (previous->type() != PdfObject::INTEGER_NUMBER) { + return false; + } + xrefOffset = ((PdfIntegerObject&)*previous).value(); + } +} + +bool PdfBookReader::readBook(shared_ptr<ZLInputStream> stream) { + if (stream.isNull() || !stream->open()) { + return false; + } + + readLine(*stream, myBuffer); + if (!ZLStringUtil::stringStartsWith(myBuffer, "%PDF-")) { + return false; + } + + std::string version = myBuffer.substr(5); + std::cerr << "version = " << version << "\n"; + + 
std::size_t eofOffset = stream->sizeOfOpened(); + if (eofOffset < 100) { + return false; + } + + stream->seek(eofOffset - 100, true); + bool readXrefOffset = false; + std::size_t xrefOffset = (std::size_t)-1; + while (true) { + readLine(*stream, myBuffer); + if (myBuffer.empty()) { + break; + } + stripBuffer(myBuffer); + if (readXrefOffset) { + if (!myBuffer.empty()) { + xrefOffset = atoi(myBuffer.c_str()); + break; + } + } else if (myBuffer == "startxref") { + readXrefOffset = true; + } + } + + if (!readReferenceTable(*stream, xrefOffset)) { + return false; + } + + PdfDictionaryObject &trailerDictionary = (PdfDictionaryObject&)*myTrailer; + shared_ptr<PdfObject> root = resolveReference(trailerDictionary["Root"], *stream); + if (root.isNull() || (root->type() != PdfObject::DICTIONARY)) { + return false; + } + + PdfDictionaryObject &rootDictionary = (PdfDictionaryObject&)*root; + if (rootDictionary["Type"] != PdfNameObject::nameObject("Catalog")) { + return false; + } + shared_ptr<PdfObject> pageRootNode = resolveReference(rootDictionary["Pages"], *stream); + if (pageRootNode.isNull() || (pageRootNode->type() != PdfObject::DICTIONARY)) { + return false; + } + PdfDictionaryObject &pageRootNodeDictionary = (PdfDictionaryObject&)*pageRootNode; + if (pageRootNodeDictionary["Type"] != PdfNameObject::nameObject("Pages")) { + return false; + } + + /* + shared_ptr<PdfObject> count = pageRootNodeDictionary["Count"]; + if (!count.isNull() && (count->type() == PdfObject::INTEGER_NUMBER)) { + std::cerr << "count = " << ((PdfIntegerObject&)*count).value() << "\n"; + } + */ + shared_ptr<PdfObject> pages = pageRootNodeDictionary["Kids"]; + if (pages.isNull() || (pages->type() != PdfObject::ARRAY)) { + return false; + } + const PdfArrayObject& pagesArray = (const PdfArrayObject&)*pages; + const std::size_t pageNumber = pagesArray.size(); + for (std::size_t i = 0; i < pageNumber; ++i) { + processPage(pagesArray[i], *stream); + } + + return true; +} + +void 
PdfBookReader::processContents(shared_ptr<PdfObject> contentsObject, ZLInputStream &stream) { + contentsObject = resolveReference(contentsObject, stream); +} + +void PdfBookReader::processPage(shared_ptr<PdfObject> pageObject, ZLInputStream &stream) { + pageObject = resolveReference(pageObject, stream); + if (pageObject.isNull() || pageObject->type() != PdfObject::DICTIONARY) { + return; + } + const PdfDictionaryObject &pageDictionary = (const PdfDictionaryObject&)*pageObject; + shared_ptr<PdfObject> contents = pageDictionary["Contents"]; + if (contents.isNull()) { + return; + } + switch (contents->type()) { + default: + break; + case PdfObject::REFERENCE: + processContents(contents, stream); + break; + case PdfObject::ARRAY: + { + const PdfArrayObject &array = (const PdfArrayObject&)*contents; + const std::size_t len = array.size(); + for (std::size_t i = 0; i < len; ++i) { + processContents(array[i], stream); + } + break; + } + } +} diff --git a/reader/src/formats/pdf/PdfBookReader.h b/reader/src/formats/pdf/PdfBookReader.h new file mode 100644 index 0000000..9488dcf --- /dev/null +++ b/reader/src/formats/pdf/PdfBookReader.h @@ -0,0 +1,52 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ */
+
+#ifndef __PdfBOOKREADER_H__
+#define __PdfBOOKREADER_H__
+
+#include <map>
+
+#include "../../bookmodel/BookReader.h"
+
+class PdfObject;
+class PdfObjectReference;
+
+// Reads the object structure of a PDF file (cross-reference table, trailer,
+// catalog, page tree). Objects are addressed by (number, generation) pairs.
+class PdfBookReader {
+
+public:
+	PdfBookReader(BookModel &model);
+	~PdfBookReader();
+	// Parses the whole file; returns false if the structure cannot be read.
+	bool readBook(shared_ptr<ZLInputStream> stream);
+
+private:
+	// Loads the xref section at `offset` and the sections chained via "Prev".
+	bool readReferenceTable(ZLInputStream &stream, int offset);
+	// Returns the object behind an indirect reference (cached in myObjectMap).
+	shared_ptr<PdfObject> resolveReference(shared_ptr<PdfObject> reference, ZLInputStream &stream);
+	// Parses the object stored at the file offset registered for `address`.
+	shared_ptr<PdfObject> readObjectFromLocation(ZLInputStream &stream, const std::pair<int,int> &address);
+	void processPage(shared_ptr<PdfObject> pageObject, ZLInputStream &stream);
+	void processContents(shared_ptr<PdfObject> contentsObject, ZLInputStream &stream);
+
+private:
+	BookReader myModelReader;
+	std::string myBuffer; // scratch buffer for the line/token being parsed
+	std::map<std::pair<int,int>,int> myObjectLocationMap; // (number, generation) -> file offset
+	std::map<std::pair<int,int>,shared_ptr<PdfObject> > myObjectMap; // (number, generation) -> parsed object
+	shared_ptr<PdfObject> myTrailer; // first trailer dictionary read by readReferenceTable()
+};
+
+#endif /* __PdfBOOKREADER_H__ */
diff --git a/reader/src/formats/pdf/PdfDescriptionReader.cpp b/reader/src/formats/pdf/PdfDescriptionReader.cpp
new file mode 100644
index 0000000..98937fa
--- /dev/null
+++ b/reader/src/formats/pdf/PdfDescriptionReader.cpp
@@ -0,0 +1,29 @@
+/*
+ * Copyright (C) 2004-2012 Geometer Plus <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+
+#include <ZLInputStream.h>
+
+#include "PdfDescriptionReader.h"
+
+PdfDescriptionReader::PdfDescriptionReader(Book &book) : myBook(book) {
+}
+
+// Placeholder: the stream is not inspected and myBook is left untouched;
+// success is reported unconditionally.
+bool PdfDescriptionReader::readMetaInfo(shared_ptr<ZLInputStream> stream) {
+	return true;
+}
diff --git a/reader/src/formats/pdf/PdfDescriptionReader.h b/reader/src/formats/pdf/PdfDescriptionReader.h
new file mode 100644
index 0000000..004cdfa
--- /dev/null
+++ b/reader/src/formats/pdf/PdfDescriptionReader.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright (C) 2004-2012 Geometer Plus <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */ + +#ifndef __PDFDESCRIPTIONREADER_H__ +#define __PDFDESCRIPTIONREADER_H__ + +#include <string> + +class Book; + +class PdfDescriptionReader { + +public: + PdfDescriptionReader(Book &book); + ~PdfDescriptionReader(); + bool readMetaInfo(shared_ptr<ZLInputStream> stream); + +private: + Book &myBook; +}; + +inline PdfDescriptionReader::~PdfDescriptionReader() {} + +#endif /* __PDFDESCRIPTIONREADER_H__ */ diff --git a/reader/src/formats/pdf/PdfObject.cpp b/reader/src/formats/pdf/PdfObject.cpp new file mode 100644 index 0000000..374a618 --- /dev/null +++ b/reader/src/formats/pdf/PdfObject.cpp @@ -0,0 +1,450 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ */ + +#include <iostream> + +#include <ZLInputStream.h> +#include <ZLZDecompressor.h> + +#include "PdfObject.h" + +PdfObject::~PdfObject() { +} + +shared_ptr<PdfObject> PdfIntegerObject::integerObject(int value) { + if ((value < 0) || (value >= 256)) { + return new PdfIntegerObject(value); + } else { + static shared_ptr<PdfObject>* table = new shared_ptr<PdfObject>[256]; + if (table[value].isNull()) { + table[value] = new PdfIntegerObject(value); + } + return table[value]; + } +} + +PdfIntegerObject::PdfIntegerObject(int value) : myValue(value) { + std::cerr << "PdfIntegerObject " << value << "\n"; +} + +int PdfIntegerObject::value() const { + return myValue; +} + +PdfObject::Type PdfIntegerObject::type() const { + return INTEGER_NUMBER; +} + +shared_ptr<PdfObject> PdfBooleanObject::TRUE() { + static shared_ptr<PdfObject> value = new PdfBooleanObject(true); + return value; +} + +shared_ptr<PdfObject> PdfBooleanObject::FALSE() { + static shared_ptr<PdfObject> value = new PdfBooleanObject(false); + return value; +} + +PdfBooleanObject::PdfBooleanObject(bool value) : myValue(value) { + std::cerr << "PdfBooleanObject " << value << "\n"; +} + +bool PdfBooleanObject::value() const { + return myValue; +} + +PdfObject::Type PdfBooleanObject::type() const { + return BOOLEAN; +} + +PdfStringObject::PdfStringObject(const std::string &value) : myValue(value) { + std::cerr << "PdfStringObject " << value << "\n"; +} + +PdfObject::Type PdfStringObject::type() const { + return STRING; +} + +std::map<std::string,shared_ptr<PdfObject> > PdfNameObject::ourObjectMap; + +shared_ptr<PdfObject> PdfNameObject::nameObject(const std::string &id) { + // TODO: process escaped characters + std::map<std::string,shared_ptr<PdfObject> >::const_iterator it = ourObjectMap.find(id); + if (it != ourObjectMap.end()) { + return it->second; + } + std::cerr << "PdfNameObject " << id << "\n"; + shared_ptr<PdfObject> object = new PdfNameObject(); + ourObjectMap.insert(std::make_pair(id, object)); + 
return object; +} + +PdfNameObject::PdfNameObject() { +} + +PdfObject::Type PdfNameObject::type() const { + return NAME; +} + +PdfDictionaryObject::PdfDictionaryObject() { +} + +void PdfDictionaryObject::setObject(shared_ptr<PdfObject> id, shared_ptr<PdfObject> object) { + myMap[id] = object; +} + +shared_ptr<PdfObject> PdfDictionaryObject::operator[](shared_ptr<PdfObject> id) const { + std::map<shared_ptr<PdfObject>,shared_ptr<PdfObject> >::const_iterator it = myMap.find(id); + return (it != myMap.end()) ? it->second : 0; +} + +shared_ptr<PdfObject> PdfDictionaryObject::operator[](const std::string &id) const { + return operator[](PdfNameObject::nameObject(id)); +} + +PdfObject::Type PdfDictionaryObject::type() const { + return DICTIONARY; +} + +PdfArrayObject::PdfArrayObject() { +} + +void PdfArrayObject::addObject(shared_ptr<PdfObject> object) { + myVector.push_back(object); +} + +shared_ptr<PdfObject> PdfArrayObject::popLast() { + if (!myVector.empty()) { + shared_ptr<PdfObject> last = myVector.back(); + myVector.pop_back(); + return last; + } + return 0; +} + +int PdfArrayObject::size() const { + return myVector.size(); +} + +shared_ptr<PdfObject> PdfArrayObject::operator[](int index) const { + return myVector[index]; +} + +PdfObject::Type PdfArrayObject::type() const { + return ARRAY; +} + +PdfObjectReference::PdfObjectReference(int number, int generation) : myNumber(number), myGeneration(generation) { +} + +int PdfObjectReference::number() const { + return myNumber; +} + +int PdfObjectReference::generation() const { + return myGeneration; +} + +PdfObject::Type PdfObjectReference::type() const { + return REFERENCE; +} + +PdfStreamObject::PdfStreamObject(const PdfDictionaryObject &dictionary, ZLInputStream &dataStream) { + char ch; + skipWhiteSpaces(dataStream, ch); + + shared_ptr<PdfObject> length = dictionary["Length"]; + if (!length.isNull() && (length->type() == INTEGER_NUMBER)) { + int value = ((PdfIntegerObject&)*length).value(); + if (value > 0) { + 
shared_ptr<PdfObject> filter = dictionary["Filter"]; + if (filter == PdfNameObject::nameObject("FlateDecode")) { + dataStream.seek(1, false); + ZLZDecompressor decompressor(value - 2); + char buffer[2048]; + while (true) { + std::size_t size = decompressor.decompress(dataStream, buffer, 2048); + if (size == 0) { + break; + } + myData.append(buffer, size); + } + std::cerr << myData << "\n"; + } else { + myData.append(value, '\0'); + myData[0] = ch; + dataStream.read((char*)myData.data() + 1, value - 1); + } + } + } + + /* + shared_ptr<PdfObject> filter = dictionary["Filter"]; + if (!filter.isNull()) { + switch (filter->type()) { + default: + break; + case NAME: + myFilters.push_back( + (filter == PdfNameObject::nameObject("FlateDecode")) ? + FLATE : UNKNOWN + ); + break; + case ARRAY: + { + // TODO: process filters array + } + } + } + */ +} + +PdfObject::Type PdfStreamObject::type() const { + return STREAM; +} + +enum PdfCharacterType { + PDF_CHAR_REGULAR, + PDF_CHAR_WHITESPACE, + PDF_CHAR_DELIMITER +}; + +static PdfCharacterType *PdfCharacterTypeTable = 0; + +void PdfObject::skipWhiteSpaces(ZLInputStream &stream, char &ch) { + if (PdfCharacterTypeTable == 0) { + PdfCharacterTypeTable = new PdfCharacterType[256]; + for (int i = 0; i < 256; ++i) { + PdfCharacterTypeTable[i] = PDF_CHAR_REGULAR; + } + PdfCharacterTypeTable[0] = PDF_CHAR_WHITESPACE; + PdfCharacterTypeTable[9] = PDF_CHAR_WHITESPACE; + PdfCharacterTypeTable[10] = PDF_CHAR_WHITESPACE; + PdfCharacterTypeTable[12] = PDF_CHAR_WHITESPACE; + PdfCharacterTypeTable[13] = PDF_CHAR_WHITESPACE; + PdfCharacterTypeTable[32] = PDF_CHAR_WHITESPACE; + PdfCharacterTypeTable['('] = PDF_CHAR_DELIMITER; + PdfCharacterTypeTable[')'] = PDF_CHAR_DELIMITER; + PdfCharacterTypeTable['<'] = PDF_CHAR_DELIMITER; + PdfCharacterTypeTable['>'] = PDF_CHAR_DELIMITER; + PdfCharacterTypeTable['['] = PDF_CHAR_DELIMITER; + PdfCharacterTypeTable[']'] = PDF_CHAR_DELIMITER; + PdfCharacterTypeTable['{'] = PDF_CHAR_DELIMITER; + 
PdfCharacterTypeTable['}'] = PDF_CHAR_DELIMITER; + PdfCharacterTypeTable['/'] = PDF_CHAR_DELIMITER; + PdfCharacterTypeTable['%'] = PDF_CHAR_DELIMITER; + } + + while ((PdfCharacterTypeTable[(unsigned char)ch] == PDF_CHAR_WHITESPACE) && + (stream.read(&ch, 1) == 1)) { + } +} + +void PdfObject::readToken(ZLInputStream &stream, std::string &buffer, char &ch) { + buffer.clear(); + skipWhiteSpaces(stream, ch); + while (PdfCharacterTypeTable[(unsigned char)ch] == PDF_CHAR_REGULAR) { + buffer += ch; + if (stream.read(&ch, 1) != 1) { + break; + } + } +} + +shared_ptr<PdfObject> PdfObject::readObject(ZLInputStream &stream, char &ch) { + skipWhiteSpaces(stream, ch); + + PdfObject::Type type = PdfObject::NIL; + bool hexString = false; + switch (ch) { + case '(': + hexString = false; + type = PdfObject::STRING; + break; + case '<': + stream.read(&ch, 1); + hexString = true; + type = (ch == '<') ? PdfObject::DICTIONARY : PdfObject::STRING; + break; + case '>': // end of dictionary + stream.read(&ch, 1); + if (ch == '>') { + stream.read(&ch, 1); + } + return 0; + case '/': + type = PdfObject::NAME; + break; + case '[': + type = PdfObject::ARRAY; + break; + case ']': // end of array + stream.read(&ch, 1); + return 0; + case '+': + case '-': + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + type = PdfObject::INTEGER_NUMBER; + break; + case 't': + case 'f': + type = PdfObject::BOOLEAN; + break; + } + + switch (type) { + case PdfObject::DICTIONARY: + { + ch = 0; + shared_ptr<PdfObject> name; + shared_ptr<PdfObject> value; + shared_ptr<PdfObject> next; + PdfDictionaryObject *dictionary = new PdfDictionaryObject(); + while (true) { + next = readObject(stream, ch); + if (next.isNull()) { + break; + } + PdfObject::Type oType = next->type(); + if (oType == PdfObject::NAME) { + name = next; + value = readObject(stream, ch); + if (value.isNull()) { + break; + } + dictionary->setObject(name, value); + } else if (oType == 
PdfObject::INTEGER_NUMBER) { + if (value.isNull() || (value->type() != PdfObject::INTEGER_NUMBER)) { + break; + } + skipWhiteSpaces(stream, ch); + if (ch != 'R') { + break; + } + const int number = ((PdfIntegerObject&)*value).value(); + const int generation = ((PdfIntegerObject&)*next).value(); + dictionary->setObject(name, new PdfObjectReference(number, generation)); + value = 0; + ch = 0; + } else { + break; + } + } + std::string token; + readToken(stream, token, ch); + if (token == "stream") { + shared_ptr<PdfObject> d = dictionary; + return new PdfStreamObject(*dictionary, stream); + } else { + return dictionary; + } + } + case PdfObject::NAME: + { + std::string name; + stream.read(&ch, 1); + readToken(stream, name, ch); + return PdfNameObject::nameObject(name); + } + case PdfObject::BOOLEAN: + { + std::string name; + readToken(stream, name, ch); + return (name == "true") ? PdfBooleanObject::TRUE() : PdfBooleanObject::FALSE(); + } + case PdfObject::INTEGER_NUMBER: + { + std::string str; + if ((ch == '+') || (ch == '-')) { + str += ch; + stream.read(&ch, 1); + } + while ((ch >= '0') && (ch <= '9')) { + str += ch; + stream.read(&ch, 1); + } + return PdfIntegerObject::integerObject(atoi(str.c_str())); + } + case PdfObject::STRING: + { + std::string value; + if (hexString) { + char num[3]; + num[2] = '\0'; + while (ch != '>') { + num[0] = ch; + stream.read(num + 1, 1); + value += (char)strtol(num, 0, 16); + stream.read(&ch, 1); + } + ch = 0; + } else { + // TODO: implement + } + return new PdfStringObject(value); + } + case PdfObject::ARRAY: + { + PdfArrayObject *array = new PdfArrayObject(); + ch = 0; + while (true) { + skipWhiteSpaces(stream, ch); + if (ch == 'R') { + const int size = array->size(); + if ((size >= 2) && + ((*array)[size - 1]->type() == PdfObject::INTEGER_NUMBER) && + ((*array)[size - 2]->type() == PdfObject::INTEGER_NUMBER)) { + const int generation = ((PdfIntegerObject&)*array->popLast()).value(); + const int number = 
((PdfIntegerObject&)*array->popLast()).value(); + array->addObject(new PdfObjectReference(number, generation)); + ch = 0; + } + } + shared_ptr<PdfObject> object = readObject(stream, ch); + if (object.isNull()) { + break; + } + array->addObject(object); + } + std::cerr << "PdfArrayObject " << array->size() << "\n"; + return array; + } + default: + break; + } + + std::string buffer; + stream.read(&ch, 1); + while (PdfCharacterTypeTable[(unsigned char)ch] == PDF_CHAR_REGULAR) { + buffer += ch; + stream.read(&ch, 1); + } + std::cerr << "buffer = " << buffer << "\n"; + + return 0; +} diff --git a/reader/src/formats/pdf/PdfObject.h b/reader/src/formats/pdf/PdfObject.h new file mode 100644 index 0000000..76b8528 --- /dev/null +++ b/reader/src/formats/pdf/PdfObject.h @@ -0,0 +1,201 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ */
+
+#ifndef __PDFOBJECT_H__
+#define __PDFOBJECT_H__
+
+#include <string>
+#include <vector>
+#include <map>
+
+#include <shared_ptr.h>
+
+class ZLInputStream;
+
+// Base class of the PDF object model; also hosts the static parser.
+class PdfObject {
+
+public:
+	// Parses one object; `ch` carries the look-ahead character in and out.
+	static shared_ptr<PdfObject> readObject(ZLInputStream &stream, char &ch);
+	// Reads one token of regular characters into `buffer`.
+	static void readToken(ZLInputStream &stream, std::string &buffer, char &ch);
+
+protected:
+	static void skipWhiteSpaces(ZLInputStream &stream, char &ch);
+
+public:
+	enum Type {
+		BOOLEAN,
+		INTEGER_NUMBER,
+		REAL_NUMBER,
+		STRING,
+		NAME,
+		ARRAY,
+		DICTIONARY,
+		STREAM,
+		NIL,
+		REFERENCE
+	};
+
+	virtual ~PdfObject();
+
+	virtual Type type() const = 0;
+};
+
+// Boolean value; only reachable through the two shared instances.
+class PdfBooleanObject : public PdfObject {
+
+public:
+	static shared_ptr<PdfObject> TRUE();
+	static shared_ptr<PdfObject> FALSE();
+
+private:
+	PdfBooleanObject(bool value);
+
+public:
+	bool value() const;
+
+private:
+	Type type() const;
+
+private:
+	const bool myValue;
+};
+
+// Integer value; created through integerObject().
+class PdfIntegerObject : public PdfObject {
+
+public:
+	static shared_ptr<PdfObject> integerObject(int value);
+
+private:
+	PdfIntegerObject(int value);
+
+public:
+	int value() const;
+
+private:
+	Type type() const;
+
+private:
+	const int myValue;
+};
+
+// String value; constructed by PdfObject::readObject() only.
+class PdfStringObject : public PdfObject {
+
+private:
+	PdfStringObject(const std::string &value);
+
+private:
+	Type type() const;
+
+private:
+	std::string myValue;
+
+friend shared_ptr<PdfObject> PdfObject::readObject(ZLInputStream &stream, char &ch);
+};
+
+// Name; nameObject() returns one shared instance per distinct id.
+class PdfNameObject : public PdfObject {
+
+public:
+	static shared_ptr<PdfObject> nameObject(const std::string &id);
+
+private:
+	static std::map<std::string,shared_ptr<PdfObject> > ourObjectMap;
+
+private:
+	PdfNameObject();
+
+private:
+	Type type() const;
+};
+
+// Dictionary keyed by object pointer; filled by PdfObject::readObject().
+class PdfDictionaryObject : public PdfObject {
+
+private:
+	PdfDictionaryObject();
+	void setObject(shared_ptr<PdfObject> id, shared_ptr<PdfObject> object);
+
+public:
+	shared_ptr<PdfObject> operator [] (shared_ptr<PdfObject> id) const;
+	// Convenience lookup by name text.
+	shared_ptr<PdfObject> operator [] (const std::string &id) const;
+
+private:
+	Type type() const;
+
+private:
+	std::map<shared_ptr<PdfObject>,shared_ptr<PdfObject> > myMap;
+
+friend shared_ptr<PdfObject> PdfObject::readObject(ZLInputStream &stream, char &ch);
+};
+
+// Stream object; the constructor reads the data that follows `dictionary`.
+class PdfStreamObject : public PdfObject {
+
+private:
+	PdfStreamObject(const PdfDictionaryObject &dictionary, ZLInputStream &dataStream);
+
+private:
+	Type type() const;
+
+private:
+	std::string myData;
+	/*
+	enum EncodingType {
+		UNKNOWN,
+		FLATE,
+	};
+	std::vector<EncodingType> myFilters;
+	*/
+
+friend shared_ptr<PdfObject> PdfObject::readObject(ZLInputStream &stream, char &ch);
+};
+
+// Ordered list of objects; filled by PdfObject::readObject().
+class PdfArrayObject : public PdfObject {
+
+private:
+	PdfArrayObject();
+	void addObject(shared_ptr<PdfObject> object);
+	shared_ptr<PdfObject> popLast();
+
+public:
+	int size() const;
+	shared_ptr<PdfObject> operator [] (int index) const;
+
+private:
+	Type type() const;
+
+private:
+	std::vector<shared_ptr<PdfObject> > myVector;
+
+friend shared_ptr<PdfObject> PdfObject::readObject(ZLInputStream &stream, char &ch);
+};
+
+// Indirect reference "<number> <generation> R".
+class PdfObjectReference : public PdfObject {
+
+public:
+	PdfObjectReference(int number, int generation);
+
+	int number() const;
+	int generation() const;
+
+private:
+	Type type() const;
+
+private:
+	const int myNumber;
+	const int myGeneration;
+};
+
+#endif /* __PDFOBJECT_H__ */
diff --git a/reader/src/formats/pdf/PdfPlugin.cpp b/reader/src/formats/pdf/PdfPlugin.cpp
new file mode 100644
index 0000000..06325d4
--- /dev/null
+++ b/reader/src/formats/pdf/PdfPlugin.cpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright (C) 2004-2012 Geometer Plus <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#include <ZLFile.h> +#include <ZLInputStream.h> + +#include "PdfPlugin.h" +#include "PdfDescriptionReader.h" +#include "PdfBookReader.h" +#include "../../library/Book.h" + +bool PdfPlugin::acceptsFile(const ZLFile &file) const { + return file.extension() == "pdf"; +} + +bool PdfPlugin::readMetaInfo(Book &book) const { + return PdfDescriptionReader(book).readMetaInfo(ZLFile(path).inputStream()); +} + +bool PdfPlugin::readLanguageAndEncoding(Book &book) const { + return true; +} + +bool PdfPlugin::readModel(BookModel &model) const { + return PdfBookReader(model).readBook(ZLFile(book.fileName()).inputStream()); +} diff --git a/reader/src/formats/pdf/PdfPlugin.h b/reader/src/formats/pdf/PdfPlugin.h new file mode 100644 index 0000000..9c330f6 --- /dev/null +++ b/reader/src/formats/pdf/PdfPlugin.h @@ -0,0 +1,41 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+
+#ifndef __PdfPLUGIN_H__
+#define __PdfPLUGIN_H__
+
+#include "../FormatPlugin.h"
+
+// Format plugin for PDF files.
+class PdfPlugin : public FormatPlugin {
+
+public:
+	PdfPlugin();
+	~PdfPlugin();
+	bool providesMetaInfo() const;
+	bool acceptsFile(const ZLFile &file) const;
+	bool readMetaInfo(Book &book) const;
+	bool readLanguageAndEncoding(Book &book) const;
+	bool readModel(BookModel &model) const;
+};
+
+inline PdfPlugin::PdfPlugin() {}
+inline PdfPlugin::~PdfPlugin() {}
+// Always reports that meta information is available.
+inline bool PdfPlugin::providesMetaInfo() const { return true; }
+
+#endif /* __PdfPLUGIN_H__ */
diff --git a/reader/src/formats/pdf/StringStream.cpp b/reader/src/formats/pdf/StringStream.cpp
new file mode 100644
index 0000000..b2369df
--- /dev/null
+++ b/reader/src/formats/pdf/StringStream.cpp
@@ -0,0 +1,55 @@
+/*
+ * Copyright (C) 2008-2012 Geometer Plus <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */ + +#include <algorithm> + +#include "StringStream.h" + +StringStream::StringStream(const std::string &data) : myData(data), myOffset(0) { +} + +bool StringStream::open() { + myOffset = 0; + return true; +} + +std::size_t StringStream::read(char *buffer, std::size_t maxSize) { + std::size_t size = std::min(maxSize, myData.length() - myOffset); + memcpy(buffer, myData.data() + myOffset, size); + myOffset += size; + return size; +} + +void StringStream::close() { +} + +void StringStream::seek(int offset, bool absoluteOffset) { + if (!absoluteOffset) { + offset += myOffset; + } + myOffset = std::min((std::size_t)std::max(0, offset), myData.length()); +} + +std::size_t StringStream::offset() const { + return myOffset; +} + +std::size_t StringStream::sizeOfOpened() { + return myData.length(); +} diff --git a/reader/src/formats/pdf/StringStream.h b/reader/src/formats/pdf/StringStream.h new file mode 100644 index 0000000..f46c038 --- /dev/null +++ b/reader/src/formats/pdf/StringStream.h @@ -0,0 +1,44 @@ +/* + * Copyright (C) 2008-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ */
+
+#ifndef __STRINGSTREAM_H__
+#define __STRINGSTREAM_H__
+
+#include <ZLInputStream.h>
+
+// Input stream that reads from a std::string held by the caller.
+class StringStream : public ZLInputStream {
+
+public:
+	// NOTE(review): `data` is stored by reference (see myData), not copied;
+	// the string must outlive the stream, so passing a temporary leaves a
+	// dangling reference.
+	StringStream(const std::string &data);
+
+public:
+	bool open();
+	std::size_t read(char *buffer, std::size_t maxSize);
+	void close();
+
+	void seek(int offset, bool absoluteOffset);
+	std::size_t offset() const;
+	std::size_t sizeOfOpened();
+
+private:
+	const std::string &myData; // borrowed, never modified
+	std::size_t myOffset; // current read position within myData
+};
+
+#endif /* __STRINGSTREAM_H__ */
diff --git a/reader/src/formats/rtf/RtfBookReader.cpp b/reader/src/formats/rtf/RtfBookReader.cpp
new file mode 100644
index 0000000..cf16bc7
--- /dev/null
+++ b/reader/src/formats/rtf/RtfBookReader.cpp
@@ -0,0 +1,232 @@
+/*
+ * Copyright (C) 2004-2012 Geometer Plus <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */ + +#include <cctype> + +#include <ZLStringUtil.h> +#include <ZLFileImage.h> +#include <ZLTextStyleEntry.h> + +#include "RtfBookReader.h" +#include "../../bookmodel/BookModel.h" + +RtfBookReader::RtfBookReader(BookModel &model, const std::string &encoding) : RtfReader(encoding), myBookReader(model) { +} + +static const std::size_t maxBufferSize = 1024; + +void RtfBookReader::addCharData(const char *data, std::size_t len, bool convert) { + if (myCurrentState.ReadText) { + if (convert || myConverter.isNull()) { + myOutputBuffer.append(data, len); + if (myOutputBuffer.size() >= maxBufferSize) { + flushBuffer(); + } + } else { + flushBuffer(); + std::string newString(data, len); + characterDataHandler(newString); + } + } +} + +void RtfBookReader::flushBuffer() { + if (!myOutputBuffer.empty()) { + if (myCurrentState.ReadText) { + if (!myConverter.isNull()) { + static std::string newString; + myConverter->convert(newString, myOutputBuffer.data(), myOutputBuffer.data() + myOutputBuffer.length()); + characterDataHandler(newString); + newString.erase(); + } else { + characterDataHandler(myOutputBuffer); + } + } + myOutputBuffer.erase(); + } +} + +void RtfBookReader::switchDestination(DestinationType destination, bool on) { + switch (destination) { + case DESTINATION_NONE: + break; + case DESTINATION_SKIP: + case DESTINATION_INFO: + case DESTINATION_TITLE: + case DESTINATION_AUTHOR: + case DESTINATION_STYLESHEET: + myCurrentState.ReadText = !on; + break; + case DESTINATION_PICTURE: + if (on) { + flushBuffer(); + if (myBookReader.paragraphIsOpen()) { + myBookReader.endParagraph(); + } + } + myCurrentState.ReadText = !on; + break; + case DESTINATION_FOOTNOTE: + flushBuffer(); + if (on) { + std::string id; + ZLStringUtil::appendNumber(id, myFootnoteIndex++); + + myStateStack.push(myCurrentState); + myCurrentState.Id = id; + myCurrentState.ReadText = true; + + myBookReader.addHyperlinkControl(FOOTNOTE, id); + myBookReader.addData(id); + myBookReader.addControl(FOOTNOTE, 
false); + + myBookReader.setFootnoteTextModel(id); + myBookReader.addHyperlinkLabel(id); + myBookReader.pushKind(REGULAR); + myBookReader.beginParagraph(); + } else { + myBookReader.endParagraph(); + myBookReader.popKind(); + + if (!myStateStack.empty()) { + myCurrentState = myStateStack.top(); + myStateStack.pop(); + } + + if (myStateStack.empty()) { + myBookReader.setMainTextModel(); + } else { + myBookReader.setFootnoteTextModel(myCurrentState.Id); + } + } + break; + } +} + +void RtfBookReader::insertImage(shared_ptr<ZLMimeType> mimeType, const std::string &fileName, std::size_t startOffset, std::size_t size) { + std::string id; + ZLStringUtil::appendNumber(id, myImageIndex++); + myBookReader.addImageReference(id); + const ZLFile file(fileName, mimeType); + myBookReader.addImage(id, new ZLFileImage(file, startOffset, size, ZLFileImage::ENCODING_HEX)); +} + +bool RtfBookReader::characterDataHandler(std::string &str) { + if (myCurrentState.ReadText) { + if (!myBookReader.paragraphIsOpen()) { + myBookReader.beginParagraph(); + } + myBookReader.addData(str); + } + return true; +} + +bool RtfBookReader::readDocument(const ZLFile &file) { + myImageIndex = 0; + myFootnoteIndex = 1; + + myCurrentState.ReadText = true; + + myBookReader.setMainTextModel(); + myBookReader.pushKind(REGULAR); + myBookReader.beginParagraph(); + + bool code = RtfReader::readDocument(file); + + flushBuffer(); + myBookReader.endParagraph(); + while (!myStateStack.empty()) { + myStateStack.pop(); + } + + return code; +} + +void RtfBookReader::setFontProperty(FontProperty property) { + if (!myCurrentState.ReadText) { + //DPRINT("change style not in text.\n"); + return; + } + flushBuffer(); + + switch (property) { + case FONT_BOLD: + if (myState.Bold) { + myBookReader.pushKind(STRONG); + } else { + myBookReader.popKind(); + } + myBookReader.addControl(STRONG, myState.Bold); + break; + case FONT_ITALIC: + if (myState.Italic) { + if (!myState.Bold) { + //DPRINT("add style emphasis.\n"); + 
myBookReader.pushKind(EMPHASIS); + myBookReader.addControl(EMPHASIS, true); + } else { + //DPRINT("add style emphasis and strong.\n"); + myBookReader.popKind(); + myBookReader.addControl(STRONG, false); + + myBookReader.pushKind(EMPHASIS); + myBookReader.addControl(EMPHASIS, true); + myBookReader.pushKind(STRONG); + myBookReader.addControl(STRONG, true); + } + } else { + if (!myState.Bold) { + //DPRINT("remove style emphasis.\n"); + myBookReader.addControl(EMPHASIS, false); + myBookReader.popKind(); + } else { + //DPRINT("remove style strong n emphasis, add strong.\n"); + myBookReader.addControl(STRONG, false); + myBookReader.popKind(); + myBookReader.addControl(EMPHASIS, false); + myBookReader.popKind(); + + myBookReader.pushKind(STRONG); + myBookReader.addControl(STRONG, true); + } + } + break; + case FONT_UNDERLINED: + break; + } +} + +void RtfBookReader::newParagraph() { + flushBuffer(); + myBookReader.endParagraph(); + myBookReader.beginParagraph(); + if (myState.Alignment != ALIGN_UNDEFINED) { + setAlignment(); + } +} + +void RtfBookReader::setEncoding(int) { +} + +void RtfBookReader::setAlignment() { + ZLTextStyleEntry entry(ZLTextStyleEntry::STYLE_OTHER_ENTRY); + entry.setAlignmentType(myState.Alignment); + myBookReader.addStyleEntry(entry); + // TODO: call addStyleCloseEntry somewhere (?) +} diff --git a/reader/src/formats/rtf/RtfBookReader.h b/reader/src/formats/rtf/RtfBookReader.h new file mode 100644 index 0000000..a977cbd --- /dev/null +++ b/reader/src/formats/rtf/RtfBookReader.h @@ -0,0 +1,71 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#ifndef __RTFBOOKREADER_H__ +#define __RTFBOOKREADER_H__ + +#include <vector> + +#include "RtfReader.h" +#include "../../bookmodel/BookReader.h" + +class ZLFile; + +class BookModel; + +class RtfBookReader : public RtfReader { + +public: + RtfBookReader(BookModel &model, const std::string &encoding); + ~RtfBookReader(); + + bool readDocument(const ZLFile &file); + + bool characterDataHandler(std::string &str); + void flushBuffer(); + + void setEncoding(int code); + void setAlignment(); + void switchDestination(DestinationType destination, bool on); + void addCharData(const char *data, std::size_t len, bool convert); + void insertImage(shared_ptr<ZLMimeType> mimeType, const std::string &fileName, std::size_t startOffset, std::size_t size); + + void setFontProperty(FontProperty property); + void newParagraph(); + +private: + BookReader myBookReader; + + std::string myOutputBuffer; + + int myImageIndex; + int myFootnoteIndex; + + struct RtfBookReaderState { + std::string Id; + bool ReadText; + }; + + RtfBookReaderState myCurrentState; + std::stack<RtfBookReaderState> myStateStack; +}; + +inline RtfBookReader::~RtfBookReader() {} + +#endif /* __RTFBOOKREADER_H__ */ diff --git a/reader/src/formats/rtf/RtfDescriptionReader.cpp b/reader/src/formats/rtf/RtfDescriptionReader.cpp new file mode 100644 index 0000000..571e66b --- /dev/null +++ b/reader/src/formats/rtf/RtfDescriptionReader.cpp @@ -0,0 +1,100 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is 
free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#include <ZLInputStream.h> + +#include "RtfDescriptionReader.h" + +#include "../FormatPlugin.h" +#include "../../library/Book.h" +#include "../../library/Author.h" + +RtfDescriptionReader::RtfDescriptionReader(Book &book) : RtfReader(book.encoding()), myBook(book) { +} + +void RtfDescriptionReader::setEncoding(int code) { + ZLEncodingCollection &collection = ZLEncodingCollection::Instance(); + ZLEncodingConverterInfoPtr info = collection.info(code); + if (!info.isNull()) { + myConverter = info->createConverter(); + myBook.setEncoding(info->name()); + } else { + myConverter = collection.defaultConverter(); + } +} + +bool RtfDescriptionReader::readDocument(const ZLFile &file) { + myDoRead = false; + bool code = RtfReader::readDocument(file); + if (myBook.encoding().empty()) { + myBook.setEncoding(PluginCollection::Instance().DefaultEncodingOption.value()); + } + return code; +} + +void RtfDescriptionReader::addCharData(const char *data, std::size_t len, bool convert) { + if (myDoRead && len > 0) { + if (convert) { + myConverter->convert(myBuffer, data, data + len); + } else { + myBuffer.append(data, len); + } + } +} + +void RtfDescriptionReader::switchDestination(DestinationType destination, bool on) { + switch (destination) { + case DESTINATION_INFO: + if 
(!on) { + interrupt(); + } + break; + case DESTINATION_TITLE: + myDoRead = on; + if (!on) { + myBook.setTitle(myBuffer); + myBuffer.erase(); + } + break; + case DESTINATION_AUTHOR: + myDoRead = on; + if (!on) { + myBook.addAuthor(myBuffer); + myBuffer.erase(); + } + break; + default: + break; + } + if (!myBook.title().empty() && !myBook.authors().empty() && !myBook.encoding().empty()) { + interrupt(); + } +} + +void RtfDescriptionReader::insertImage(shared_ptr<ZLMimeType>, const std::string&, std::size_t, std::size_t) { +} + +void RtfDescriptionReader::setFontProperty(FontProperty) { +} + +void RtfDescriptionReader::newParagraph() { +} + +void RtfDescriptionReader::setAlignment() { +} diff --git a/reader/src/formats/rtf/RtfDescriptionReader.h b/reader/src/formats/rtf/RtfDescriptionReader.h new file mode 100644 index 0000000..ff4ffa1 --- /dev/null +++ b/reader/src/formats/rtf/RtfDescriptionReader.h @@ -0,0 +1,55 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ */ + +#ifndef __RTFDESCRIPTIONREADER_H__ +#define __RTFDESCRIPTIONREADER_H__ + +#include <string> + +#include "RtfReader.h" + +class Book; + +class RtfDescriptionReader : public RtfReader { + +public: + RtfDescriptionReader(Book &book); + ~RtfDescriptionReader(); + + bool readDocument(const ZLFile &file); + + void setEncoding(int code); + void setAlignment(); + void switchDestination(DestinationType destination, bool on); + void addCharData(const char *data, std::size_t len, bool convert); + void insertImage(shared_ptr<ZLMimeType> mimeType, const std::string &fileName, std::size_t startOffset, std::size_t size); + + void setFontProperty(FontProperty property); + void newParagraph(); + +private: + Book &myBook; + + bool myDoRead; + std::string myBuffer; +}; + +inline RtfDescriptionReader::~RtfDescriptionReader() {} + +#endif /* __RTFDESCRIPTIONREADER_H__ */ diff --git a/reader/src/formats/rtf/RtfPlugin.cpp b/reader/src/formats/rtf/RtfPlugin.cpp new file mode 100644 index 0000000..42ce39b --- /dev/null +++ b/reader/src/formats/rtf/RtfPlugin.cpp @@ -0,0 +1,63 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ */ + +#include <ZLStringUtil.h> +#include <ZLFile.h> +#include <ZLInputStream.h> + +#include "RtfPlugin.h" +#include "RtfDescriptionReader.h" +#include "RtfBookReader.h" +#include "RtfReaderStream.h" + +#include "../../bookmodel/BookModel.h" +#include "../../library/Book.h" + +bool RtfPlugin::providesMetaInfo() const { + return false; +} + +bool RtfPlugin::acceptsFile(const ZLFile &file) const { + return file.extension() == "rtf"; +} + +bool RtfPlugin::readMetaInfo(Book &book) const { + shared_ptr<ZLInputStream> stream = new RtfReaderStream(book.file(), 50000); + + if (stream.isNull()) { + return false; + } + + detectEncodingAndLanguage(book, *stream); + + if (!RtfDescriptionReader(book).readDocument(book.file())) { + return false; + } + + return true; +} + +bool RtfPlugin::readModel(BookModel &model) const { + const Book &book = *model.book(); + return RtfBookReader(model, book.encoding()).readDocument(book.file()); +} +bool RtfPlugin::readLanguageAndEncoding(Book &book) const { + (void)book; + return true; +} diff --git a/reader/src/formats/rtf/RtfPlugin.h b/reader/src/formats/rtf/RtfPlugin.h new file mode 100644 index 0000000..cb3ef9d --- /dev/null +++ b/reader/src/formats/rtf/RtfPlugin.h @@ -0,0 +1,35 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#ifndef __RTFPLUGIN_H__ +#define __RTFPLUGIN_H__ + +#include "../FormatPlugin.h" + +class RtfPlugin : public FormatPlugin { + +public: + bool providesMetaInfo() const; + bool acceptsFile(const ZLFile &file) const; + bool readMetaInfo(Book &book) const; + bool readLanguageAndEncoding(Book &book) const; + bool readModel(BookModel &model) const; +}; + +#endif /* __RTFPLUGIN_H__ */ diff --git a/reader/src/formats/rtf/RtfReader.cpp b/reader/src/formats/rtf/RtfReader.cpp new file mode 100644 index 0000000..91fea0c --- /dev/null +++ b/reader/src/formats/rtf/RtfReader.cpp @@ -0,0 +1,470 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ */ + +#include <cstdlib> +#include <cctype> + +#include <ZLFile.h> +#include <ZLInputStream.h> + +#include "RtfReader.h" + +std::map<std::string, RtfCommand*> RtfReader::ourKeywordMap; + +static const int rtfStreamBufferSize = 4096; + +RtfReader::RtfReader(const std::string &encoding) : EncodedTextReader(encoding) { + myNextImageMimeType = ZLMimeType::EMPTY; +} + +RtfReader::~RtfReader() { +} + +RtfCommand::~RtfCommand() { +} + +void RtfDummyCommand::run(RtfReader&, int*) const { +} + +void RtfNewParagraphCommand::run(RtfReader &reader, int*) const { + reader.newParagraph(); +} + +RtfFontPropertyCommand::RtfFontPropertyCommand(RtfReader::FontProperty property) : myProperty(property) { +} + +void RtfFontPropertyCommand::run(RtfReader &reader, int *parameter) const { + const bool start = (parameter == 0) || (*parameter != 0); + switch (myProperty) { + case RtfReader::FONT_BOLD: + if (reader.myState.Bold != start) { + reader.myState.Bold = start; + reader.setFontProperty(RtfReader::FONT_BOLD); + } + break; + case RtfReader::FONT_ITALIC: + if (reader.myState.Italic != start) { + reader.myState.Italic = start; + reader.setFontProperty(RtfReader::FONT_ITALIC); + } + break; + case RtfReader::FONT_UNDERLINED: + if (reader.myState.Underlined != start) { + reader.myState.Underlined = start; + reader.setFontProperty(RtfReader::FONT_UNDERLINED); + } + break; + } +} + +RtfAlignmentCommand::RtfAlignmentCommand(ZLTextAlignmentType alignment) : myAlignment(alignment) { +} + +void RtfAlignmentCommand::run(RtfReader &reader, int*) const { + if (reader.myState.Alignment != myAlignment) { + reader.myState.Alignment = myAlignment; + reader.setAlignment(); + } +} + +RtfCharCommand::RtfCharCommand(const std::string &chr) : myChar(chr) { +} + +void RtfCharCommand::run(RtfReader &reader, int*) const { + reader.processCharData(myChar.data(), myChar.length(), false); +} + +RtfDestinationCommand::RtfDestinationCommand(RtfReader::DestinationType destination) : myDestination(destination) { +} 
+ +void RtfDestinationCommand::run(RtfReader &reader, int*) const { + if (reader.myState.Destination == myDestination) { + return; + } + reader.myState.Destination = myDestination; + if (myDestination == RtfReader::DESTINATION_PICTURE) { + reader.myState.ReadDataAsHex = true; + reader.myNextImageMimeType = ZLMimeType::EMPTY; + } + reader.switchDestination(myDestination, true); +} + +void RtfStyleCommand::run(RtfReader &reader, int*) const { + if (reader.myState.Destination == RtfReader::DESTINATION_STYLESHEET) { + //std::cerr << "Add style index: " << val << "\n"; + + //sprintf(style_attributes[0], "%i", val); + } else /*if (myState.Destination == rdsContent)*/ { + //std::cerr << "Set style index: " << val << "\n"; + + //sprintf(style_attributes[0], "%i", val); + } +} + +void RtfCodepageCommand::run(RtfReader &reader, int *parameter) const { + if (parameter != 0) { + reader.setEncoding(*parameter); + } +} + +void RtfSpecialCommand::run(RtfReader &reader, int*) const { + reader.mySpecialMode = true; +} + +RtfPictureCommand::RtfPictureCommand(shared_ptr<ZLMimeType> mimeType) : myMimeType(mimeType) { +} + +void RtfPictureCommand::run(RtfReader &reader, int*) const { + reader.myNextImageMimeType = myMimeType; +} + +void RtfFontResetCommand::run(RtfReader &reader, int*) const { + if (reader.myState.Bold) { + reader.myState.Bold = false; + reader.setFontProperty(RtfReader::FONT_BOLD); + } + if (reader.myState.Italic) { + reader.myState.Italic = false; + reader.setFontProperty(RtfReader::FONT_ITALIC); + } + if (reader.myState.Underlined) { + reader.myState.Underlined = false; + reader.setFontProperty(RtfReader::FONT_UNDERLINED); + } +} + +void RtfReader::addAction(const std::string &tag, RtfCommand *command) { + ourKeywordMap.insert(std::make_pair(tag, command)); +} + +void RtfReader::fillKeywordMap() { + if (ourKeywordMap.empty()) { + addAction("*", new RtfSpecialCommand()); + addAction("ansicpg", new RtfCodepageCommand()); + + static const char *keywordsToSkip[] = 
{"buptim", "colortbl", "comment", "creatim", "doccomm", "fonttbl", "footer", "footerf", "footerl", "footerr", "ftncn", "ftnsep", "ftnsepc", "header", "headerf", "headerl", "headerr", "keywords", "operator", "printim", "private1", "revtim", "rxe", "subject", "tc", "txe", "xe", 0}; + RtfCommand *skipCommand = new RtfDestinationCommand(RtfReader::DESTINATION_SKIP); + for (const char **i = keywordsToSkip; *i != 0; ++i) { + addAction(*i, skipCommand); + } + addAction("shppict", new RtfDummyCommand()); + addAction("info", new RtfDestinationCommand(RtfReader::DESTINATION_INFO)); + addAction("title", new RtfDestinationCommand(RtfReader::DESTINATION_TITLE)); + addAction("author", new RtfDestinationCommand(RtfReader::DESTINATION_AUTHOR)); + addAction("pict", new RtfDestinationCommand(RtfReader::DESTINATION_PICTURE)); + addAction("stylesheet", new RtfDestinationCommand(RtfReader::DESTINATION_STYLESHEET)); + addAction("footnote", new RtfDestinationCommand(RtfReader::DESTINATION_FOOTNOTE)); + + RtfCommand *newParagraphCommand = new RtfNewParagraphCommand(); + addAction("\n", newParagraphCommand); + addAction("\r", newParagraphCommand); + addAction("par", newParagraphCommand); + + addAction("\x09", new RtfCharCommand("\x09")); + addAction("_", new RtfCharCommand("-")); + addAction("\\", new RtfCharCommand("\\")); + addAction("{", new RtfCharCommand("{")); + addAction("}", new RtfCharCommand("}")); + addAction("bullet", new RtfCharCommand("\xE2\x80\xA2")); // • + addAction("endash", new RtfCharCommand("\xE2\x80\x93")); // – + addAction("emdash", new RtfCharCommand("\xE2\x80\x94")); // — + addAction("~", new RtfCharCommand("\xC0\xA0")); // + addAction("enspace", new RtfCharCommand("\xE2\x80\x82")); //   + addAction("emspace", new RtfCharCommand("\xE2\x80\x83")); //   + addAction("lquote", new RtfCharCommand("\xE2\x80\x98")); // ‘ + addAction("rquote", new RtfCharCommand("\xE2\x80\x99")); // ’ + addAction("ldblquote", new RtfCharCommand("\xE2\x80\x9C")); // “ + 
addAction("rdblquote", new RtfCharCommand("\xE2\x80\x9D")); // ” + + addAction("jpegblip", new RtfPictureCommand(ZLMimeType::IMAGE_JPEG)); + addAction("pngblip", new RtfPictureCommand(ZLMimeType::IMAGE_PNG)); + + addAction("s", new RtfStyleCommand()); + + addAction("qc", new RtfAlignmentCommand(ALIGN_CENTER)); + addAction("ql", new RtfAlignmentCommand(ALIGN_LEFT)); + addAction("qr", new RtfAlignmentCommand(ALIGN_RIGHT)); + addAction("qj", new RtfAlignmentCommand(ALIGN_JUSTIFY)); + addAction("pard", new RtfAlignmentCommand(ALIGN_UNDEFINED)); + + addAction("b", new RtfFontPropertyCommand(RtfReader::FONT_BOLD)); + addAction("i", new RtfFontPropertyCommand(RtfReader::FONT_ITALIC)); + addAction("u", new RtfFontPropertyCommand(RtfReader::FONT_UNDERLINED)); + addAction("plain", new RtfFontResetCommand()); + } +} + +bool RtfReader::parseDocument() { + enum { + READ_NORMAL_DATA, + READ_BINARY_DATA, + READ_HEX_SYMBOL, + READ_KEYWORD, + READ_KEYWORD_PARAMETER, + READ_END_OF_FILE + } parserState = READ_NORMAL_DATA; + + std::string keyword; + std::string parameterString; + std::string hexString; + int imageStartOffset = -1; + + while (!myIsInterrupted) { + const char *ptr = myStreamBuffer; + const char *end = myStreamBuffer + myStream->read(myStreamBuffer, rtfStreamBufferSize); + if (ptr == end) { + break; + } + const char *dataStart = ptr; + bool readNextChar = true; + while (ptr != end) { + switch (parserState) { + case READ_END_OF_FILE: + if (*ptr != '}' && !std::isspace(*ptr)) { + return false; + } + break; + case READ_BINARY_DATA: + // TODO: optimize + processCharData(ptr, 1); + --myBinaryDataSize; + if (myBinaryDataSize == 0) { + parserState = READ_NORMAL_DATA; + } + break; + case READ_NORMAL_DATA: + switch (*ptr) { + case '{': + if (ptr > dataStart) { + processCharData(dataStart, ptr - dataStart); + } + dataStart = ptr + 1; + myStateStack.push(myState); + myState.ReadDataAsHex = false; + break; + case '}': + { + if (ptr > dataStart) { + processCharData(dataStart, ptr - 
dataStart); + } + dataStart = ptr + 1; + + if (imageStartOffset >= 0) { + if (ZLMimeType::EMPTY != myNextImageMimeType) { + const int imageSize = myStream->offset() + (ptr - end) - imageStartOffset; + insertImage(myNextImageMimeType, myFileName, imageStartOffset, imageSize); + } + imageStartOffset = -1; + } + + if (myStateStack.empty()) { + parserState = READ_END_OF_FILE; + break; + } + + if (myState.Destination != myStateStack.top().Destination) { + switchDestination(myState.Destination, false); + switchDestination(myStateStack.top().Destination, true); + } + + bool oldItalic = myState.Italic; + bool oldBold = myState.Bold; + bool oldUnderlined = myState.Underlined; + ZLTextAlignmentType oldAlignment = myState.Alignment; + myState = myStateStack.top(); + myStateStack.pop(); + + if (myState.Italic != oldItalic) { + setFontProperty(RtfReader::FONT_ITALIC); + } + if (myState.Bold != oldBold) { + setFontProperty(RtfReader::FONT_BOLD); + } + if (myState.Underlined != oldUnderlined) { + setFontProperty(RtfReader::FONT_UNDERLINED); + } + if (myState.Alignment != oldAlignment) { + setAlignment(); + } + + break; + } + case '\\': + if (ptr > dataStart) { + processCharData(dataStart, ptr - dataStart); + } + dataStart = ptr + 1; + keyword.erase(); + parserState = READ_KEYWORD; + break; + case 0x0d: + case 0x0a: // cr and lf are noise characters... 
+ if (ptr > dataStart) { + processCharData(dataStart, ptr - dataStart); + } + dataStart = ptr + 1; + break; + default: + if (myState.ReadDataAsHex) { + if (imageStartOffset == -1) { + imageStartOffset = myStream->offset() + (ptr - end); + } + } + break; + } + break; + case READ_HEX_SYMBOL: + hexString += *ptr; + if (hexString.size() == 2) { + char ch = std::strtol(hexString.c_str(), 0, 16); + hexString.erase(); + processCharData(&ch, 1); + parserState = READ_NORMAL_DATA; + dataStart = ptr + 1; + } + break; + case READ_KEYWORD: + if (!std::isalpha(*ptr)) { + if ((ptr == dataStart) && (keyword.empty())) { + if (*ptr == '\'') { + parserState = READ_HEX_SYMBOL; + } else { + keyword = *ptr; + processKeyword(keyword); + parserState = READ_NORMAL_DATA; + } + dataStart = ptr + 1; + } else { + keyword.append(dataStart, ptr - dataStart); + if (*ptr == '-' || std::isdigit(*ptr)) { + dataStart = ptr; + parserState = READ_KEYWORD_PARAMETER; + } else { + readNextChar = *ptr == ' '; + processKeyword(keyword); + parserState = READ_NORMAL_DATA; + dataStart = readNextChar ? ptr + 1 : ptr; + } + } + } + break; + case READ_KEYWORD_PARAMETER: + if (!std::isdigit(*ptr)) { + parameterString.append(dataStart, ptr - dataStart); + int parameter = std::atoi(parameterString.c_str()); + parameterString.erase(); + readNextChar = *ptr == ' '; + if ((keyword == "bin") && (parameter > 0)) { + myBinaryDataSize = parameter; + parserState = READ_BINARY_DATA; + } else { + processKeyword(keyword, ¶meter); + parserState = READ_NORMAL_DATA; + } + dataStart = readNextChar ? 
ptr + 1 : ptr; + } + break; + } + if (readNextChar) { + ++ptr; + } else { + readNextChar = true; + } + } + if (dataStart < end) { + switch (parserState) { + case READ_NORMAL_DATA: + processCharData(dataStart, end - dataStart); + case READ_KEYWORD: + keyword.append(dataStart, end - dataStart); + break; + case READ_KEYWORD_PARAMETER: + parameterString.append(dataStart, end - dataStart); + break; + default: + break; + } + } + } + + return myIsInterrupted || myStateStack.empty(); +} + +void RtfReader::processKeyword(const std::string &keyword, int *parameter) { + const bool wasSpecialMode = mySpecialMode; + mySpecialMode = false; + if (myState.Destination == RtfReader::DESTINATION_SKIP) { + return; + } + + std::map<std::string, RtfCommand*>::const_iterator it = ourKeywordMap.find(keyword); + + if (it == ourKeywordMap.end()) { + if (wasSpecialMode) { + myState.Destination = RtfReader::DESTINATION_SKIP; + } + return; + } + + it->second->run(*this, parameter); +} + +void RtfReader::processCharData(const char *data, std::size_t len, bool convert) { + if (myState.Destination != RtfReader::DESTINATION_SKIP) { + addCharData(data, len, convert); + } +} + +void RtfReader::interrupt() { + myIsInterrupted = true; +} + +bool RtfReader::readDocument(const ZLFile &file) { + myFileName = file.path(); + myStream = file.inputStream(); + if (myStream.isNull() || !myStream->open()) { + return false; + } + + fillKeywordMap(); + + myStreamBuffer = new char[rtfStreamBufferSize]; + + myIsInterrupted = false; + + mySpecialMode = false; + + myState.Alignment = ALIGN_UNDEFINED; + myState.Italic = false; + myState.Bold = false; + myState.Underlined = false; + myState.Destination = RtfReader::DESTINATION_NONE; + myState.ReadDataAsHex = false; + + bool code = parseDocument(); + + while (!myStateStack.empty()) { + myStateStack.pop(); + } + + delete[] myStreamBuffer; + myStream->close(); + + return code; +} diff --git a/reader/src/formats/rtf/RtfReader.h b/reader/src/formats/rtf/RtfReader.h new file 
mode 100644 index 0000000..10b037a --- /dev/null +++ b/reader/src/formats/rtf/RtfReader.h @@ -0,0 +1,209 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#ifndef __RTFREADER_H__ +#define __RTFREADER_H__ + +#include <string> +#include <map> +#include <stack> +#include <ZLMimeType.h> + +#include <ZLEncodingConverter.h> + +#include <ZLTextAlignmentType.h> + +#include "../EncodedTextReader.h" + +class ZLFile; +class ZLInputStream; +class RtfCommand; + +class RtfReader : public EncodedTextReader { + +private: + static void fillKeywordMap(); + static void addAction(const std::string &tag, RtfCommand *command); + +private: + static std::map<std::string, RtfCommand*> ourKeywordMap; + +protected: + RtfReader(const std::string &encoding); + virtual ~RtfReader(); + +public: + virtual bool readDocument(const ZLFile &file); + +protected: + enum DestinationType { + DESTINATION_NONE, + DESTINATION_SKIP, + DESTINATION_INFO, + DESTINATION_TITLE, + DESTINATION_AUTHOR, + DESTINATION_PICTURE, + DESTINATION_STYLESHEET, + DESTINATION_FOOTNOTE, + }; + + enum FontProperty { + FONT_BOLD, + FONT_ITALIC, + FONT_UNDERLINED + }; + + virtual void addCharData(const char *data, std::size_t len, bool convert) = 0; + virtual void 
insertImage(shared_ptr<ZLMimeType> mimeType, const std::string &fileName, std::size_t startOffset, std::size_t size) = 0; + virtual void setEncoding(int code) = 0; + virtual void switchDestination(DestinationType destination, bool on) = 0; + virtual void setAlignment() = 0; + virtual void setFontProperty(FontProperty property) = 0; + virtual void newParagraph() = 0; + + void interrupt(); + +private: + bool parseDocument(); + void processKeyword(const std::string &keyword, int *parameter = 0); + void processCharData(const char *data, std::size_t len, bool convert = true); + +protected: + struct RtfReaderState { + bool Bold; + bool Italic; + bool Underlined; + ZLTextAlignmentType Alignment; + DestinationType Destination; + + bool ReadDataAsHex; + }; + + RtfReaderState myState; + +private: + bool mySpecialMode; + + std::string myFileName; + shared_ptr<ZLInputStream> myStream; + char *myStreamBuffer; + + std::stack<RtfReaderState> myStateStack; + + int myBinaryDataSize; + shared_ptr<ZLMimeType> myNextImageMimeType; + + int myIsInterrupted; + +friend class RtfNewParagraphCommand; +friend class RtfFontPropertyCommand; +friend class RtfAlignmentCommand; +friend class RtfCharCommand; +friend class RtfDestinationCommand; +friend class RtfStyleCommand; +friend class RtfSpecialCommand; +friend class RtfPictureCommand; +friend class RtfFontResetCommand; +friend class RtfCodepageCommand; +}; + +class RtfCommand { +protected: + virtual ~RtfCommand(); + +public: + virtual void run(RtfReader &reader, int *parameter) const = 0; +}; + +class RtfDummyCommand : public RtfCommand { +public: + void run(RtfReader &reader, int *parameter) const; +}; + +class RtfNewParagraphCommand : public RtfCommand { +public: + void run(RtfReader &reader, int *parameter) const; +}; + +class RtfFontPropertyCommand : public RtfCommand { + +public: + RtfFontPropertyCommand(RtfReader::FontProperty property); + void run(RtfReader &reader, int *parameter) const; + +private: + RtfReader::FontProperty 
myProperty; +}; + +class RtfAlignmentCommand : public RtfCommand { +public: + RtfAlignmentCommand(ZLTextAlignmentType alignment); + void run(RtfReader &reader, int *parameter) const; + +private: + ZLTextAlignmentType myAlignment; +}; + +class RtfCharCommand : public RtfCommand { +public: + RtfCharCommand(const std::string &chr); + void run(RtfReader &reader, int *parameter) const; + +private: + std::string myChar; +}; + +class RtfDestinationCommand : public RtfCommand { +public: + RtfDestinationCommand(RtfReader::DestinationType dest); + void run(RtfReader &reader, int *parameter) const; + +private: + RtfReader::DestinationType myDestination; +}; + +class RtfStyleCommand : public RtfCommand { +public: + void run(RtfReader &reader, int *parameter) const; +}; + +class RtfSpecialCommand : public RtfCommand { + void run(RtfReader &reader, int *parameter) const; +}; + +class RtfPictureCommand : public RtfCommand { +public: + RtfPictureCommand(shared_ptr<ZLMimeType> mimeType); + void run(RtfReader &reader, int *parameter) const; + +private: + const shared_ptr<ZLMimeType> myMimeType; +}; + +class RtfFontResetCommand : public RtfCommand { +public: + void run(RtfReader &reader, int *parameter) const; +}; + +class RtfCodepageCommand : public RtfCommand { +public: + void run(RtfReader &reader, int *parameter) const; +}; + +#endif /* __RTFREADER_H__ */ diff --git a/reader/src/formats/rtf/RtfReaderStream.cpp b/reader/src/formats/rtf/RtfReaderStream.cpp new file mode 100644 index 0000000..f4537f7 --- /dev/null +++ b/reader/src/formats/rtf/RtfReaderStream.cpp @@ -0,0 +1,175 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#include <cstring> +#include <cstdlib> +#include <string> + +#include "RtfReader.h" +#include "RtfReaderStream.h" + +class RtfTextOnlyReader : public RtfReader { + +public: + RtfTextOnlyReader(char *buffer, std::size_t maxSize); + ~RtfTextOnlyReader(); + std::size_t readSize() const; + +protected: + void addCharData(const char *data, std::size_t len, bool convert); + void insertImage(shared_ptr<ZLMimeType> mimeType, const std::string &fileName, std::size_t startOffset, std::size_t size); + void setEncoding(int code); + void switchDestination(DestinationType destination, bool on); + void setAlignment(); + void setFontProperty(FontProperty property); + void newParagraph(); + + void interrupt(); + +private: + struct RtfTextOnlyReaderState { + bool ReadText; + }; + + RtfTextOnlyReaderState myCurrentState; + +private: + char* myBuffer; + const std::size_t myMaxSize; + std::size_t myFilledSize; +}; + +RtfTextOnlyReader::RtfTextOnlyReader(char *buffer, std::size_t maxSize) : RtfReader(std::string()), myBuffer(buffer), myMaxSize(maxSize), myFilledSize(0) { + myCurrentState.ReadText = true; +} + +RtfTextOnlyReader::~RtfTextOnlyReader() { +} + +void RtfTextOnlyReader::addCharData(const char *data, std::size_t len, bool) { + if (myBuffer == 0) { + return; + } + if (myCurrentState.ReadText) { + if (myFilledSize < myMaxSize) { + len = std::min((std::size_t)len, myMaxSize - myFilledSize); + std::memcpy(myBuffer + myFilledSize, data, len); + myFilledSize += len; + } + if (myFilledSize < myMaxSize) { + 
myBuffer[myFilledSize++]=' '; + } else { + interrupt(); + } + } +} + +std::size_t RtfTextOnlyReader::readSize() const { + return myFilledSize; +} + +void RtfTextOnlyReader::insertImage(shared_ptr<ZLMimeType>, const std::string&, std::size_t, std::size_t) { +} + +void RtfTextOnlyReader::setEncoding(int) { +} + +void RtfTextOnlyReader::switchDestination(DestinationType destination, bool on) { + switch (destination) { + case DESTINATION_NONE: + break; + case DESTINATION_SKIP: + case DESTINATION_INFO: + case DESTINATION_TITLE: + case DESTINATION_AUTHOR: + case DESTINATION_STYLESHEET: + myCurrentState.ReadText = !on; + break; + case DESTINATION_PICTURE: + myCurrentState.ReadText = !on; + break; + case DESTINATION_FOOTNOTE: + if (on) { + myCurrentState.ReadText = true; + } + break; + } +} + +void RtfTextOnlyReader::setAlignment() { +} + +void RtfTextOnlyReader::setFontProperty(FontProperty) { +} + +void RtfTextOnlyReader::newParagraph() { +} + +void RtfTextOnlyReader::interrupt() { +} + +RtfReaderStream::RtfReaderStream(const ZLFile& file, std::size_t maxSize) : myFile(file), myBuffer(0), mySize(maxSize) { +} + +RtfReaderStream::~RtfReaderStream() { + close(); +} + +bool RtfReaderStream::open() { + if (mySize != 0) { + myBuffer = new char[mySize]; + } + RtfTextOnlyReader reader(myBuffer, mySize); + reader.readDocument(myFile); + mySize = reader.readSize(); + myOffset = 0; + return true; +} + +std::size_t RtfReaderStream::read(char *buffer, std::size_t maxSize) { + maxSize = std::min(maxSize, mySize - myOffset); + if ((buffer != 0) && (myBuffer !=0)) { + std::memcpy(buffer, myBuffer + myOffset, maxSize); + } + myOffset += maxSize; + return maxSize; +} + +void RtfReaderStream::close() { + if (myBuffer != 0) { + delete[] myBuffer; + myBuffer = 0; + } +} + +void RtfReaderStream::seek(int offset, bool absoluteOffset) { + if (!absoluteOffset) { + offset += myOffset; + } + myOffset = std::min(mySize, (std::size_t)std::max(0, offset)); +} + +std::size_t RtfReaderStream::offset() 
const { + return myOffset; +} + +std::size_t RtfReaderStream::sizeOfOpened() { + return mySize; +} + diff --git a/reader/src/formats/rtf/RtfReaderStream.h b/reader/src/formats/rtf/RtfReaderStream.h new file mode 100644 index 0000000..71555b4 --- /dev/null +++ b/reader/src/formats/rtf/RtfReaderStream.h @@ -0,0 +1,50 @@ +/* + * Copyright (C) 2008-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ */ + +#ifndef __RTFREADERSTREAM_H__ +#define __RTFREADERSTREAM_H__ + +#include <string> + +#include <ZLFile.h> +#include <ZLInputStream.h> + +class RtfReaderStream : public ZLInputStream { + +public: + RtfReaderStream(const ZLFile& file, std::size_t maxSize); + ~RtfReaderStream(); + +private: + bool open(); + std::size_t read(char *buffer, std::size_t maxSize); + void close(); + + void seek(int offset, bool absoluteOffset); + std::size_t offset() const; + std::size_t sizeOfOpened(); + +private: + const ZLFile myFile; + char *myBuffer; + std::size_t mySize; + std::size_t myOffset; +}; + +#endif /* __RTFREADERSTREAM_H__ */ diff --git a/reader/src/formats/tcr/PPLBookReader.cpp b/reader/src/formats/tcr/PPLBookReader.cpp new file mode 100644 index 0000000..9b7d271 --- /dev/null +++ b/reader/src/formats/tcr/PPLBookReader.cpp @@ -0,0 +1,129 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ */ + +#include <cstring> +#include <cctype> + +#include "PPLBookReader.h" +#include <ZLInputStream.h> + +static const std::size_t BUFFER_SIZE = 2048; + +PPLBookReader::PPLBookReader(BookModel &model, const std::string &encoding) : EncodedTextReader(encoding), myModelReader(model) { + myBuffer = new char[BUFFER_SIZE + 1]; +} + +PPLBookReader::~PPLBookReader() { + delete[] myBuffer; +} + +bool PPLBookReader::currentParagraphIsEmpty() const { + const char *ptr = myCurrentParagraph.data(); + const char *end = ptr + myCurrentParagraph.length(); + for (; ptr < end; ++ptr) { + if (!std::isspace((unsigned char)*ptr)) { + return false; + } + } + return true; +} + +void PPLBookReader::addParagraph() { + static const std::string END_OF_TEXT = "<* >"; + if (!myCurrentParagraph.empty()) { + if (currentParagraphIsEmpty()) { + ++myEmptyLineCounter; + if (myEmptyLineCounter >= 2) { + myModelReader.beginParagraph(ZLTextParagraph::EMPTY_LINE_PARAGRAPH); + myModelReader.endParagraph(); + } + } else if (myEmptyLineCounter < 2) { + myModelReader.beginParagraph(); + myModelReader.addControl(TITLE, true); + myModelReader.addData(myCurrentParagraph); + myModelReader.endParagraph(); + } else if (myCurrentParagraph[0] == 9) { + myModelReader.beginParagraph(); + myModelReader.addData(myCurrentParagraph); + myModelReader.endParagraph(); + } else if ((myCurrentParagraph.length() >= 2) && + (myCurrentParagraph[0] == '*') && + (myCurrentParagraph[1] == ' ')) { + myCurrentParagraph.erase(0, 2); + myModelReader.insertEndOfSectionParagraph(); + myModelReader.beginContentsParagraph(); + myModelReader.addContentsData(myCurrentParagraph); + myModelReader.endContentsParagraph(); + myModelReader.beginParagraph(); + myModelReader.addControl(SECTION_TITLE, true); + myModelReader.addData(myCurrentParagraph); + myModelReader.endParagraph(); + } else if (myCurrentParagraph.substr(0, 4) != END_OF_TEXT) { + myModelReader.beginParagraph(); + myModelReader.addControl(SUBTITLE, true); + 
myModelReader.addData(myCurrentParagraph); + myModelReader.endParagraph(); + } + myCurrentParagraph.erase(); + } +} + +bool PPLBookReader::readDocument(ZLInputStream &stream) { + if (!stream.open()) { + return false; + } + + myModelReader.setMainTextModel(); + myModelReader.pushKind(REGULAR); + myCurrentParagraph.erase(); + myEmptyLineCounter = 0; + + // "PPL\r\n" + stream.seek(5, false); + + std::size_t size; + do { + size = stream.read(myBuffer, BUFFER_SIZE); + myBuffer[size] = '\0'; + + const char *start = myBuffer; + const char *end = myBuffer + size; + const char *eol; + do { + eol = std::strchr(start, '\n'); + if (eol != 0) { + if (start < eol) { + myConverter->convert(myCurrentParagraph, start, eol); + } + addParagraph(); + start = eol + 1; + } else { + if (start < end) { + myConverter->convert(myCurrentParagraph, start, end); + } + } + } while (eol != 0); + } while (size == BUFFER_SIZE); + + addParagraph(); + + stream.close(); + + return true; +} diff --git a/reader/src/formats/tcr/PPLBookReader.h b/reader/src/formats/tcr/PPLBookReader.h new file mode 100644 index 0000000..98c7f9d --- /dev/null +++ b/reader/src/formats/tcr/PPLBookReader.h @@ -0,0 +1,51 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ */ + +#ifndef __PPLBOOKREADER_H__ +#define __PPLBOOKREADER_H__ + +#include <shared_ptr.h> +#include <ZLEncodingConverter.h> +#include "../../bookmodel/BookReader.h" +#include "../EncodedTextReader.h" + +class ZLInputStream; +class BookModel; + +class PPLBookReader : public EncodedTextReader { + +public: + PPLBookReader(BookModel &model, const std::string &encoding); + ~PPLBookReader(); + + bool readDocument(ZLInputStream &stream); + +private: + bool currentParagraphIsEmpty() const; + void addParagraph(); + +private: + BookReader myModelReader; + + char *myBuffer; + std::string myCurrentParagraph; + int myEmptyLineCounter; +}; + +#endif /* __PPLBOOKREADER_H__ */ diff --git a/reader/src/formats/tcr/TcrPlugin.cpp b/reader/src/formats/tcr/TcrPlugin.cpp new file mode 100644 index 0000000..8ee0f14 --- /dev/null +++ b/reader/src/formats/tcr/TcrPlugin.cpp @@ -0,0 +1,82 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ */ + +#include <ZLFile.h> +#include <ZLInputStream.h> + +#include "TcrPlugin.h" +#include "TcrStream.h" +#include "PPLBookReader.h" +#include "../util/TextFormatDetector.h" +#include "../txt/TxtBookReader.h" +#include "../html/HtmlBookReader.h" +#include "../txt/PlainTextFormat.h" + +#include "../../bookmodel/BookModel.h" +#include "../../library/Book.h" + +bool TcrPlugin::acceptsFile(const ZLFile &file) const { + return file.extension() == "tcr"; +} + +bool TcrPlugin::readMetaInfo(Book &book) const { + shared_ptr<ZLInputStream> stream = new TcrStream(book.file()); + detectEncodingAndLanguage(book, *stream); + if (book.encoding().empty()) { + return false; + } + + return true; +} + +bool TcrPlugin::readLanguageAndEncoding(Book &book) const { + (void)book; + return true; +} + +bool TcrPlugin::readModel(BookModel &model) const { + const Book &book = *model.book(); + const ZLFile &file = book.file(); + + shared_ptr<ZLInputStream> stream = new TcrStream(file); + + PlainTextFormat format(file); + if (!format.initialized()) { + PlainTextFormatDetector detector; + detector.detect(*stream, format); + } + + const std::string &encoding = book.encoding(); + if (TextFormatDetector().isPPL(*stream)) { + PPLBookReader(model, encoding).readDocument(*stream); + } else if (TextFormatDetector().isHtml(*stream)) { + HtmlBookReader("", model, format, encoding).readDocument(*stream); + } else { + TxtBookReader(model, format, encoding).readDocument(*stream); + } + return true; +} + +FormatInfoPage *TcrPlugin::createInfoPage(ZLOptionsDialog &dialog, const ZLFile &file) { + shared_ptr<ZLInputStream> stream = new TcrStream(file); + if (TextFormatDetector().isPPL(*stream)) { + return 0; + } + return new PlainTextInfoPage(dialog, file, ZLResourceKey("Text"), !TextFormatDetector().isHtml(*stream)); +} diff --git a/reader/src/formats/tcr/TcrPlugin.h b/reader/src/formats/tcr/TcrPlugin.h new file mode 100644 index 0000000..9655892 --- /dev/null +++ b/reader/src/formats/tcr/TcrPlugin.h @@ -0,0 
+1,43 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#ifndef __TCRPLUGIN_H__ +#define __TCRPLUGIN_H__ + +#include "../FormatPlugin.h" + +class TcrPlugin : public FormatPlugin { + +public: + TcrPlugin(); + ~TcrPlugin(); + + bool providesMetaInfo() const; + bool acceptsFile(const ZLFile &file) const; + bool readMetaInfo(Book &book) const; + bool readLanguageAndEncoding(Book &book) const; + bool readModel(BookModel &model) const; + FormatInfoPage *createInfoPage(ZLOptionsDialog &dialog, const ZLFile &file); +}; + +inline TcrPlugin::TcrPlugin() {} +inline TcrPlugin::~TcrPlugin() {} +inline bool TcrPlugin::providesMetaInfo() const { return false; } + +#endif /* __TCRPLUGIN_H__ */ diff --git a/reader/src/formats/tcr/TcrStream.cpp b/reader/src/formats/tcr/TcrStream.cpp new file mode 100644 index 0000000..cf4e540 --- /dev/null +++ b/reader/src/formats/tcr/TcrStream.cpp @@ -0,0 +1,125 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#include <cstring> +#include <algorithm> + +#include <ZLFile.h> +#include <ZLZDecompressor.h> + +#include "TcrStream.h" + +TcrStream::TcrStream(const ZLFile &file) : myBase(file.inputStream()) { +} + +TcrStream::~TcrStream() { + close(); +} + +bool TcrStream::open() { + close(); + if (myBase.isNull() || !myBase->open()) { + return false; + } + + char header[9]; + if (myBase->read(header, 9) != 9 || std::strncmp(header, "!!8-Bit!!", 9) != 0) { + myBase->close(); + return false; + } + + unsigned char entryLength; + char entryBuffer[255]; + for (int i = 0; i < 256; ++i) { + if (myBase->read((char*)&entryLength, 1) != 1 || + (entryLength > 0 && myBase->read(entryBuffer, entryLength) != entryLength)) { + myBase->close(); + return false; + } + if (entryLength > 0) { + myDictionary[i].append(entryBuffer, entryLength); + } + } + + return true; +} + +void TcrStream::close() { + if (!myBase.isNull()) { + myBase->close(); + } + for (int i = 0; i < 256; ++i) { + myDictionary[i].erase(); + } + myBuffer.erase(); +} + +std::size_t TcrStream::read(char *buffer, std::size_t maxSize) { + std::size_t size = 0; + if (myBuffer.length() > 0) { + size += std::min(maxSize, myBuffer.length()); + if (buffer != 0) { + std::strncpy(buffer, myBuffer.data(), size); + } + myBuffer.erase(0, size); + } + while (size < maxSize) { + unsigned char index; + if (myBase->read((char*)&index, 1) != 1) { + break; + } + std::size_t len = myDictionary[index].length(); + if (len > 0) { + std::size_t freeSize = maxSize - size; + if 
(buffer != 0) { + std::strncpy(buffer + size, myDictionary[index].data(), std::min(len, freeSize)); + } + size += std::min(len, freeSize); + if (len > freeSize) { + myBuffer = myDictionary[index].substr(freeSize); + } + } + } + myOffset += size; + return size; +} + +void TcrStream::seek(int offset, bool absoluteOffset) { + if (absoluteOffset) { + offset -= this->offset(); + } + if (offset > 0) { + read(0, offset); + } else if (offset < 0) { + offset += this->offset(); + open(); + if (offset >= 0) { + read(0, offset); + } + } +} + +std::size_t TcrStream::offset() const { + return myOffset; +} + +std::size_t TcrStream::sizeOfOpened() { + // TODO: implement + return 0; +} diff --git a/reader/src/formats/tcr/TcrStream.h b/reader/src/formats/tcr/TcrStream.h new file mode 100644 index 0000000..0a9d212 --- /dev/null +++ b/reader/src/formats/tcr/TcrStream.h @@ -0,0 +1,47 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ */ + +#ifndef __TCRSTREAM_H__ +#define __TCRSTREAM_H__ + +#include <ZLInputStream.h> + +class ZLFile; + +class TcrStream : public ZLInputStream { + +public: + TcrStream(const ZLFile &file); + virtual ~TcrStream(); + bool open(); + virtual void close(); + + std::size_t read(char *buffer, std::size_t maxSize); + void seek(int offset, bool absoluteOffset); + std::size_t offset() const; + std::size_t sizeOfOpened(); + +protected: + std::string myDictionary[256]; + std::string myBuffer; + shared_ptr<ZLInputStream> myBase; + std::size_t myOffset; +}; + +#endif /* __TCRSTREAM_H__ */ diff --git a/reader/src/formats/txt/PlainTextFormat.cpp b/reader/src/formats/txt/PlainTextFormat.cpp new file mode 100644 index 0000000..7c9360f --- /dev/null +++ b/reader/src/formats/txt/PlainTextFormat.cpp @@ -0,0 +1,253 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ */ + +#include <cctype> +#include <algorithm> + +#include <ZLOptions.h> +#include <ZLOptionsDialog.h> +#include <ZLOptionEntry.h> +#include <ZLFile.h> + +#include "PlainTextFormat.h" + +#include "../../options/FBCategoryKey.h" + +const std::string OPTION_Initialized = "Initialized"; +const std::string OPTION_BreakType = "BreakType"; +const std::string OPTION_IgnoredIndent = "IgnoredIndent"; +const std::string OPTION_EmptyLinesBeforeNewSection = "EmptyLinesBeforeNewSection"; +const std::string OPTION_CreateContentsTable = "CreateContentsTable"; + +PlainTextFormat::PlainTextFormat(const ZLFile &file) : + InitializedOption(FBCategoryKey::BOOKS, file.path(), OPTION_Initialized, false), + BreakTypeOption(FBCategoryKey::BOOKS, file.path(), OPTION_BreakType, 1), + IgnoredIndentOption(FBCategoryKey::BOOKS, file.path(), OPTION_IgnoredIndent, 1, 100, 1), + EmptyLinesBeforeNewSectionOption(FBCategoryKey::BOOKS, file.path(), OPTION_EmptyLinesBeforeNewSection, 1, 100, 1), + CreateContentsTableOption(FBCategoryKey::BOOKS, file.path(), OPTION_CreateContentsTable, false) { +} + +PlainTextInfoPage::PlainTextInfoPage(ZLOptionsDialog &dialog, const ZLFile &file, const ZLResourceKey &key, bool showContentsEntry) : myFormat(file) { + if (!myFormat.initialized()) { + PlainTextFormatDetector detector; + shared_ptr<ZLInputStream> stream = file.inputStream(); + if (!stream.isNull()) { + detector.detect(*stream, myFormat); + } + } + + ZLDialogContent &tab = dialog.createTab(key); + + BreakTypeOptionEntry *breakEntry = new BreakTypeOptionEntry(*this, myFormat.BreakTypeOption); + myIgnoredIndentEntry = new ZLSimpleSpinOptionEntry(myFormat.IgnoredIndentOption, 1); + tab.addOption(ZLResourceKey("breakType"), breakEntry); + tab.addOption(ZLResourceKey("ignoreIndent"), myIgnoredIndentEntry); + breakEntry->onValueSelected(breakEntry->initialIndex()); + + if (showContentsEntry) { + CreateContentsTableOptionEntry *contentsTableEntry = new CreateContentsTableOptionEntry(*this, 
myFormat.CreateContentsTableOption); + myEmptyLinesBeforeNewSectionEntry = new ZLSimpleSpinOptionEntry(myFormat.EmptyLinesBeforeNewSectionOption, 1); + tab.addOption(ZLResourceKey("buildTOC"), contentsTableEntry); + tab.addOption(ZLResourceKey("emptyLines"), myEmptyLinesBeforeNewSectionEntry); + contentsTableEntry->onStateChanged(contentsTableEntry->initialState()); + } +} + +PlainTextInfoPage::~PlainTextInfoPage() { +} + +const int BUFFER_SIZE = 4096; + +void PlainTextFormatDetector::detect(ZLInputStream &stream, PlainTextFormat &format) { + if (!stream.open()) { + return; + } + + const unsigned int tableSize = 10; + + unsigned int lineCounter = 0; + int emptyLineCounter = -1; + unsigned int stringsWithLengthLessThan81Counter = 0; + unsigned int stringIndentTable[tableSize] = { 0 }; + unsigned int emptyLinesTable[tableSize] = { 0 }; + unsigned int emptyLinesBeforeShortStringTable[tableSize] = { 0 }; + + bool currentLineIsEmpty = true; + unsigned int currentLineLength = 0; + unsigned int currentLineIndent = 0; + int currentNumberOfEmptyLines = -1; + + char *buffer = new char[BUFFER_SIZE]; + int length; + char previous = 0; + do { + length = stream.read(buffer, BUFFER_SIZE); + const char *end = buffer + length; + for (const char *ptr = buffer; ptr != end; ++ptr) { + ++currentLineLength; + if (*ptr == '\n') { + ++lineCounter; + if (currentLineIsEmpty) { + ++emptyLineCounter; + ++currentNumberOfEmptyLines; + } else { + if (currentNumberOfEmptyLines >= 0) { + int index = std::min(currentNumberOfEmptyLines, (int)tableSize - 1); + emptyLinesTable[index]++; + if (currentLineLength < 51) { + emptyLinesBeforeShortStringTable[index]++; + } + } + currentNumberOfEmptyLines = -1; + } + if (currentLineLength < 81) { + ++stringsWithLengthLessThan81Counter; + } + if (!currentLineIsEmpty) { + stringIndentTable[std::min(currentLineIndent, tableSize - 1)]++; + } + + currentLineIsEmpty = true; + currentLineLength = 0; + currentLineIndent = 0; + } else if (*ptr == '\r') { + continue; + 
} else if (std::isspace((unsigned char)*ptr)) { + if (currentLineIsEmpty) { + ++currentLineIndent; + } + } else { + currentLineIsEmpty = false; + } + previous = *ptr; + } + } while (length == BUFFER_SIZE); + delete[] buffer; + + unsigned int nonEmptyLineCounter = lineCounter - emptyLineCounter; + + { + unsigned int indent = 0; + unsigned int lineWithIndent = 0; + for (; indent < tableSize; ++indent) { + lineWithIndent += stringIndentTable[indent]; + if (lineWithIndent > 0.1 * nonEmptyLineCounter) { + break; + } + } + format.IgnoredIndentOption.setValue(indent + 1); + } + + { + int breakType = 0; + breakType |= PlainTextFormat::BREAK_PARAGRAPH_AT_EMPTY_LINE; + if (stringsWithLengthLessThan81Counter < 0.3 * nonEmptyLineCounter) { + breakType |= PlainTextFormat::BREAK_PARAGRAPH_AT_NEW_LINE; + } else { + breakType |= PlainTextFormat::BREAK_PARAGRAPH_AT_LINE_WITH_INDENT; + } + format.BreakTypeOption.setValue(breakType); + } + + { + unsigned int max = 0; + unsigned index; + int emptyLinesBeforeNewSection = -1; + for (index = 2; index < tableSize; ++index) { + if (max < emptyLinesBeforeShortStringTable[index]) { + max = emptyLinesBeforeShortStringTable[index]; + emptyLinesBeforeNewSection = index; + } + } + if (emptyLinesBeforeNewSection > 0) { + for (index = tableSize - 1; index > 0; --index) { + emptyLinesTable[index - 1] += emptyLinesTable[index]; + emptyLinesBeforeShortStringTable[index - 1] += emptyLinesBeforeShortStringTable[index]; + } + for (index = emptyLinesBeforeNewSection; index < tableSize; ++index) { + if ((emptyLinesBeforeShortStringTable[index] > 2) && + (emptyLinesBeforeShortStringTable[index] > 0.7 * emptyLinesTable[index])) { + break; + } + } + emptyLinesBeforeNewSection = (index == tableSize) ? 
-1 : (int)index; + } + format.EmptyLinesBeforeNewSectionOption.setValue(emptyLinesBeforeNewSection); + format.CreateContentsTableOption.setValue(emptyLinesBeforeNewSection > 0); + } + + format.InitializedOption.setValue(true); +} + +BreakTypeOptionEntry::BreakTypeOptionEntry(PlainTextInfoPage &page, ZLIntegerOption &breakTypeOption) : myPage(page), myBreakTypeOption(breakTypeOption) { +} + +BreakTypeOptionEntry::~BreakTypeOptionEntry() { +} + +static std::vector<std::string> BREAK_TYPE_VALUES_VECTOR; + +int BreakTypeOptionEntry::initialIndex() const { + switch (myBreakTypeOption.value()) { + case PlainTextFormat::BREAK_PARAGRAPH_AT_NEW_LINE: + return 0; + case PlainTextFormat::BREAK_PARAGRAPH_AT_EMPTY_LINE: + return 1; + case PlainTextFormat::BREAK_PARAGRAPH_AT_EMPTY_LINE | PlainTextFormat::BREAK_PARAGRAPH_AT_LINE_WITH_INDENT: + default: + return 2; + } +} + +const std::string &BreakTypeOptionEntry::initialValue() const { + return values()[initialIndex()]; +} + +const std::vector<std::string> &BreakTypeOptionEntry::values() const { + if (BREAK_TYPE_VALUES_VECTOR.empty()) { + BREAK_TYPE_VALUES_VECTOR.push_back("New Line"); + BREAK_TYPE_VALUES_VECTOR.push_back("Empty Line"); + BREAK_TYPE_VALUES_VECTOR.push_back("Line With Indent"); + } + return BREAK_TYPE_VALUES_VECTOR; +} + +void BreakTypeOptionEntry::onAccept(const std::string &value) { + if (value == values()[0]) { + myBreakTypeOption.setValue(PlainTextFormat::BREAK_PARAGRAPH_AT_NEW_LINE); + } else if (value == values()[1]) { + myBreakTypeOption.setValue(PlainTextFormat::BREAK_PARAGRAPH_AT_EMPTY_LINE); + } else if (value == values()[2]) { + myBreakTypeOption.setValue(PlainTextFormat::BREAK_PARAGRAPH_AT_EMPTY_LINE | PlainTextFormat::BREAK_PARAGRAPH_AT_LINE_WITH_INDENT); + } +} + +void BreakTypeOptionEntry::onValueSelected(int index) { + myPage.myIgnoredIndentEntry->setVisible(index == 2); +} + +CreateContentsTableOptionEntry::CreateContentsTableOptionEntry(PlainTextInfoPage &page, ZLBooleanOption &option) : 
ZLSimpleBooleanOptionEntry(option), myPage(page) { +} + +CreateContentsTableOptionEntry::~CreateContentsTableOptionEntry() { +} + +void CreateContentsTableOptionEntry::onStateChanged(bool state) { + myPage.myEmptyLinesBeforeNewSectionEntry->setVisible(state); +} diff --git a/reader/src/formats/txt/PlainTextFormat.h b/reader/src/formats/txt/PlainTextFormat.h new file mode 100644 index 0000000..32ca258 --- /dev/null +++ b/reader/src/formats/txt/PlainTextFormat.h @@ -0,0 +1,112 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
// Plain-text parsing settings for one file, each stored in a ZL*Option member.
class PlainTextFormat {

public:
	// Bit flags; breakType() may hold a combination (the readers test it with '&').
	enum ParagraphBreakType {
		BREAK_PARAGRAPH_AT_NEW_LINE = 1,
		BREAK_PARAGRAPH_AT_EMPTY_LINE = 2,
		BREAK_PARAGRAPH_AT_LINE_WITH_INDENT = 4,
	};

	PlainTextFormat(const ZLFile &file);
	~PlainTextFormat() {}

	// Read-only views of the stored option values.
	bool initialized() const { return InitializedOption.value(); }
	int breakType() const { return BreakTypeOption.value(); }
	int ignoredIndent() const { return IgnoredIndentOption.value(); }
	int emptyLinesBeforeNewSection() const { return EmptyLinesBeforeNewSectionOption.value(); }
	bool createContentsTable() const { return CreateContentsTableOption.value(); }

private:
	ZLBooleanOption InitializedOption;
	ZLIntegerOption BreakTypeOption;
	ZLIntegerRangeOption IgnoredIndentOption;
	ZLIntegerRangeOption EmptyLinesBeforeNewSectionOption;
	ZLBooleanOption CreateContentsTableOption;

// The options are written by the settings page and by the detector.
friend class PlainTextInfoPage;
friend class PlainTextFormatDetector;
};

// Options-dialog page for the settings above.
class PlainTextInfoPage : public FormatInfoPage {

public:
	PlainTextInfoPage(ZLOptionsDialog &dialog, const ZLFile &file, const ZLResourceKey &key, bool showContentsEntry);
	~PlainTextInfoPage();

private:
	PlainTextFormat myFormat;

	// Spinner entries whose visibility is toggled by the two entry classes below.
	ZLSimpleSpinOptionEntry *myIgnoredIndentEntry;
	ZLSimpleSpinOptionEntry *myEmptyLinesBeforeNewSectionEntry;

friend class BreakTypeOptionEntry;
friend class CreateContentsTableOptionEntry;
};

// Fills a PlainTextFormat from the content of a stream.
class PlainTextFormatDetector {

public:
	PlainTextFormatDetector() {}
	~PlainTextFormatDetector() {}

	void detect(ZLInputStream &stream, PlainTextFormat &format);
};

// Combo-box entry that maps three labels to ParagraphBreakType combinations.
class BreakTypeOptionEntry : public ZLComboOptionEntry {

public:
	BreakTypeOptionEntry(PlainTextInfoPage &page, ZLIntegerOption &breakTypeOption);
	~BreakTypeOptionEntry();

	int initialIndex() const;
	const std::string &initialValue() const;
	const std::vector<std::string> &values() const;
	void onAccept(const std::string &value);
	void onValueSelected(int index);

private:
	// Non-owning references.
	PlainTextInfoPage &myPage;
	ZLIntegerOption &myBreakTypeOption;
};

// Check-box entry that also toggles the page's "empty lines before new section" spinner.
class CreateContentsTableOptionEntry : public ZLSimpleBooleanOptionEntry {

public:
	CreateContentsTableOptionEntry(PlainTextInfoPage &page, ZLBooleanOption &option);
	~CreateContentsTableOptionEntry();
	void onStateChanged(bool state);

private:
	// Non-owning reference.
	PlainTextInfoPage &myPage;
};
+ */ + +#include <cctype> + +#include "TxtBookReader.h" +#include "../../bookmodel/BookModel.h" + +TxtBookReader::TxtBookReader(BookModel &model, const PlainTextFormat &format, const std::string &encoding) : TxtReader(encoding), BookReader(model), myFormat(format) { +} + +void TxtBookReader::internalEndParagraph() { + if (!myLastLineIsEmpty) { + //myLineFeedCounter = 0; + myLineFeedCounter = -1; /* Fixed by Hatred: zero value was break LINE INDENT formater - + second line print with indent like new paragraf */ + } + myLastLineIsEmpty = true; + endParagraph(); +} + +bool TxtBookReader::characterDataHandler(std::string &str) { + const char *ptr = str.data(); + const char *end = ptr + str.length(); + for (; ptr != end; ++ptr) { + if (std::isspace((unsigned char)*ptr)) { + if (*ptr != '\t') { + ++mySpaceCounter; + } else { + mySpaceCounter += myFormat.ignoredIndent() + 1; // TODO: implement single option in PlainTextFormat + } + } else { + myLastLineIsEmpty = false; + break; + } + } + if (ptr != end) { + if ((myFormat.breakType() & PlainTextFormat::BREAK_PARAGRAPH_AT_LINE_WITH_INDENT) && + myNewLine && (mySpaceCounter > myFormat.ignoredIndent())) { + internalEndParagraph(); + beginParagraph(); + } + addData(str); + if (myInsideContentsParagraph) { + addContentsData(str); + } + myNewLine = false; + } + return true; +} + +bool TxtBookReader::newLineHandler() { + if (!myLastLineIsEmpty) { + myLineFeedCounter = -1; + } + myLastLineIsEmpty = true; + ++myLineFeedCounter; + myNewLine = true; + mySpaceCounter = 0; + bool paragraphBreak = + (myFormat.breakType() & PlainTextFormat::BREAK_PARAGRAPH_AT_NEW_LINE) || + ((myFormat.breakType() & PlainTextFormat::BREAK_PARAGRAPH_AT_EMPTY_LINE) && (myLineFeedCounter > 0)); + + if (myFormat.createContentsTable()) { +// if (!myInsideContentsParagraph && (myLineFeedCounter == myFormat.emptyLinesBeforeNewSection() + 1)) { + /* Fixed by Hatred: remove '+ 1' for emptyLinesBeforeNewSection, it looks like very strange + when we should point 
count of empty string decrised by 1 in settings dialog */ + if (!myInsideContentsParagraph && (myLineFeedCounter == myFormat.emptyLinesBeforeNewSection())) { + myInsideContentsParagraph = true; + internalEndParagraph(); + insertEndOfSectionParagraph(); + beginContentsParagraph(); + enterTitle(); + pushKind(SECTION_TITLE); + beginParagraph(); + paragraphBreak = false; + } + if (myInsideContentsParagraph && (myLineFeedCounter == 1)) { + exitTitle(); + endContentsParagraph(); + popKind(); + myInsideContentsParagraph = false; + paragraphBreak = true; + } + } + + if (paragraphBreak) { + internalEndParagraph(); + beginParagraph(); + } + return true; +} + +void TxtBookReader::startDocumentHandler() { + setMainTextModel(); + pushKind(REGULAR); + beginParagraph(); + myLineFeedCounter = 0; + myInsideContentsParagraph = false; + enterTitle(); + myLastLineIsEmpty = true; + myNewLine = true; + mySpaceCounter = 0; +} + +void TxtBookReader::endDocumentHandler() { + internalEndParagraph(); +} diff --git a/reader/src/formats/txt/TxtBookReader.h b/reader/src/formats/txt/TxtBookReader.h new file mode 100644 index 0000000..e02ad2a --- /dev/null +++ b/reader/src/formats/txt/TxtBookReader.h @@ -0,0 +1,59 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
class BookModel;

// Plain-text reader that turns the TxtReader callbacks into BookReader calls.
class TxtBookReader : public TxtReader, public BookReader {

public:
	TxtBookReader(BookModel &model, const PlainTextFormat &format, const std::string &encoding);
	~TxtBookReader();

protected:
	// TxtReader callbacks.
	void startDocumentHandler();
	void endDocumentHandler();

	bool characterDataHandler(std::string &str);
	bool newLineHandler();

private:
	void internalEndParagraph();

private:
	// Non-owning reference to the format settings.
	const PlainTextFormat &myFormat;

	// Parsing state, (re)set in startDocumentHandler().
	int myLineFeedCounter;          // empty lines seen since the last non-empty line
	bool myInsideContentsParagraph; // a section-title paragraph is currently open
	bool myLastLineIsEmpty;         // no non-space character seen since the last line break
	bool myNewLine;                 // no text has been added on the current line yet
	int mySpaceCounter;             // leading-whitespace width of the current line
};

inline TxtBookReader::~TxtBookReader() {}
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#include <ZLFile.h> +#include <ZLInputStream.h> + +#include "TxtPlugin.h" +#include "TxtBookReader.h" +#include "PlainTextFormat.h" + +#include "../../bookmodel/BookModel.h" +#include "../../library/Book.h" + +TxtPlugin::~TxtPlugin() { +} + +bool TxtPlugin::providesMetaInfo() const { + return false; +} + +bool TxtPlugin::acceptsFile(const ZLFile &file) const { + return file.extension() == "txt"; +} + +bool TxtPlugin::readMetaInfo(Book &book) const { + shared_ptr<ZLInputStream> stream = book.file().inputStream(); + if (stream.isNull()) { + return false; + } + detectEncodingAndLanguage(book, *stream); + if (book.encoding().empty()) { + return false; + } + + return true; +} + +bool TxtPlugin::readLanguageAndEncoding(Book &book) const { + (void)book; + return true; +} + +bool TxtPlugin::readModel(BookModel &model) const { + const Book &book = *model.book(); + const ZLFile &file = book.file(); + shared_ptr<ZLInputStream> stream = file.inputStream(); + if (stream.isNull()) { + return false; + } + + PlainTextFormat format(file); + if (!format.initialized()) { + PlainTextFormatDetector detector; + detector.detect(*stream, format); + } + + TxtBookReader(model, format, book.encoding()).readDocument(*stream); + return true; +} + +FormatInfoPage *TxtPlugin::createInfoPage(ZLOptionsDialog &dialog, const ZLFile &file) { + return new PlainTextInfoPage(dialog, file, ZLResourceKey("Text"), true); +} diff --git a/reader/src/formats/txt/TxtPlugin.h b/reader/src/formats/txt/TxtPlugin.h new file mode 100644 index 0000000..e3e6e50 --- /dev/null +++ b/reader/src/formats/txt/TxtPlugin.h @@ -0,0 +1,37 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the 
// Format plugin for plain-text (*.txt) books.
class TxtPlugin : public FormatPlugin {

public:
	~TxtPlugin();
	// Always false for this plugin.
	bool providesMetaInfo() const;
	// True when the file extension is "txt".
	bool acceptsFile(const ZLFile &file) const;
	// Detects encoding and language; false if the stream is missing or no encoding results.
	bool readMetaInfo(Book &book) const;
	// No-op that returns true.
	bool readLanguageAndEncoding(Book &book) const;
	// Parses the book's file into the model; false if the stream is missing.
	bool readModel(BookModel &model) const;
	// Returns a new PlainTextInfoPage.
	FormatInfoPage *createInfoPage(ZLOptionsDialog &dialog, const ZLFile &file);
};
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#include <cctype> + +#include <ZLInputStream.h> + +#include "TxtReader.h" + +class TxtReaderCore { + +public: + TxtReaderCore(TxtReader &reader); + virtual void readDocument(ZLInputStream &stream); + +protected: + TxtReader &myReader; +}; + +class TxtReaderCoreUtf16 : public TxtReaderCore { + +public: + TxtReaderCoreUtf16(TxtReader &reader); + void readDocument(ZLInputStream &stream); + +protected: + virtual char getAscii(const char *ptr) = 0; + virtual void setAscii(char *ptr, char ascii) = 0; +}; + +class TxtReaderCoreUtf16LE : public TxtReaderCoreUtf16 { + +public: + TxtReaderCoreUtf16LE(TxtReader &reader); + +protected: + char getAscii(const char *ptr); + void setAscii(char *ptr, char ascii); +}; + +class TxtReaderCoreUtf16BE : public TxtReaderCoreUtf16 { + +public: + TxtReaderCoreUtf16BE(TxtReader &reader); + +protected: + char getAscii(const char *ptr); + void setAscii(char *ptr, char ascii); +}; + +TxtReader::TxtReader(const std::string &encoding) : EncodedTextReader(encoding) { + if (ZLEncodingConverter::UTF16 == encoding) { + myCore = new TxtReaderCoreUtf16LE(*this); + } else if (ZLEncodingConverter::UTF16BE == encoding) { + myCore = new TxtReaderCoreUtf16BE(*this); + } else { + myCore = new TxtReaderCore(*this); + } +} + +TxtReader::~TxtReader() { +} + +void TxtReader::readDocument(ZLInputStream &stream) { + if (!stream.open()) { + return; + } + startDocumentHandler(); + myCore->readDocument(stream); + endDocumentHandler(); + stream.close(); +} + +TxtReaderCore::TxtReaderCore(TxtReader &reader) : myReader(reader) { +} + +TxtReaderCoreUtf16::TxtReaderCoreUtf16(TxtReader &reader) : TxtReaderCore(reader) { +} + +void TxtReaderCore::readDocument(ZLInputStream &stream) { + const std::size_t BUFSIZE = 2048; + char *buffer = new 
char[BUFSIZE]; + std::string str; + std::size_t length; + do { + length = stream.read(buffer, BUFSIZE); + char *start = buffer; + const char *end = buffer + length; + for (char *ptr = start; ptr != end; ++ptr) { + if (*ptr == '\n' || *ptr == '\r') { + bool skipNewLine = false; + if (*ptr == '\r' && (ptr + 1) != end && *(ptr + 1) == '\n') { + skipNewLine = true; + *ptr = '\n'; + } + if (start != ptr) { + str.erase(); + myReader.myConverter->convert(str, start, ptr + 1); + myReader.characterDataHandler(str); + } + if (skipNewLine) { + ++ptr; + } + start = ptr + 1; + myReader.newLineHandler(); + } else if (((*ptr) & 0x80) == 0 && std::isspace((unsigned char)*ptr)) { + if (*ptr != '\t') { + *ptr = ' '; + } + } else { + } + } + if (start != end) { + str.erase(); + myReader.myConverter->convert(str, start, end); + myReader.characterDataHandler(str); + } + } while (length == BUFSIZE); + delete[] buffer; +} + +void TxtReaderCoreUtf16::readDocument(ZLInputStream &stream) { + const std::size_t BUFSIZE = 2048; + char *buffer = new char[BUFSIZE]; + std::string str; + std::size_t length; + do { + length = stream.read(buffer, BUFSIZE); + char *start = buffer; + const char *end = buffer + length; + for (char *ptr = start; ptr < end; ptr += 2) { + const char chr = getAscii(ptr); + if (chr == '\n' || chr == '\r') { + bool skipNewLine = false; + if (chr == '\r' && ptr + 2 != end && getAscii(ptr + 2) == '\n') { + skipNewLine = true; + setAscii(ptr, '\n'); + } + if (start != ptr) { + str.erase(); + myReader.myConverter->convert(str, start, ptr + 2); + myReader.characterDataHandler(str); + } + if (skipNewLine) { + ptr += 2; + } + start = ptr + 2; + myReader.newLineHandler(); + } else if (chr != 0 && ((*ptr) & 0x80) == 0 && std::isspace(chr)) { + if (chr != '\t') { + setAscii(ptr, ' '); + } + } + } + if (start != end) { + str.erase(); + myReader.myConverter->convert(str, start, end); + myReader.characterDataHandler(str); + } + } while (length == BUFSIZE); + delete[] buffer; +} + 
+TxtReaderCoreUtf16LE::TxtReaderCoreUtf16LE(TxtReader &reader) : TxtReaderCoreUtf16(reader) { +} + +char TxtReaderCoreUtf16LE::getAscii(const char *ptr) { + return *(ptr + 1) == '\0' ? *ptr : '\0'; +} + +void TxtReaderCoreUtf16LE::setAscii(char *ptr, char ascii) { + *ptr = ascii; +} + +TxtReaderCoreUtf16BE::TxtReaderCoreUtf16BE(TxtReader &reader) : TxtReaderCoreUtf16(reader) { +} + +char TxtReaderCoreUtf16BE::getAscii(const char *ptr) { + return *ptr == '\0' ? *(ptr + 1) : '\0'; +} + +void TxtReaderCoreUtf16BE::setAscii(char *ptr, char ascii) { + *(ptr + 1) = ascii; +} diff --git a/reader/src/formats/txt/TxtReader.h b/reader/src/formats/txt/TxtReader.h new file mode 100644 index 0000000..518ba8e --- /dev/null +++ b/reader/src/formats/txt/TxtReader.h @@ -0,0 +1,56 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
class ZLInputStream;
class TxtReaderCore;

// Base class for plain-text readers: readDocument() splits the input into lines and
// reports them through the pure virtual handlers below.
class TxtReader : public EncodedTextReader {

public:
	// Opens `stream`, runs the handlers over its content and closes it.
	void readDocument(ZLInputStream &stream);

protected:
	TxtReader(const std::string &encoding);
	virtual ~TxtReader();

protected:
	// Called once before / after the content is processed.
	virtual void startDocumentHandler() = 0;
	virtual void endDocumentHandler() = 0;

	// Called with a piece of converted text and for each line break respectively.
	virtual bool characterDataHandler(std::string &str) = 0;
	virtual bool newLineHandler() = 0;

private:
	// Encoding-specific line splitter, chosen in the constructor.
	shared_ptr<TxtReaderCore> myCore;

// The cores call the handlers and use the converter of this object.
friend class TxtReaderCore;
friend class TxtReaderCoreUtf16;
friend class TxtReaderCoreUtf16BE;
};
+ */ + +#include <ZLStringUtil.h> +#include <ZLibrary.h> +#include <ZLFile.h> +#include <ZLDir.h> + +#include "EntityFilesCollector.h" + +EntityFilesCollector *EntityFilesCollector::ourInstance = 0; + +EntityFilesCollector &EntityFilesCollector::Instance() { + if (ourInstance == 0) { + ourInstance = new EntityFilesCollector(); + } + return *ourInstance; +} + +const std::vector<std::string> &EntityFilesCollector::externalDTDs(const std::string &format) { + std::map<std::string,std::vector<std::string> >::const_iterator it = myCollections.find(format); + if (it != myCollections.end()) { + return it->second; + } + + std::vector<std::string> &collection = myCollections[format]; + + std::string directoryName = + ZLibrary::ApplicationDirectory() + ZLibrary::FileNameDelimiter + + "formats" + ZLibrary::FileNameDelimiter + format; + shared_ptr<ZLDir> dtdPath = ZLFile(directoryName).directory(); + if (!dtdPath.isNull()) { + std::vector<std::string> files; + dtdPath->collectFiles(files, false); + for (std::vector<std::string>::const_iterator it = files.begin(); it != files.end(); ++it) { + if (ZLStringUtil::stringEndsWith(*it, ".ent")) { + collection.push_back(dtdPath->itemPath(*it)); + } + } + } + + return collection; +} + +EntityFilesCollector::EntityFilesCollector() { +} diff --git a/reader/src/formats/util/EntityFilesCollector.h b/reader/src/formats/util/EntityFilesCollector.h new file mode 100644 index 0000000..9967b3d --- /dev/null +++ b/reader/src/formats/util/EntityFilesCollector.h @@ -0,0 +1,42 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
// Lazily created singleton that lists and caches the external entity (*.ent) files
// shipped for each format.
class EntityFilesCollector {

public:
	static EntityFilesCollector &Instance();

	// Paths of the "*.ent" files for `format`; the returned reference points into the cache.
	const std::vector<std::string> &externalDTDs(const std::string &format);

private:
	// Construction only through Instance().
	EntityFilesCollector();

private:
	static EntityFilesCollector *ourInstance;
	// format name -> collected file paths
	std::map<std::string,std::vector<std::string> > myCollections;
};
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#include "MergedStream.h" + +bool MergedStream::open() { + close(); + resetToStart(); + myOffset = 0; + myCurrentStream = nextStream(); + return !myCurrentStream.isNull() && myCurrentStream->open(); +} + +std::size_t MergedStream::read(char *buffer, std::size_t maxSize) { + std::size_t bytesToRead = maxSize; + while ((bytesToRead > 0) && !myCurrentStream.isNull()) { + std::size_t len = myCurrentStream->read(buffer, bytesToRead); + bytesToRead -= len; + if (buffer != 0) { + buffer += len; + } + if (bytesToRead != 0) { + if (buffer != 0) { + *buffer++ = '\n'; + } + bytesToRead--; + myCurrentStream = nextStream(); + if (myCurrentStream.isNull() || !myCurrentStream->open()) { + break; + } + } + } + myOffset += maxSize - bytesToRead; + return maxSize - bytesToRead; +} + +void MergedStream::close() { + myCurrentStream.reset(); +} + +void MergedStream::seek(int offset, bool absoluteOffset) { + // works for nonnegative offsets only + if (absoluteOffset) { + offset -= myOffset; + } + read(0, offset); +} + +std::size_t MergedStream::offset() const { + return myOffset; +} + +std::size_t MergedStream::sizeOfOpened() { + // coudn't be implemented + return 0; +} diff --git a/reader/src/formats/util/MergedStream.h b/reader/src/formats/util/MergedStream.h new file mode 100644 index 0000000..3f982ee --- /dev/null +++ b/reader/src/formats/util/MergedStream.h @@ -0,0 +1,45 @@ +/* + * Copyright (C) 2008-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
// Input stream that presents a sequence of sub-streams as one stream; subclasses
// supply the sequence.
class MergedStream : public ZLInputStream {

protected:
	// Returns the next sub-stream of the sequence, or a null pointer when there is none.
	virtual shared_ptr<ZLInputStream> nextStream() = 0;
	// Rewinds the sequence so that nextStream() starts over.
	virtual void resetToStart() = 0;

private:
	// ZLInputStream implementation; declared private here, so it is reachable only
	// through the base-class interface.
	bool open();
	std::size_t read(char *buffer, std::size_t maxSize);
	void close();
	void seek(int offset, bool absoluteOffset);
	std::size_t offset() const;
	std::size_t sizeOfOpened();

private:
	shared_ptr<ZLInputStream> myCurrentStream;
	// Bytes produced since open().
	std::size_t myOffset;
};
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#include <cstdlib> + +#include <ZLApplication.h> +#include <ZLFile.h> +#include <ZLStringUtil.h> + +#include "MiscUtil.h" + +FBTextKind MiscUtil::referenceType(const std::string &link) { + std::string lowerCasedLink = link; + bool isFileReference = + ZLStringUtil::stringStartsWith(lowerCasedLink, "http://") || + ZLStringUtil::stringStartsWith(lowerCasedLink, "https://") || + ZLStringUtil::stringStartsWith(lowerCasedLink, "ftp://"); + if (!isFileReference) { + return ZLStringUtil::stringStartsWith(lowerCasedLink, "mailto:") ? EXTERNAL_HYPERLINK : INTERNAL_HYPERLINK; + } + static const std::string FeedBooksPrefix0 = "http://feedbooks.com/book/stanza/"; + static const std::string FeedBooksPrefix1 = "http://www.feedbooks.com/book/stanza/"; + bool isBookHyperlink = + ZLStringUtil::stringStartsWith(lowerCasedLink, FeedBooksPrefix0) || + ZLStringUtil::stringStartsWith(lowerCasedLink, FeedBooksPrefix1) || + ZLStringUtil::stringEndsWith(lowerCasedLink, ".epub") || + ZLStringUtil::stringEndsWith(lowerCasedLink, ".mobi") || + ZLStringUtil::stringEndsWith(lowerCasedLink, ".chm") || + ZLStringUtil::stringEndsWith(lowerCasedLink, ".fb2"); + return isBookHyperlink ? 
BOOK_HYPERLINK : EXTERNAL_HYPERLINK; +} + +std::string MiscUtil::htmlDirectoryPrefix(const std::string &fileName) { + ZLFile file(fileName); + std::string shortName = file.name(false); + std::string path = file.path(); + int index = -1; + if ((path.length() > shortName.length()) && + (path[path.length() - shortName.length() - 1] == ':')) { + index = shortName.rfind('/'); + } + return path.substr(0, path.length() - shortName.length() + index + 1); +} + +std::string MiscUtil::htmlFileName(const std::string &fileName) { + ZLFile file(fileName); + std::string shortName = file.name(false); + std::string path = file.path(); + int index = -1; + if ((path.length() > shortName.length()) && + (path[path.length() - shortName.length() - 1] == ':')) { + index = shortName.rfind('/'); + } + return path.substr(path.length() - shortName.length() + index + 1); +} + +std::string MiscUtil::decodeHtmlURL(const std::string &encoded) { + char buffer[3]; + buffer[2] = '\0'; + + std::string decoded; + const int len = encoded.length(); + decoded.reserve(len); + for (int i = 0; i < len; i++) { + if ((encoded[i] == '%') && (i < len - 2)) { + buffer[0] = *(encoded.data() + i + 1); + buffer[1] = *(encoded.data() + i + 2); + decoded += (char)std::strtol(buffer, 0, 16); + i += 2; + } else { + decoded += encoded[i]; + } + } + return decoded; +} diff --git a/reader/src/formats/util/MiscUtil.h b/reader/src/formats/util/MiscUtil.h new file mode 100644 index 0000000..c47d84a --- /dev/null +++ b/reader/src/formats/util/MiscUtil.h @@ -0,0 +1,39 @@ +/* + * Copyright (C) 2004-2012 Geometer Plus <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
// Collection of static helpers for hyperlinks and HTML file paths; not instantiable.
class MiscUtil {

private:
	MiscUtil();

public:
	// Classifies a link as internal, external or book hyperlink.
	static FBTextKind referenceType(const std::string &link);
	// Path of the file up to (not including) its file name.
	static std::string htmlDirectoryPrefix(const std::string &fileName);
	// File-name part of the file's path.
	static std::string htmlFileName(const std::string &fileName);
	// Replaces "%XX" escapes by the byte they encode.
	static std::string decodeHtmlURL(const std::string &encodedURL);
};
+ */
+
+#include <cstring>
+#include <cctype>
+#include <algorithm>
+
+#include <ZLInputStream.h>
+#include <ZLUnicodeUtil.h>
+
+#include "TextFormatDetector.h"
+
+TextFormatDetector::TextFormatDetector() {
+}
+
+TextFormatDetector::~TextFormatDetector() {
+}
+
+// Returns true if the first six non-whitespace bytes of the stream are
+// "<html>" (compared case-insensitively via ZLUnicodeUtil::toLower).
+// The stream is opened here and closed before returning; false is returned
+// when it cannot be opened.
+bool TextFormatDetector::isHtml(ZLInputStream &stream) const {
+	if (!stream.open()) {
+		return false;
+	}
+
+	const std::size_t signatureLength = 6;
+	const std::size_t bufferSize = 1024;
+	// Automatic storage: nothing to release on any exit path.
+	char buffer[bufferSize];
+	std::string signature;
+	bool skippingSpace = true;
+	while (signature.length() < signatureLength) {
+		const std::size_t size = stream.read(buffer, bufferSize);
+		if (size == 0) {
+			break;
+		}
+		std::size_t index = 0;
+		if (skippingSpace) {
+			while ((index < size) && std::isspace((unsigned char)buffer[index])) {
+				++index;
+			}
+			if (index == size) {
+				// The whole chunk was whitespace; keep skipping in the next one.
+				continue;
+			}
+			skippingSpace = false;
+		}
+		const std::size_t bytes = std::min(signatureLength - signature.length(), size - index);
+		// Append, do not assign: the six bytes may be split across two reads.
+		signature.append(buffer + index, bytes);
+	}
+	stream.close();
+	return ZLUnicodeUtil::toLower(signature) == "<html>";
+}
+
+// Returns true if the stream starts with the five bytes "PPL\r\n".
+// The stream is opened here and closed before returning.
+bool TextFormatDetector::isPPL(ZLInputStream &stream) const {
+	if (!stream.open()) {
+		return false;
+	}
+
+	char buffer[5];
+	bool result = stream.read(buffer, 5) == 5 && std::strncmp(buffer, "PPL\r\n", 5) == 0;
+	stream.close();
+	return result;
+}
diff --git a/reader/src/formats/util/TextFormatDetector.h b/reader/src/formats/util/TextFormatDetector.h
new file mode 100644
index 0000000..c86b90b
--- /dev/null
+++ b/reader/src/formats/util/TextFormatDetector.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (C) 2004-2012 Geometer Plus <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+
+#ifndef __TEXTFORMATDETECTOR_H__
+#define __TEXTFORMATDETECTOR_H__
+
+class ZLInputStream;
+
+/* Content sniffer: recognises a text format from the first bytes of a stream. */
+class TextFormatDetector {
+
+public:
+	TextFormatDetector();
+	~TextFormatDetector();
+
+	/* True if the first non-whitespace bytes of the stream are "<html>" in any letter case. Opens and closes the stream. */
+	bool isHtml(ZLInputStream &stream) const;
+	/* True if the stream starts with the bytes "PPL\r\n". Opens and closes the stream. */
+	bool isPPL(ZLInputStream &stream) const;
+};
+
+#endif /* __TEXTFORMATDETECTOR_H__ */
diff --git a/reader/src/formats/util/XMLTextStream.cpp b/reader/src/formats/util/XMLTextStream.cpp
new file mode 100644
index 0000000..19343a1
--- /dev/null
+++ b/reader/src/formats/util/XMLTextStream.cpp
@@ -0,0 +1,124 @@
+/*
+ * Copyright (C) 2008-2012 Geometer Plus <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+
+#include <cstring>
+#include <algorithm>
+
+#include <ZLXMLReader.h>
+#include <ZLUnicodeUtil.h>
+
+#include <ZLPlainAsynchronousInputStream.h>
+
+#include "XMLTextStream.h"
+
+// XML reader that appends character data to an external buffer.
+// Collection starts at the first element whose lower-cased name equals the
+// lower-cased start tag, or immediately when the start tag is empty.
+class XMLTextReader : public ZLXMLReader {
+
+public:
+	XMLTextReader(std::string &buffer, const std::string &startTag);
+
+private:
+	void startElementHandler(const char *tag, const char **attributes);
+	void characterDataHandler(const char *text, std::size_t len);
+
+private:
+	const std::string myStartTag;
+	std::string &myBuffer;
+	bool myStarted;
+};
+
+XMLTextReader::XMLTextReader(std::string &buffer, const std::string &startTag) : myStartTag(ZLUnicodeUtil::toLower(startTag)), myBuffer(buffer), myStarted(myStartTag.empty()) {
+}
+
+void XMLTextReader::startElementHandler(const char *tag, const char**) {
+	if (!myStarted && (myStartTag == ZLUnicodeUtil::toLower(tag))) {
+		myStarted = true;
+	}
+}
+
+void XMLTextReader::characterDataHandler(const char *text, std::size_t len) {
+	if (myStarted) {
+		myBuffer.append(text, len);
+	}
+}
+
+// Wraps `base` so that reading yields the character data of the XML document.
+// myOffset is initialised here so that offset() is defined before open().
+XMLTextStream::XMLTextStream(shared_ptr<ZLInputStream> base, const std::string &startTag) : myBase(base), myStreamBuffer(2048, '\0'), myOffset(0) {
+	myReader = new XMLTextReader(myDataBuffer, startTag);
+}
+
+XMLTextStream::~XMLTextStream() {
+}
+
+// (Re)opens the stream. Returns false if there is no base stream or it
+// cannot be opened.
+bool XMLTextStream::open() {
+	close();
+	if (myBase.isNull() || !myBase->open()) {
+		return false;
+	}
+	myStream = new ZLPlainAsynchronousInputStream();
+	myOffset = 0;
+	return true;
+}
+
+// Copies up to maxSize bytes of extracted text into `buffer` and returns the
+// number of bytes consumed. A null `buffer` discards the bytes (used by seek()).
+// Returns 0 when the stream is not open.
+std::size_t XMLTextStream::read(char *buffer, std::size_t maxSize) {
+	if (!myStream.isNull()) {
+		// Feed the parser chunk by chunk until enough text is buffered.
+		while (myDataBuffer.size() < maxSize) {
+			const std::size_t len = myBase->read(&myStreamBuffer[0], myStreamBuffer.size());
+			if (len == 0) {
+				break;
+			}
+			myStream->setBuffer(myStreamBuffer.data(), len);
+			if (!myReader->readDocument(myStream)) {
+				break;
+			}
+		}
+	}
+	const std::size_t realSize = std::min(myDataBuffer.size(), maxSize);
+	if (buffer != 0) {
+		std::memcpy(buffer, myDataBuffer.data(), realSize);
+	}
+	myDataBuffer.erase(0, realSize);
+	myOffset += realSize;
+	return realSize;
+}
+
+// Finishes parsing, closes the base stream and drops buffered text.
+// Safe to call when the stream was never opened or has no base stream
+// (open() calls it before checking myBase).
+void XMLTextStream::close() {
+	if (!myStream.isNull()) {
+		myStream->setEof();
+		myReader->readDocument(myStream);
+		myStream.reset();
+	}
+	if (!myBase.isNull()) {
+		myBase->close();
+	}
+	myDataBuffer.erase();
+}
+
+// Forward-only seek, implemented by reading and discarding data.
+// A backward or zero move is ignored: passing a negative value to read()
+// would convert it into a huge size_t and drain the whole stream.
+void XMLTextStream::seek(int offset, bool absoluteOffset) {
+	if (absoluteOffset) {
+		offset -= (int)myOffset;
+	}
+	if (offset > 0) {
+		read(0, offset);
+	}
+}
+
+std::size_t XMLTextStream::offset() const {
+	return myOffset;
+}
+
+std::size_t XMLTextStream::sizeOfOpened() {
+	// couldn't be implemented
+	return 0;
+}
diff --git a/reader/src/formats/util/XMLTextStream.h b/reader/src/formats/util/XMLTextStream.h
new file mode 100644
index 0000000..f3151c6
--- /dev/null
+++ b/reader/src/formats/util/XMLTextStream.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (C) 2008-2012 Geometer Plus <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+
+#ifndef __XMLTEXTSTREAM_H__
+#define __XMLTEXTSTREAM_H__
+
+#include <shared_ptr.h>
+#include <ZLInputStream.h>
+#include <ZLAsynchronousInputStream.h>
+
+class XMLTextReader;
+
+/* Input stream adapter: reads an XML document from a base stream and exposes its character data as the stream content. */
+class XMLTextStream : public ZLInputStream {
+
+public:
+	/* startTag: element name at which text collection begins (compared in lower case); an empty name collects from the start. */
+	XMLTextStream(shared_ptr<ZLInputStream> base, const std::string &startTag);
+	~XMLTextStream();
+
+private:
+	bool open();
+	std::size_t read(char *buffer, std::size_t maxSize); /* a null buffer discards the data */
+	void close();
+	void seek(int offset, bool absoluteOffset); /* implemented by reading ahead; nonnegative moves only */
+	std::size_t offset() const;
+	std::size_t sizeOfOpened(); /* always returns 0 */
+
+private:
+	shared_ptr<ZLInputStream> myBase; /* source of the raw XML bytes */
+	shared_ptr<XMLTextReader> myReader; /* parser that appends character data to myDataBuffer */
+	shared_ptr<ZLAsynchronousInputStream> myStream; /* chunk feeder for the parser; null while closed */
+	std::string myStreamBuffer; /* 2048-byte scratch buffer for raw input */
+	std::string myDataBuffer; /* extracted text not yet handed out by read() */
+	std::size_t myOffset; /* number of text bytes consumed since open() */
+};
+
+#endif /* __XMLTEXTSTREAM_H__ */
diff --git a/reader/src/formats/xhtml/XHTMLReader.cpp b/reader/src/formats/xhtml/XHTMLReader.cpp
new file mode 100644
index 0000000..6e4ba59
--- /dev/null
+++ b/reader/src/formats/xhtml/XHTMLReader.cpp
@@ -0,0 +1,715 @@
+/*
+ * Copyright (C) 2004-2012 Geometer Plus <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+
+#include <cstring>
+#include <cctype>
+#include <stack>
+
+#include <ZLFile.h>
+#include <ZLFileUtil.h>
+#include <ZLFileImage.h>
+#include <ZLUnicodeUtil.h>
+#include <ZLStringUtil.h>
+#include <ZLXMLNamespace.h>
+#include <ZLInputStream.h>
+#include <ZLLogger.h>
+
+#include "XHTMLReader.h"
+#include "../util/EntityFilesCollector.h"
+#include "../util/MiscUtil.h"
+#include "../css/StyleSheetParser.h"
+
+#include "../../bookmodel/BookReader.h"
+#include "../../bookmodel/BookModel.h"
+
+// Lower-cased tag name -> handler; filled once by fillTagTable().
+std::map<std::string,XHTMLTagAction*> XHTMLReader::ourTagActions;
+
+XHTMLTagAction::~XHTMLTagAction() {
+}
+
+// The four helpers below give non-friend actions access to the reader.
+BookReader &XHTMLTagAction::bookReader(XHTMLReader &reader) {
+	return reader.myModelReader;
+}
+
+const std::string &XHTMLTagAction::pathPrefix(XHTMLReader &reader) {
+	return reader.myPathPrefix;
+}
+
+void XHTMLTagAction::beginParagraph(XHTMLReader &reader) {
+	reader.beginParagraph();
+}
+
+void XHTMLTagAction::endParagraph(XHTMLReader &reader) {
+	reader.endParagraph();
+}
+
+// <style>: feeds the element content to a style sheet table parser.
+class XHTMLTagStyleAction : public XHTMLTagAction {
+
+public:
+	void doAtStart(XHTMLReader &reader, const char **xmlattributes);
+	void doAtEnd(XHTMLReader &reader);
+};
+
+// <link rel="stylesheet" type="text/css">: parses the referenced CSS file.
+class XHTMLTagLinkAction : public XHTMLTagAction {
+
+public:
+	void doAtStart(XHTMLReader &reader, const char **xmlattributes);
+	void doAtEnd(XHTMLReader &reader);
+};
+
+// Block elements (p, div, dt, td, th): open a paragraph at start, close it at end.
+class XHTMLTagParagraphAction : public XHTMLTagAction {
+
+public:
+	void doAtStart(XHTMLReader &reader, const char **xmlattributes);
+	void doAtEnd(XHTMLReader &reader);
+};
+
+// <body>: switches the reader into and out of READ_BODY.
+class XHTMLTagBodyAction : public XHTMLTagAction {
+
+public:
+	void doAtStart(XHTMLReader &reader, const char **xmlattributes);
+	void doAtEnd(XHTMLReader &reader);
+};
+
+// <br>: ends the current paragraph and starts a new one.
+class XHTMLTagRestartParagraphAction : public XHTMLTagAction {
+
+public:
+	void doAtStart(XHTMLReader &reader, const char **xmlattributes);
+	void doAtEnd(XHTMLReader &reader);
+};
+
+// img / object / svg image: adds the image named by the attribute that the
+// predicate selects.
+class XHTMLTagImageAction : public XHTMLTagAction {
+
+public:
+	XHTMLTagImageAction(shared_ptr<ZLXMLReader::AttributeNamePredicate> predicate);
+	XHTMLTagImageAction(const std::string &attributeName);
+
+	void doAtStart(XHTMLReader &reader, const char **xmlattributes);
+	void doAtEnd(XHTMLReader &reader);
+
+private:
+	shared_ptr<ZLXMLReader::AttributeNamePredicate> myPredicate;
+};
+
+// Matches the XLink "href" attribute, but only while enabled by
+// XHTMLTagSvgAction (i.e. between <svg> and </svg>).
+class XHTMLSvgImageAttributeNamePredicate : public ZLXMLReader::NamespaceAttributeNamePredicate {
+
+public:
+	XHTMLSvgImageAttributeNamePredicate();
+	bool accepts(const ZLXMLReader &reader, const char *name) const;
+
+private:
+	bool myIsEnabled;
+
+friend class XHTMLTagSvgAction;
+};
+
+// <svg>: enables the predicate above for the duration of the element.
+class XHTMLTagSvgAction : public XHTMLTagAction {
+
+public:
+	XHTMLTagSvgAction(XHTMLSvgImageAttributeNamePredicate &predicate);
+	void doAtStart(XHTMLReader &reader, const char **xmlattributes);
+	void doAtEnd(XHTMLReader &reader);
+
+private:
+	XHTMLSvgImageAttributeNamePredicate &myPredicate;
+};
+
+// <li>: starts a new paragraph prefixed with a bullet.
+class XHTMLTagItemAction : public XHTMLTagAction {
+
+public:
+	void doAtStart(XHTMLReader &reader, const char **xmlattributes);
+	void doAtEnd(XHTMLReader &reader);
+};
+
+// <a>: emits hyperlink controls and labels; one stack entry per open <a>.
+class XHTMLTagHyperlinkAction : public XHTMLTagAction {
+
+public:
+	void doAtStart(XHTMLReader &reader, const char **xmlattributes);
+	void doAtEnd(XHTMLReader &reader);
+
+private:
+	std::stack<FBTextKind> myHyperlinkStack;
+};
+
+// Inline elements (b, i, em, code, ...): push/pop a text kind and its control.
+class XHTMLTagControlAction : public XHTMLTagAction {
+
+public:
+	XHTMLTagControlAction(FBTextKind control);
+
+	void doAtStart(XHTMLReader &reader, const char **xmlattributes);
+	void doAtEnd(XHTMLReader &reader);
+
+private:
+	FBTextKind myControl;
+};
+
+// Headings (h1..h6): a paragraph of its own with the given text kind.
+class XHTMLTagParagraphWithControlAction : public XHTMLTagAction {
+
+public:
+	XHTMLTagParagraphWithControlAction(FBTextKind control);
+
+	void doAtStart(XHTMLReader &reader, const char **xmlattributes);
+	void doAtEnd(XHTMLReader &reader);
+
+private:
+	FBTextKind myControl;
+};
+
+// <pre>: switches the reader into preformatted mode.
+class XHTMLTagPreAction : public XHTMLTagAction {
+
+public:
+	void doAtStart(XHTMLReader &reader, const char **xmlattributes);
+	void doAtEnd(XHTMLReader &reader);
+};
+
+void XHTMLTagStyleAction::doAtStart(XHTMLReader &reader, const char **xmlattributes) {
+	static const std::string TYPE = "text/css";
+
+	const char *type = reader.attributeValue(xmlattributes, "type");
+	if ((type == 0) || (TYPE != type)) {
+		return;
+	}
+
+	if (reader.myReadState == XHTMLReader::READ_NOTHING) {
+		reader.myReadState = XHTMLReader::READ_STYLE;
+		reader.myTableParser = new StyleSheetTableParser(reader.myStyleSheetTable);
+		ZLLogger::Instance().println("CSS", "parsing style tag content");
+	}
+}
+
+void XHTMLTagStyleAction::doAtEnd(XHTMLReader &reader) {
+	if (reader.myReadState == XHTMLReader::READ_STYLE) {
+		reader.myReadState = XHTMLReader::READ_NOTHING;
+		reader.myTableParser.reset();
+	}
+}
+
+void XHTMLTagLinkAction::doAtStart(XHTMLReader &reader, const char **xmlattributes) {
+	static const std::string REL = "stylesheet";
+	const char *rel = reader.attributeValue(xmlattributes, "rel");
+	if ((rel == 0) || (REL != rel)) {
+		return;
+	}
+	static const std::string TYPE = "text/css";
+
+	const char *type = reader.attributeValue(xmlattributes, "type");
+	if ((type == 0) || (TYPE != type)) {
+		return;
+	}
+
+	const char *href = reader.attributeValue(xmlattributes, "href");
+	if (href == 0) {
+		return;
+	}
+
+	// Decode the reference once; it is used for logging and for opening the file.
+	const std::string cssPath = reader.myPathPrefix + MiscUtil::decodeHtmlURL(href);
+	ZLLogger::Instance().println("CSS", "style file: " + cssPath);
+	shared_ptr<ZLInputStream> cssStream = ZLFile(cssPath).inputStream();
+	if (cssStream.isNull()) {
+		return;
+	}
+	ZLLogger::Instance().println("CSS", "parsing file");
+	StyleSheetTableParser parser(reader.myStyleSheetTable);
+	parser.parse(*cssStream);
+	//reader.myStyleSheetTable.dump();
+}
+
+void XHTMLTagLinkAction::doAtEnd(XHTMLReader&) {
+}
+
+void XHTMLTagParagraphAction::doAtStart(XHTMLReader &reader, const char**) {
+	if (!reader.myNewParagraphInProgress) {
+		beginParagraph(reader);
+		reader.myNewParagraphInProgress = true;
+	}
+}
+
+void XHTMLTagParagraphAction::doAtEnd(XHTMLReader &reader) {
+	endParagraph(reader);
+}
+
+void XHTMLTagBodyAction::doAtStart(XHTMLReader &reader, const char**) {
+	reader.myReadState = XHTMLReader::READ_BODY;
+}
+
+void XHTMLTagBodyAction::doAtEnd(XHTMLReader &reader) {
+	endParagraph(reader);
+	reader.myReadState = XHTMLReader::READ_NOTHING;
+}
+
+void XHTMLTagRestartParagraphAction::doAtStart(XHTMLReader &reader, const char**) {
+	// Add a space to a paragraph that has no text yet before closing it.
+	if (reader.myCurrentParagraphIsEmpty) {
+		bookReader(reader).addData(" ");
+	}
+	endParagraph(reader);
+	beginParagraph(reader);
+}
+
+void XHTMLTagRestartParagraphAction::doAtEnd(XHTMLReader&) {
+}
+
+void XHTMLTagItemAction::doAtStart(XHTMLReader &reader, const char**) {
+	endParagraph(reader);
+	// TODO: increase left indent
+	beginParagraph(reader);
+	// TODO: replace bullet sign by number inside OL tag
+	// U+2022 BULLET followed by U+00A0 NO-BREAK SPACE. The original second
+	// character was "\xC0\xA0", an overlong sequence that is not valid UTF-8.
+	const std::string bullet = "\xE2\x80\xA2\xC2\xA0";
+	bookReader(reader).addData(bullet);
+}
+
+void XHTMLTagItemAction::doAtEnd(XHTMLReader &reader) {
+	endParagraph(reader);
+}
+
+XHTMLTagImageAction::XHTMLTagImageAction(shared_ptr<ZLXMLReader::AttributeNamePredicate> predicate) {
+	myPredicate = predicate;
+}
+
+XHTMLTagImageAction::XHTMLTagImageAction(const std::string &attributeName) {
+	myPredicate = new ZLXMLReader::FixedAttributeNamePredicate(attributeName);
+}
+
+// Adds the referenced image outside of any paragraph: an open paragraph is
+// closed first and reopened afterwards. Images whose file does not exist
+// are skipped.
+void XHTMLTagImageAction::doAtStart(XHTMLReader &reader, const char **xmlattributes) {
+	const char *fileName = reader.attributeValue(xmlattributes, *myPredicate);
+	if (fileName == 0) {
+		return;
+	}
+
+	const std::string fullfileName = pathPrefix(reader) + MiscUtil::decodeHtmlURL(fileName);
+	ZLFile imageFile(fullfileName);
+	if (!imageFile.exists()) {
+		return;
+	}
+
+	const bool paragraphWasOpen = bookReader(reader).paragraphIsOpen();
+	if (paragraphWasOpen) {
+		endParagraph(reader);
+	}
+	// (The former "./" stripping of fileName was removed: its result was never used.)
+	bookReader(reader).addImageReference(fullfileName);
+	bookReader(reader).addImage(fullfileName, new ZLFileImage(ZLFile(fullfileName), 0));
+	if (paragraphWasOpen) {
+		beginParagraph(reader);
+	}
+}
+
+XHTMLTagSvgAction::XHTMLTagSvgAction(XHTMLSvgImageAttributeNamePredicate &predicate) : myPredicate(predicate) {
+}
+
+void XHTMLTagSvgAction::doAtStart(XHTMLReader&, const char**) {
+	myPredicate.myIsEnabled = true;
+}
+
+void XHTMLTagSvgAction::doAtEnd(XHTMLReader&) {
+	myPredicate.myIsEnabled = false;
+}
+
+XHTMLSvgImageAttributeNamePredicate::XHTMLSvgImageAttributeNamePredicate() : ZLXMLReader::NamespaceAttributeNamePredicate(ZLXMLNamespace::XLink, "href"), myIsEnabled(false) {
+}
+
+bool XHTMLSvgImageAttributeNamePredicate::accepts(const ZLXMLReader &reader, const char *name) const {
+	return myIsEnabled && NamespaceAttributeNamePredicate::accepts(reader, name);
+}
+
+void XHTMLTagImageAction::doAtEnd(XHTMLReader&) {
+}
+
+XHTMLTagControlAction::XHTMLTagControlAction(FBTextKind control) : myControl(control) {
+}
+
+void XHTMLTagControlAction::doAtStart(XHTMLReader &reader, const char**) {
+	bookReader(reader).pushKind(myControl);
+	bookReader(reader).addControl(myControl, true);
+}
+
+void XHTMLTagControlAction::doAtEnd(XHTMLReader &reader) {
+	bookReader(reader).addControl(myControl, false);
+	bookReader(reader).popKind();
+}
+
+// A non-empty href opens a hyperlink of the detected type; internal targets
+// are rewritten to file aliases. A name attribute additionally registers a
+// hyperlink label. REGULAR on the stack marks an <a> without a usable href.
+void XHTMLTagHyperlinkAction::doAtStart(XHTMLReader &reader, const char **xmlattributes) {
+	const char *href = reader.attributeValue(xmlattributes, "href");
+	if (href != 0 && href[0] != '\0') {
+		const FBTextKind hyperlinkType = MiscUtil::referenceType(href);
+		std::string link = MiscUtil::decodeHtmlURL(href);
+		if (hyperlinkType == INTERNAL_HYPERLINK) {
+			if (link[0] == '#') {
+				link = reader.myReferenceAlias + link;
+			} else {
+				link = reader.normalizedReference(reader.myReferenceDirName + link);
+			}
+		}
+		myHyperlinkStack.push(hyperlinkType);
+		bookReader(reader).addHyperlinkControl(hyperlinkType, link);
+	} else {
+		myHyperlinkStack.push(REGULAR);
+	}
+	const char *name = reader.attributeValue(xmlattributes, "name");
+	if (name != 0) {
+		bookReader(reader).addHyperlinkLabel(
+			reader.myReferenceAlias + "#" + MiscUtil::decodeHtmlURL(name)
+		);
+	}
+}
+
+void XHTMLTagHyperlinkAction::doAtEnd(XHTMLReader &reader) {
+	// This object is shared by all readers through the static tag table, so
+	// do not assume that a matching doAtStart() has pushed an entry.
+	if (myHyperlinkStack.empty()) {
+		return;
+	}
+	FBTextKind kind = myHyperlinkStack.top();
+	if (kind != REGULAR) {
+		bookReader(reader).addControl(kind, false);
+	}
+	myHyperlinkStack.pop();
+}
+
+XHTMLTagParagraphWithControlAction::XHTMLTagParagraphWithControlAction(FBTextKind control) : myControl(control) {
+}
+
+void XHTMLTagParagraphWithControlAction::doAtStart(XHTMLReader &reader, const char**) {
+	if (myControl == TITLE && bookReader(reader).model().bookTextModel()->paragraphsNumber() > 1) {
+		bookReader(reader).insertEndOfSectionParagraph();
+	}
+	bookReader(reader).pushKind(myControl);
+	beginParagraph(reader);
+}
+
+void XHTMLTagParagraphWithControlAction::doAtEnd(XHTMLReader &reader) {
+	endParagraph(reader);
+	bookReader(reader).popKind();
+}
+
+void XHTMLTagPreAction::doAtStart(XHTMLReader &reader, const char**) {
+	reader.myPreformatted = true;
+	beginParagraph(reader);
+	bookReader(reader).addControl(PREFORMATTED, true);
+}
+
+void XHTMLTagPreAction::doAtEnd(XHTMLReader &reader) {
+	endParagraph(reader);
+	reader.myPreformatted = false;
+}
+
+// Registers `action` for `tag` and returns the action previously registered
+// for it (0 if there was none). The previous action is not deleted here.
+XHTMLTagAction *XHTMLReader::addAction(const std::string &tag, XHTMLTagAction *action) {
+	XHTMLTagAction *old = ourTagActions[tag];
+	ourTagActions[tag] = action;
+	return old;
+}
+
+// Fills the shared tag table with the default actions; does nothing if the
+// table already has entries.
+void XHTMLReader::fillTagTable() {
+	if (ourTagActions.empty()) {
+		//addAction("html", new XHTMLTagAction());
+		addAction("body", new XHTMLTagBodyAction());
+		//addAction("title", new XHTMLTagAction());
+		//addAction("meta", new XHTMLTagAction());
+		//addAction("script", new XHTMLTagAction());
+
+		//addAction("font", new XHTMLTagAction());
+		addAction("style", new XHTMLTagStyleAction());
+
+		addAction("p", new XHTMLTagParagraphAction());
+		addAction("h1", new XHTMLTagParagraphWithControlAction(H1));
+		addAction("h2", new XHTMLTagParagraphWithControlAction(H2));
+		addAction("h3", new XHTMLTagParagraphWithControlAction(H3));
+		addAction("h4", new XHTMLTagParagraphWithControlAction(H4));
+		addAction("h5", new XHTMLTagParagraphWithControlAction(H5));
+		addAction("h6", new XHTMLTagParagraphWithControlAction(H6));
+
+		//addAction("ol", new XHTMLTagAction());
+		//addAction("ul", new XHTMLTagAction());
+		//addAction("dl", new XHTMLTagAction());
+		addAction("li", new XHTMLTagItemAction());
+
+		addAction("strong", new XHTMLTagControlAction(STRONG));
+		addAction("b", new XHTMLTagControlAction(BOLD));
+		addAction("em", new XHTMLTagControlAction(EMPHASIS));
+		addAction("i", new XHTMLTagControlAction(ITALIC));
+		addAction("code", new XHTMLTagControlAction(CODE));
+		addAction("tt", new XHTMLTagControlAction(CODE));
+		addAction("kbd", new XHTMLTagControlAction(CODE));
+		addAction("var", new XHTMLTagControlAction(CODE));
+		addAction("samp", new XHTMLTagControlAction(CODE));
+		addAction("cite", new XHTMLTagControlAction(CITE));
+		addAction("sub", new XHTMLTagControlAction(SUB));
+		addAction("sup", new XHTMLTagControlAction(SUP));
+		addAction("dd", new XHTMLTagControlAction(DEFINITION_DESCRIPTION));
+		addAction("dfn", new XHTMLTagControlAction(DEFINITION));
+		addAction("strike", new XHTMLTagControlAction(STRIKETHROUGH));
+
+		addAction("a", new XHTMLTagHyperlinkAction());
+
+		addAction("img", new XHTMLTagImageAction("src"));
+		addAction("object", new XHTMLTagImageAction("data"));
+		// The same predicate object is owned by the "image" action and
+		// switched on and off by the "svg" action.
+		XHTMLSvgImageAttributeNamePredicate *predicate = new XHTMLSvgImageAttributeNamePredicate();
+		addAction("image", new XHTMLTagImageAction(predicate));
+		addAction("svg", new XHTMLTagSvgAction(*predicate));
+
+		//addAction("area", new XHTMLTagAction());
+		//addAction("map", new XHTMLTagAction());
+
+		//addAction("base", new XHTMLTagAction());
+		//addAction("blockquote", new XHTMLTagAction());
+		addAction("br", new XHTMLTagRestartParagraphAction());
+		//addAction("center", new XHTMLTagAction());
+		addAction("div", new XHTMLTagParagraphAction());
+		addAction("dt", new XHTMLTagParagraphAction());
+		//addAction("head", new XHTMLTagAction());
+		//addAction("hr", new XHTMLTagAction());
+		addAction("link", new XHTMLTagLinkAction());
+		//addAction("param", new XHTMLTagAction());
+		//addAction("q", new XHTMLTagAction());
+		//addAction("s", new XHTMLTagAction());
+
+		addAction("pre", new XHTMLTagPreAction());
+		//addAction("big", new XHTMLTagAction());
+		//addAction("small", new XHTMLTagAction());
+		//addAction("u", new XHTMLTagAction());
+
+		//addAction("table", new XHTMLTagAction());
+		addAction("td", new XHTMLTagParagraphAction());
+		addAction("th", new XHTMLTagParagraphAction());
+		//addAction("tr", new XHTMLTagAction());
+		//addAction("caption", new XHTMLTagAction());
+		//addAction("span", new XHTMLTagAction());
+	}
+}
+
+XHTMLReader::XHTMLReader(BookReader &modelReader) : myModelReader(modelReader) {
+}
+
+// Parses one XHTML file into the book model.
+// referenceName is the name under which other files refer to this one; it
+// determines the alias used for hyperlink labels and the directory used to
+// resolve relative links. All per-file state is reset before parsing.
+bool XHTMLReader::readFile(const ZLFile &file, const std::string &referenceName) {
+	fillTagTable();
+
+	myPathPrefix = MiscUtil::htmlDirectoryPrefix(file.path());
+	myReferenceAlias = fileAlias(referenceName);
+	myModelReader.addHyperlinkLabel(myReferenceAlias);
+
+	// Keep everything up to and including the last '/'; empty if there is none.
+	const std::size_t slash = referenceName.rfind('/');
+	myReferenceDirName = (slash != std::string::npos) ? referenceName.substr(0, slash + 1) : std::string();
+
+	myPreformatted = false;
+	myNewParagraphInProgress = false;
+	myReadState = READ_NOTHING;
+	myCurrentParagraphIsEmpty = true;
+
+	myStyleSheetTable.clear();
+	myCSSStack.clear();
+	myStyleEntryStack.clear();
+	myStylesToRemove = 0;
+
+	myDoPageBreakAfterStack.clear();
+	myStyleParser = new StyleSheetSingleStyleParser();
+	myTableParser.reset();
+
+	return readDocument(file);
+}
+
+// Applies the style registered for (tag, class), if any, and remembers it on
+// the style stack. Returns true if a style was found.
+bool XHTMLReader::addStyleEntry(const std::string tag, const std::string aClass) {
+	shared_ptr<ZLTextStyleEntry> entry = myStyleSheetTable.control(tag, aClass);
+	if (!entry.isNull()) {
+		myModelReader.addStyleEntry(*entry);
+		myStyleEntryStack.push_back(entry);
+		return true;
+	}
+	return false;
+}
+
+void XHTMLReader::startElementHandler(const char *tag, const char **attributes) {
+	static const std::string HASH = "#";
+	const char *id = attributeValue(attributes, "id");
+	if (id != 0) {
+		myModelReader.addHyperlinkLabel(myReferenceAlias + HASH + id);
+	}
+
+	const std::string sTag = ZLUnicodeUtil::toLower(tag);
+
+	const char *aClass = attributeValue(attributes, "class");
+	const std::string sClass = (aClass != 0) ? aClass : "";
+
+	if (myStyleSheetTable.doBreakBefore(sTag, sClass)) {
+		myModelReader.insertEndOfSectionParagraph();
+	}
+	myDoPageBreakAfterStack.push_back(myStyleSheetTable.doBreakAfter(sTag, sClass));
+
+	// find() instead of operator[]: unknown tag names coming from the
+	// document must not add null entries to the shared table.
+	const std::map<std::string,XHTMLTagAction*>::const_iterator actionIt = ourTagActions.find(sTag);
+	if (actionIt != ourTagActions.end() && actionIt->second != 0) {
+		actionIt->second->doAtStart(*this, attributes);
+	}
+
+	// Count how many style entries this element adds so that
+	// endElementHandler() can close exactly that many.
+	const int sizeBefore = myStyleEntryStack.size();
+	addStyleEntry(sTag, "");
+	addStyleEntry("", sClass);
+	addStyleEntry(sTag, sClass);
+	const char *style = attributeValue(attributes, "style");
+	if (style != 0) {
+		ZLLogger::Instance().println("CSS", std::string("parsing style attribute: ") + style);
+		shared_ptr<ZLTextStyleEntry> entry = myStyleParser->parseString(style);
+		if (!entry.isNull()) {
+			myModelReader.addStyleEntry(*entry);
+			myStyleEntryStack.push_back(entry);
+		}
+	}
+	myCSSStack.push_back(myStyleEntryStack.size() - sizeBefore);
+}
+
+void XHTMLReader::endElementHandler(const char *tag) {
+	for (int i = myCSSStack.back(); i > 0; --i) {
+		myModelReader.addStyleCloseEntry();
+	}
+	myStylesToRemove = myCSSStack.back();
+	myCSSStack.pop_back();
+
+	const std::map<std::string,XHTMLTagAction*>::const_iterator actionIt = ourTagActions.find(ZLUnicodeUtil::toLower(tag));
+	if (actionIt != ourTagActions.end() && actionIt->second != 0) {
+		actionIt->second->doAtEnd(*this);
+		myNewParagraphInProgress = false;
+	}
+
+	// Entries not already removed by endParagraph() are dropped here.
+	for (; myStylesToRemove > 0; --myStylesToRemove) {
+		myStyleEntryStack.pop_back();
+	}
+
+	if (myDoPageBreakAfterStack.back()) {
+		myModelReader.insertEndOfSectionParagraph();
+	}
+	myDoPageBreakAfterStack.pop_back();
+}
+
+// Opens a paragraph and re-applies every style entry on the stack. If any of
+// them sets the space before, an extra entry resets that length to 0 pixels.
+void XHTMLReader::beginParagraph() {
+	myCurrentParagraphIsEmpty = true;
+	myModelReader.beginParagraph();
+	bool doBlockSpaceBefore = false;
+	for (std::vector<shared_ptr<ZLTextStyleEntry> >::const_iterator it = myStyleEntryStack.begin(); it != myStyleEntryStack.end(); ++it) {
+		myModelReader.addStyleEntry(**it);
+		doBlockSpaceBefore =
+			doBlockSpaceBefore ||
+			(*it)->isFeatureSupported(ZLTextStyleEntry::LENGTH_SPACE_BEFORE);
+	}
+
+	if (doBlockSpaceBefore) {
+		ZLTextStyleEntry blockingEntry(ZLTextStyleEntry::STYLE_OTHER_ENTRY);
+		blockingEntry.setLength(
+			ZLTextStyleEntry::LENGTH_SPACE_BEFORE,
+			0,
+			ZLTextStyleEntry::SIZE_UNIT_PIXEL
+		);
+		myModelReader.addStyleEntry(blockingEntry);
+	}
+}
+
+// Closes the paragraph. The entries of the element being closed (the last
+// myStylesToRemove ones) are excluded from the space-after check, then
+// applied once more and removed from the stack.
+void XHTMLReader::endParagraph() {
+	bool doBlockSpaceAfter = false;
+	for (std::vector<shared_ptr<ZLTextStyleEntry> >::const_iterator it = myStyleEntryStack.begin(); it != myStyleEntryStack.end() - myStylesToRemove; ++it) {
+		doBlockSpaceAfter =
+			doBlockSpaceAfter ||
+			(*it)->isFeatureSupported(ZLTextStyleEntry::LENGTH_SPACE_AFTER);
+	}
+	if (doBlockSpaceAfter) {
+		ZLTextStyleEntry blockingEntry(ZLTextStyleEntry::STYLE_OTHER_ENTRY);
+		blockingEntry.setLength(
+			ZLTextStyleEntry::LENGTH_SPACE_AFTER,
+			0,
+			ZLTextStyleEntry::SIZE_UNIT_PIXEL
+		);
+		myModelReader.addStyleEntry(blockingEntry);
+	}
+	for (; myStylesToRemove > 0; --myStylesToRemove) {
+		myModelReader.addStyleEntry(*myStyleEntryStack.back());
+		myStyleEntryStack.pop_back();
+	}
+	myModelReader.endParagraph();
+}
+
+void XHTMLReader::characterDataHandler(const char *text, std::size_t len) {
+	switch (myReadState) {
+		case READ_NOTHING:
+			break;
+		case READ_STYLE:
+			if (!myTableParser.isNull()) {
+				myTableParser->parse(text, len);
+			}
+			break;
+		case READ_BODY:
+			if (myPreformatted) {
+				// A leading line break starts a new preformatted paragraph.
+				// len is checked first: an empty chunk must not be read or
+				// have its (unsigned) length decremented.
+				if (len > 0 && (*text == '\r' || *text == '\n')) {
+					endParagraph();
+					text += 1;
+					len -= 1;
+					beginParagraph();
+					myModelReader.addControl(PREFORMATTED, true);
+				}
+				// Leading whitespace becomes a fixed-width horizontal space.
+				std::size_t spaceCounter = 0;
+				while (spaceCounter < len && std::isspace((unsigned char)*(text + spaceCounter))) {
+					++spaceCounter;
+				}
+				myModelReader.addFixedHSpace(spaceCounter);
+				text += spaceCounter;
+				len -= spaceCounter;
+			} else if (myNewParagraphInProgress || !myModelReader.paragraphIsOpen()) {
+				// Drop leading whitespace at the start of a paragraph.
+				while (len > 0 && std::isspace((unsigned char)*text)) {
+					++text;
+					--len;
+				}
+			}
+			if (len > 0) {
+				myCurrentParagraphIsEmpty = false;
+				if (!myModelReader.paragraphIsOpen()) {
+					myModelReader.beginParagraph();
+				}
+				myModelReader.addData(std::string(text, len));
+				myNewParagraphInProgress = false;
+			}
+			break;
+	}
+}
+
+const std::vector<std::string> &XHTMLReader::externalDTDs() const {
+	return EntityFilesCollector::Instance().externalDTDs("xhtml");
+}
+
+bool XHTMLReader::processNamespaces() const {
+	return true;
+}
+
+// Replaces the file part of `reference` by its alias and keeps the
+// "#fragment" part, if any, unchanged.
+const std::string XHTMLReader::normalizedReference(const std::string &reference) const {
+	const std::size_t index = reference.find('#');
+	if (index == std::string::npos) {
+		return fileAlias(reference);
+	} else {
+		return fileAlias(reference.substr(0, index)) + reference.substr(index);
+	}
+}
+
+// Returns the alias of a file name, creating one on first use.
+// The name is looked up as given and then in URL-decoded, normalized form;
+// a new alias is the decimal number of aliases assigned so far. The returned
+// reference points into myFileNumbers.
+const std::string &XHTMLReader::fileAlias(const std::string &fileName) const {
+	std::map<std::string,std::string>::const_iterator it = myFileNumbers.find(fileName);
+	if (it != myFileNumbers.end()) {
+		return it->second;
+	}
+
+	const std::string correctedFileName =
+		ZLFileUtil::normalizeUnixPath(MiscUtil::decodeHtmlURL(fileName));
+	it = myFileNumbers.find(correctedFileName);
+	if (it != myFileNumbers.end()) {
+		return it->second;
+	}
+
+	std::string num;
+	ZLStringUtil::appendNumber(num, myFileNumbers.size());
+	myFileNumbers.insert(std::make_pair(correctedFileName, num));
+	it = myFileNumbers.find(correctedFileName);
+	return it->second;
+}
diff --git a/reader/src/formats/xhtml/XHTMLReader.h b/reader/src/formats/xhtml/XHTMLReader.h
new file mode 100644
index 0000000..08d4c02
--- /dev/null
+++ b/reader/src/formats/xhtml/XHTMLReader.h
@@ -0,0 +1,113 @@
+/*
+ * Copyright (C) 2004-2012 Geometer Plus <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at
your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+
+#ifndef __XHTMLREADER_H__
+#define __XHTMLREADER_H__
+
+#include <string>
+#include <map>
+#include <vector>
+
+#include <ZLXMLReader.h>
+
+#include "../css/StyleSheetTable.h"
+#include "../css/StyleSheetParser.h"
+
+class ZLFile;
+
+class BookReader;
+class XHTMLReader;
+
+/* Handler for one XHTML tag: doAtStart() runs when the element opens, doAtEnd() when it closes. */
+class XHTMLTagAction {
+
+public:
+	virtual ~XHTMLTagAction();
+
+	virtual void doAtStart(XHTMLReader &reader, const char **xmlattributes) = 0;
+	virtual void doAtEnd(XHTMLReader &reader) = 0;
+
+protected:
+	/* Accessors that let derived actions reach private parts of the reader without being its friends. */
+	static BookReader &bookReader(XHTMLReader &reader);
+	static const std::string &pathPrefix(XHTMLReader &reader);
+	static void beginParagraph(XHTMLReader &reader);
+	static void endParagraph(XHTMLReader &reader);
+};
+
+/* XML reader that converts an XHTML file into book model content by dispatching each tag to a registered XHTMLTagAction. */
+class XHTMLReader : public ZLXMLReader {
+
+public:
+	/* Registers the action for a tag and returns the one registered before (0 if none). */
+	static XHTMLTagAction *addAction(const std::string &tag, XHTMLTagAction *action);
+	/* Registers the default actions if the table is still empty. */
+	static void fillTagTable();
+
+private:
+	static std::map<std::string,XHTMLTagAction*> ourTagActions; /* tag name -> action, shared by all readers */
+
+public:
+	XHTMLReader(BookReader &modelReader);
+	/* Resets the per-file state and parses the file; referenceName is the name used for hyperlink aliases. */
+	bool readFile(const ZLFile &file, const std::string &referenceName);
+	/* Returns the alias of a file name, assigning a new one on first use. */
+	const std::string &fileAlias(const std::string &fileName) const;
+	/* Replaces the file part of a reference by its alias, keeping any "#fragment". */
+	const std::string normalizedReference(const std::string &reference) const;
+
+private:
+	void startElementHandler(const char *tag, const char **attributes);
+	void endElementHandler(const char *tag);
+	void characterDataHandler(const char *text, std::size_t len);
+
+	const std::vector<std::string> &externalDTDs() const;
+
+	bool processNamespaces() const;
+
+	void beginParagraph();
+	void endParagraph();
+	bool addStyleEntry(const std::string tag, const std::string aClass);
+
+private:
+	mutable std::map<std::string,std::string> myFileNumbers; /* file name -> alias; mutable because fileAlias() is const */
+
+	BookReader &myModelReader;
+	std::string myPathPrefix; /* prefix prepended to relative image and CSS references */
+	std::string myReferenceAlias; /* alias of the file being read */
+	std::string myReferenceDirName; /* reference name up to and including its last '/' */
+	bool myPreformatted; /* true between <pre> and </pre> */
+	bool myNewParagraphInProgress; /* a paragraph action opened a paragraph and no text has been added since */
+	StyleSheetTable myStyleSheetTable;
+	std::vector<int> myCSSStack; /* per open element: number of style entries it added */
+	std::vector<shared_ptr<ZLTextStyleEntry> > myStyleEntryStack; /* style entries of all open elements */
+	int myStylesToRemove; /* entries of the element being closed that are still on the stack */
+	std::vector<bool> myDoPageBreakAfterStack; /* per open element: insert a section break after it */
+	bool myCurrentParagraphIsEmpty;
+	shared_ptr<StyleSheetSingleStyleParser> myStyleParser; /* parses style="..." attributes */
+	shared_ptr<StyleSheetTableParser> myTableParser; /* parses <style> content; null outside of <style> */
+	enum {
+		READ_NOTHING,
+		READ_STYLE,
+		READ_BODY
+	} myReadState;
+
+	friend class XHTMLTagAction;
+	friend class XHTMLTagStyleAction;
+	friend class XHTMLTagLinkAction;
+	friend class XHTMLTagHyperlinkAction;
+	friend class XHTMLTagPreAction;
+	friend class XHTMLTagParagraphAction;
+	friend class XHTMLTagBodyAction;
+	friend class XHTMLTagRestartParagraphAction;
+};
+
+#endif /* __XHTMLREADER_H__ */ |