diff options
Diffstat (limited to 'lib/libchmfile/libchmfileimpl.cpp')
-rw-r--r-- | lib/libchmfile/libchmfileimpl.cpp | 1271 |
1 files changed, 1271 insertions, 0 deletions
diff --git a/lib/libchmfile/libchmfileimpl.cpp b/lib/libchmfile/libchmfileimpl.cpp new file mode 100644 index 0000000..6f1e052 --- /dev/null +++ b/lib/libchmfile/libchmfileimpl.cpp @@ -0,0 +1,1271 @@ +/*************************************************************************** + * Copyright (C) 2004-2007 by Georgy Yunaev, [email protected] * + * Portions Copyright (C) 2003 Razvan Cojocaru <[email protected]> * + * Please do not use email address above for bug reports; see * + * the README file * + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License * + * along with this program; if not, write to the * + * Free Software Foundation, Inc., * + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. * + ***************************************************************************/ + +#include <sys/types.h> + +#include <qcursor.h> +#include <qfile.h> +#include <qapplication.h> + +#include "config.h" +#include "chm_lib.h" +#include "bitfiddle.h" +#include "libchmfile.h" +#include "libchmurlfactory.h" +#include "libchmfileimpl.h" + +// Big-enough buffer size for use with various routines. +#define BUF_SIZE 4096 +#define COMMON_BUF_LEN 1025 + +#define TOPICS_ENTRY_LEN 16 +#define URLTBL_ENTRY_LEN 12 + +//#define DEBUGPARSER(A) qDebug A +#define DEBUGPARSER(A) ; + +class KCHMShowWaitCursor +{ + public: + KCHMShowWaitCursor () { QApplication::setOverrideCursor( QCursor(Qt::WaitCursor) ); } + ~KCHMShowWaitCursor () { QApplication::restoreOverrideCursor(); } +}; + + +LCHMFileImpl::LCHMFileImpl( ) +{ + m_chmFile = NULL; + m_home = m_filename = m_home = m_topicsFile = m_indexFile = m_font = QString::null; + m_entityDecodeMap.clear(); + m_textCodec = 0; + m_textCodecForSpecialFiles = 0; + m_detectedLCID = 0; + m_currentEncoding = 0; +} + + +LCHMFileImpl::~ LCHMFileImpl( ) +{ + closeAll(); +} + + +bool LCHMFileImpl::loadFile( const QString & archiveName ) +{ + QString filename; + + // If the file has a file:// prefix, remove it + if ( archiveName.startsWith( "file://" ) ) + filename = archiveName.mid( 7 ); // strip it + else + filename = archiveName; + + if( m_chmFile ) + closeAll(); + + m_chmFile = chm_open( QFile::encodeName(filename) ); + + if ( m_chmFile == NULL ) + return false; + + m_filename = filename; + + // Reset encoding + m_textCodec = 0; + m_textCodecForSpecialFiles = 0; + m_currentEncoding = 0; + + // Get information from /#WINDOWS and /#SYSTEM files (encoding, title, context file and so) + // and guess the encoding + getInfoFromWindows(); + getInfoFromSystem(); + guessTextEncoding(); + + // Check whether the search tables are present + if ( ResolveObject("/#TOPICS", &m_chmTOPICS) + && ResolveObject("/#STRINGS", &m_chmSTRINGS) + && ResolveObject("/#URLTBL", &m_chmURLTBL) + && ResolveObject("/#URLSTR", &m_chmURLSTR) ) + { + m_lookupTablesValid = true; + fillTopicsUrlMap(); + } + else + m_lookupTablesValid = false; + + if ( m_lookupTablesValid && ResolveObject ("/$FIftiMain", &m_chmFIftiMain) ) + m_searchAvailable = true; + else + m_searchAvailable = false; + + return true; +} + + +void LCHMFileImpl::closeAll( ) +{ + if ( m_chmFile == NULL ) + return; + + chm_close( m_chmFile ); + + m_chmFile = NULL; + m_home = m_filename = m_home = m_topicsFile = m_indexFile = m_font = QString::null; + m_entityDecodeMap.clear(); + m_textCodec = 0; + m_textCodecForSpecialFiles = 0; + m_detectedLCID = 0; + m_currentEncoding = 0; +} + + +QString LCHMFileImpl::decodeEntity( const QString & entity ) +{ + // Set up m_entityDecodeMap characters according to current textCodec + if ( m_entityDecodeMap.isEmpty() ) + { + m_entityDecodeMap["AElig"] = encodeWithCurrentCodec ("\306"); // capital AE diphthong (ligature) + m_entityDecodeMap["Aacute"] = encodeWithCurrentCodec ("\301"); // capital A, acute accent + m_entityDecodeMap["Acirc"] = encodeWithCurrentCodec ("\302"); // capital A, circumflex accent + m_entityDecodeMap["Agrave"] = encodeWithCurrentCodec ("\300"); // capital A, grave accent + m_entityDecodeMap["Aring"] = encodeWithCurrentCodec ("\305"); // capital A, ring + m_entityDecodeMap["Atilde"] = encodeWithCurrentCodec ("\303"); // capital A, tilde + m_entityDecodeMap["Auml"] = encodeWithCurrentCodec ("\304"); // capital A, dieresis or umlaut mark + m_entityDecodeMap["Ccedil"] = encodeWithCurrentCodec ("\307"); // capital C, cedilla + m_entityDecodeMap["Dstrok"] = encodeWithCurrentCodec ("\320"); // whatever + m_entityDecodeMap["ETH"] = encodeWithCurrentCodec ("\320"); // capital Eth, Icelandic + m_entityDecodeMap["Eacute"] = encodeWithCurrentCodec ("\311"); // capital E, acute accent + m_entityDecodeMap["Ecirc"] = encodeWithCurrentCodec ("\312"); // capital E, circumflex accent + m_entityDecodeMap["Egrave"] = encodeWithCurrentCodec ("\310"); // capital E, grave accent + m_entityDecodeMap["Euml"] = encodeWithCurrentCodec ("\313"); // capital E, dieresis or umlaut mark + m_entityDecodeMap["Iacute"] = encodeWithCurrentCodec ("\315"); // capital I, acute accent + m_entityDecodeMap["Icirc"] = encodeWithCurrentCodec ("\316"); // capital I, circumflex accent + m_entityDecodeMap["Igrave"] = encodeWithCurrentCodec ("\314"); // capital I, grave accent + m_entityDecodeMap["Iuml"] = encodeWithCurrentCodec ("\317"); // capital I, dieresis or umlaut mark + m_entityDecodeMap["Ntilde"] = encodeWithCurrentCodec ("\321"); // capital N, tilde + m_entityDecodeMap["Oacute"] = encodeWithCurrentCodec ("\323"); // capital O, acute accent + m_entityDecodeMap["Ocirc"] = encodeWithCurrentCodec ("\324"); // capital O, circumflex accent + m_entityDecodeMap["Ograve"] = encodeWithCurrentCodec ("\322"); // capital O, grave accent + m_entityDecodeMap["Oslash"] = encodeWithCurrentCodec ("\330"); // capital O, slash + m_entityDecodeMap["Otilde"] = encodeWithCurrentCodec ("\325"); // capital O, tilde + m_entityDecodeMap["Ouml"] = encodeWithCurrentCodec ("\326"); // capital O, dieresis or umlaut mark + m_entityDecodeMap["THORN"] = encodeWithCurrentCodec ("\336"); // capital THORN, Icelandic + m_entityDecodeMap["Uacute"] = encodeWithCurrentCodec ("\332"); // capital U, acute accent + m_entityDecodeMap["Ucirc"] = encodeWithCurrentCodec ("\333"); // capital U, circumflex accent + m_entityDecodeMap["Ugrave"] = encodeWithCurrentCodec ("\331"); // capital U, grave accent + m_entityDecodeMap["Uuml"] = encodeWithCurrentCodec ("\334"); // capital U, dieresis or umlaut mark + m_entityDecodeMap["Yacute"] = encodeWithCurrentCodec ("\335"); // capital Y, acute accent + m_entityDecodeMap["OElig"] = encodeWithCurrentCodec ("\338"); // capital Y, acute accent + m_entityDecodeMap["oelig"] = encodeWithCurrentCodec ("\339"); // capital Y, acute accent + + m_entityDecodeMap["aacute"] = encodeWithCurrentCodec ("\341"); // small a, acute accent + m_entityDecodeMap["acirc"] = encodeWithCurrentCodec ("\342"); // small a, circumflex accent + m_entityDecodeMap["aelig"] = encodeWithCurrentCodec ("\346"); // small ae diphthong (ligature) + m_entityDecodeMap["agrave"] = encodeWithCurrentCodec ("\340"); // small a, grave accent + m_entityDecodeMap["aring"] = encodeWithCurrentCodec ("\345"); // small a, ring + m_entityDecodeMap["atilde"] = encodeWithCurrentCodec ("\343"); // small a, tilde + m_entityDecodeMap["auml"] = encodeWithCurrentCodec ("\344"); // small a, dieresis or umlaut mark + m_entityDecodeMap["ccedil"] = encodeWithCurrentCodec ("\347"); // small c, cedilla + m_entityDecodeMap["eacute"] = encodeWithCurrentCodec ("\351"); // small e, acute accent + m_entityDecodeMap["ecirc"] = encodeWithCurrentCodec ("\352"); // small e, circumflex accent + m_entityDecodeMap["Scaron"] = encodeWithCurrentCodec ("\352"); // small e, circumflex accent + m_entityDecodeMap["egrave"] = encodeWithCurrentCodec ("\350"); // small e, grave accent + m_entityDecodeMap["eth"] = encodeWithCurrentCodec ("\360"); // small eth, Icelandic + m_entityDecodeMap["euml"] = encodeWithCurrentCodec ("\353"); // small e, dieresis or umlaut mark + m_entityDecodeMap["iacute"] = encodeWithCurrentCodec ("\355"); // small i, acute accent + m_entityDecodeMap["icirc"] = encodeWithCurrentCodec ("\356"); // small i, circumflex accent + m_entityDecodeMap["igrave"] = encodeWithCurrentCodec ("\354"); // small i, grave accent + m_entityDecodeMap["iuml"] = encodeWithCurrentCodec ("\357"); // small i, dieresis or umlaut mark + m_entityDecodeMap["ntilde"] = encodeWithCurrentCodec ("\361"); // small n, tilde + m_entityDecodeMap["oacute"] = encodeWithCurrentCodec ("\363"); // small o, acute accent + m_entityDecodeMap["ocirc"] = encodeWithCurrentCodec ("\364"); // small o, circumflex accent + m_entityDecodeMap["ograve"] = encodeWithCurrentCodec ("\362"); // small o, grave accent + m_entityDecodeMap["oslash"] = encodeWithCurrentCodec ("\370"); // small o, slash + m_entityDecodeMap["otilde"] = encodeWithCurrentCodec ("\365"); // small o, tilde + m_entityDecodeMap["ouml"] = encodeWithCurrentCodec ("\366"); // small o, dieresis or umlaut mark + m_entityDecodeMap["szlig"] = encodeWithCurrentCodec ("\337"); // small sharp s, German (sz ligature) + m_entityDecodeMap["thorn"] = encodeWithCurrentCodec ("\376"); // small thorn, Icelandic + m_entityDecodeMap["uacute"] = encodeWithCurrentCodec ("\372"); // small u, acute accent + m_entityDecodeMap["ucirc"] = encodeWithCurrentCodec ("\373"); // small u, circumflex accent + m_entityDecodeMap["ugrave"] = encodeWithCurrentCodec ("\371"); // small u, grave accent + m_entityDecodeMap["uuml"] = encodeWithCurrentCodec ("\374"); // small u, dieresis or umlaut mark + m_entityDecodeMap["yacute"] = encodeWithCurrentCodec ("\375"); // small y, acute accent + m_entityDecodeMap["yuml"] = encodeWithCurrentCodec ("\377"); // small y, dieresis or umlaut mark + + m_entityDecodeMap["iexcl"] = encodeWithCurrentCodec ("\241"); + m_entityDecodeMap["cent"] = encodeWithCurrentCodec ("\242"); + m_entityDecodeMap["pound"] = encodeWithCurrentCodec ("\243"); + m_entityDecodeMap["curren"] = encodeWithCurrentCodec ("\244"); + m_entityDecodeMap["yen"] = encodeWithCurrentCodec ("\245"); + m_entityDecodeMap["brvbar"] = encodeWithCurrentCodec ("\246"); + m_entityDecodeMap["sect"] = encodeWithCurrentCodec ("\247"); + m_entityDecodeMap["uml"] = encodeWithCurrentCodec ("\250"); + m_entityDecodeMap["ordf"] = encodeWithCurrentCodec ("\252"); + m_entityDecodeMap["laquo"] = encodeWithCurrentCodec ("\253"); + m_entityDecodeMap["not"] = encodeWithCurrentCodec ("\254"); + m_entityDecodeMap["shy"] = encodeWithCurrentCodec ("\255"); + m_entityDecodeMap["macr"] = encodeWithCurrentCodec ("\257"); + m_entityDecodeMap["deg"] = encodeWithCurrentCodec ("\260"); + m_entityDecodeMap["plusmn"] = encodeWithCurrentCodec ("\261"); + m_entityDecodeMap["sup1"] = encodeWithCurrentCodec ("\271"); + m_entityDecodeMap["sup2"] = encodeWithCurrentCodec ("\262"); + m_entityDecodeMap["sup3"] = encodeWithCurrentCodec ("\263"); + m_entityDecodeMap["acute"] = encodeWithCurrentCodec ("\264"); + m_entityDecodeMap["micro"] = encodeWithCurrentCodec ("\265"); + m_entityDecodeMap["para"] = encodeWithCurrentCodec ("\266"); + m_entityDecodeMap["middot"] = encodeWithCurrentCodec ("\267"); + m_entityDecodeMap["cedil"] = encodeWithCurrentCodec ("\270"); + m_entityDecodeMap["ordm"] = encodeWithCurrentCodec ("\272"); + m_entityDecodeMap["raquo"] = encodeWithCurrentCodec ("\273"); + m_entityDecodeMap["frac14"] = encodeWithCurrentCodec ("\274"); + m_entityDecodeMap["frac12"] = encodeWithCurrentCodec ("\275"); + m_entityDecodeMap["frac34"] = encodeWithCurrentCodec ("\276"); + m_entityDecodeMap["iquest"] = encodeWithCurrentCodec ("\277"); + m_entityDecodeMap["times"] = encodeWithCurrentCodec ("\327"); + m_entityDecodeMap["divide"] = encodeWithCurrentCodec ("\367"); + + m_entityDecodeMap["copy"] = encodeWithCurrentCodec ("\251"); // copyright sign + m_entityDecodeMap["reg"] = encodeWithCurrentCodec ("\256"); // registered sign + m_entityDecodeMap["nbsp"] = encodeWithCurrentCodec ("\240"); // non breaking space + + m_entityDecodeMap["fnof"] = QChar((unsigned short) 402); + + m_entityDecodeMap["Delta"] = QChar((unsigned short) 916); + m_entityDecodeMap["Pi"] = QChar((unsigned short) 928); + m_entityDecodeMap["Sigma"] = QChar((unsigned short) 931); + + m_entityDecodeMap["beta"] = QChar((unsigned short) 946); + m_entityDecodeMap["gamma"] = QChar((unsigned short) 947); + m_entityDecodeMap["delta"] = QChar((unsigned short) 948); + m_entityDecodeMap["eta"] = QChar((unsigned short) 951); + m_entityDecodeMap["theta"] = QChar((unsigned short) 952); + m_entityDecodeMap["lambda"] = QChar((unsigned short) 955); + m_entityDecodeMap["mu"] = QChar((unsigned short) 956); + m_entityDecodeMap["nu"] = QChar((unsigned short) 957); + m_entityDecodeMap["pi"] = QChar((unsigned short) 960); + m_entityDecodeMap["rho"] = QChar((unsigned short) 961); + + m_entityDecodeMap["lsquo"] = QChar((unsigned short) 8216); + m_entityDecodeMap["rsquo"] = QChar((unsigned short) 8217); + m_entityDecodeMap["rdquo"] = QChar((unsigned short) 8221); + m_entityDecodeMap["bdquo"] = QChar((unsigned short) 8222); + m_entityDecodeMap["trade"] = QChar((unsigned short) 8482); + m_entityDecodeMap["ldquo"] = QChar((unsigned short) 8220); + m_entityDecodeMap["ndash"] = QChar((unsigned short) 8211); + m_entityDecodeMap["mdash"] = QChar((unsigned short) 8212); + m_entityDecodeMap["bull"] = QChar((unsigned short) 8226); + m_entityDecodeMap["hellip"] = QChar((unsigned short) 8230); + m_entityDecodeMap["emsp"] = QChar((unsigned short) 8195); + m_entityDecodeMap["rarr"] = QChar((unsigned short) 8594); + m_entityDecodeMap["rArr"] = QChar((unsigned short) 8658); + m_entityDecodeMap["crarr"] = QChar((unsigned short) 8629); + m_entityDecodeMap["le"] = QChar((unsigned short) 8804); + m_entityDecodeMap["ge"] = QChar((unsigned short) 8805); + m_entityDecodeMap["lte"] = QChar((unsigned short) 8804); // wrong, but used somewhere + m_entityDecodeMap["gte"] = QChar((unsigned short) 8805); // wrong, but used somewhere + m_entityDecodeMap["dagger"] = QChar((unsigned short) 8224); + m_entityDecodeMap["Dagger"] = QChar((unsigned short) 8225); + m_entityDecodeMap["euro"] = QChar((unsigned short) 8364); + m_entityDecodeMap["asymp"] = QChar((unsigned short) 8776); + m_entityDecodeMap["isin"] = QChar((unsigned short) 8712); + m_entityDecodeMap["notin"] = QChar((unsigned short) 8713); + m_entityDecodeMap["prod"] = QChar((unsigned short) 8719); + m_entityDecodeMap["ne"] = QChar((unsigned short) 8800); + + m_entityDecodeMap["amp"] = "&"; // ampersand + m_entityDecodeMap["gt"] = ">"; // greater than + m_entityDecodeMap["lt"] = "<"; // less than + m_entityDecodeMap["quot"] = "\""; // double quote + m_entityDecodeMap["apos"] = "'"; // single quote + m_entityDecodeMap["frasl"] = "/"; + m_entityDecodeMap["minus"] = "-"; + m_entityDecodeMap["oplus"] = "+"; + m_entityDecodeMap["Prime"] = "\""; + } + + // If entity is an ASCII code like 〽 - just decode it + if ( entity[0] == '#' ) + { + bool valid; + unsigned int ascode = entity.mid(1).toUInt( &valid ); + + if ( !valid ) + { + qWarning ( "LCHMFileImpl::decodeEntity: could not decode HTML entity '%s'", entity.ascii() ); + return QString::null; + } + + return (QString) (QChar( ascode )); + } + else + { + QMap<QString, QString>::const_iterator it = m_entityDecodeMap.find( entity ); + + if ( it == m_entityDecodeMap.end() ) + { + qWarning ("LCHMFileImpl::decodeEntity: could not decode HTML entity '%s'", entity.ascii()); + return QString::null; + } + + return *it; + } +} + + +inline int LCHMFileImpl::findStringInQuotes (const QString& tag, int offset, QString& value, bool firstquote, bool decodeentities) +{ + int qbegin = tag.find ('"', offset); + + if ( qbegin == -1 ) + qFatal ("LCHMFileImpl::findStringInQuotes: cannot find first quote in <param> tag: '%s'", tag.ascii()); + + int qend = firstquote ? tag.find ('"', qbegin + 1) : tag.findRev ('"'); + + if ( qend == -1 || qend <= qbegin ) + qFatal ("LCHMFileImpl::findStringInQuotes: cannot find last quote in <param> tag: '%s'", tag.ascii()); + + // If we do not need to decode HTML entities, just return. + if ( decodeentities ) + { + QString htmlentity = QString::null; + bool fill_entity = false; + + value.reserve (qend - qbegin); // to avoid multiple memory allocations + + for ( int i = qbegin + 1; i < qend; i++ ) + { + if ( !fill_entity ) + { + if ( tag[i] == '&' ) // HTML entity starts + fill_entity = true; + else + value.append (tag[i]); + } + else + { + if ( tag[i] == ';' ) // HTML entity ends + { + // If entity is an ASCII code, just decode it + QString decode = decodeEntity( htmlentity ); + + if ( decode.isNull() ) + break; + + value.append ( decode ); + htmlentity = QString::null; + fill_entity = false; + } + else + htmlentity.append (tag[i]); + } + } + } + else + value = tag.mid (qbegin + 1, qend - qbegin - 1); + + return qend + 1; +} + + +bool LCHMFileImpl::searchWord (const QString& text, + bool wholeWords, + bool titlesOnly, + LCHMSearchProgressResults& results, + bool phrase_search) +{ + bool partial = false; + + if ( text.isEmpty() || !m_searchAvailable ) + return false; + + QString searchword = (QString) convertSearchWord (text); + +#define FTS_HEADER_LEN 0x32 + unsigned char header[FTS_HEADER_LEN]; + + if ( RetrieveObject (&m_chmFIftiMain, header, 0, FTS_HEADER_LEN) == 0 ) + return false; + + unsigned char doc_index_s = header[0x1E], doc_index_r = header[0x1F]; + unsigned char code_count_s = header[0x20], code_count_r = header[0x21]; + unsigned char loc_codes_s = header[0x22], loc_codes_r = header[0x23]; + + if(doc_index_s != 2 || code_count_s != 2 || loc_codes_s != 2) + { + // Don't know how to use values other than 2 yet. Maybe next chmspec. + return false; + } + + unsigned char* cursor32 = header + 0x14; + u_int32_t node_offset = UINT32ARRAY(cursor32); + + cursor32 = header + 0x2e; + u_int32_t node_len = UINT32ARRAY(cursor32); + + unsigned char* cursor16 = header + 0x18; + u_int16_t tree_depth = UINT16ARRAY(cursor16); + + unsigned char word_len, pos; + QString word; + u_int32_t i = sizeof(u_int16_t); + u_int16_t free_space; + + QMemArray<unsigned char> buffer(node_len); + + node_offset = GetLeafNodeOffset (searchword, node_offset, node_len, tree_depth); + + if ( !node_offset ) + return false; + + do + { + // got a leaf node here. + if ( RetrieveObject (&m_chmFIftiMain, buffer.data(), node_offset, node_len) == 0 ) + return false; + + cursor16 = buffer.data() + 6; + free_space = UINT16ARRAY(cursor16); + + i = sizeof(u_int32_t) + sizeof(u_int16_t) + sizeof(u_int16_t); + u_int64_t wlc_count, wlc_size; + u_int32_t wlc_offset; + + while (i < node_len - free_space) + { + word_len = *(buffer.data() + i); + pos = *(buffer.data() + i + 1); + + char *wrd_buf = new char[word_len]; + memcpy (wrd_buf, buffer.data() + i + 2, word_len - 1); + wrd_buf[word_len - 1] = 0; + + if ( pos == 0 ) + word = wrd_buf; + else + word = word.mid (0, pos) + wrd_buf; + + delete[] wrd_buf; + + i += 2 + word_len; + unsigned char title = *(buffer.data() + i - 1); + + size_t encsz; + wlc_count = be_encint (buffer.data() + i, encsz); + i += encsz; + + cursor32 = buffer.data() + i; + wlc_offset = UINT32ARRAY(cursor32); + + i += sizeof(u_int32_t) + sizeof(u_int16_t); + wlc_size = be_encint (buffer.data() + i, encsz); + i += encsz; + + cursor32 = buffer.data(); + node_offset = UINT32ARRAY(cursor32); + + if ( !title && titlesOnly ) + continue; + + if ( wholeWords && searchword == word ) + return ProcessWLC(wlc_count, wlc_size, + wlc_offset, doc_index_s, + doc_index_r,code_count_s, + code_count_r, loc_codes_s, + loc_codes_r, results, phrase_search); + + if ( !wholeWords ) + { + if ( word.startsWith (searchword)) + { + partial = true; + + ProcessWLC(wlc_count, wlc_size, + wlc_offset, doc_index_s, + doc_index_r,code_count_s, + code_count_r, loc_codes_s, + loc_codes_r, results, phrase_search); + + } + else if ( QString::compare (searchword, word.mid(0, searchword.length())) < -1 ) + break; + } + } + } + while ( !wholeWords && word.startsWith (searchword) && node_offset ); + + return partial; +} + + +bool LCHMFileImpl::ResolveObject(const QString& fileName, chmUnitInfo *ui) const +{ + return m_chmFile != NULL + && ::chm_resolve_object(m_chmFile, fileName.ascii(), ui) == + CHM_RESOLVE_SUCCESS; +} + + +size_t LCHMFileImpl::RetrieveObject(const chmUnitInfo *ui, unsigned char *buffer, + LONGUINT64 fileOffset, LONGINT64 bufferSize) const +{ +#if USE_BUILTIN_CHMLIB + return ::chm_retrieve_object(m_chmFile, ui, buffer, + fileOffset, bufferSize); +#else + return ::chm_retrieve_object(m_chmFile, const_cast<chmUnitInfo*>(ui), + buffer, fileOffset, bufferSize); +#endif +} + + +inline u_int32_t LCHMFileImpl::GetLeafNodeOffset(const QString& text, + u_int32_t initialOffset, + u_int32_t buffSize, + u_int16_t treeDepth) +{ + u_int32_t test_offset = 0; + unsigned char* cursor16, *cursor32; + unsigned char word_len, pos; + u_int32_t i = sizeof(u_int16_t); + QMemArray<unsigned char> buffer(buffSize); + QString word; + + while(--treeDepth) + { + if ( initialOffset == test_offset ) + return 0; + + test_offset = initialOffset; + if ( RetrieveObject (&m_chmFIftiMain, buffer.data(), initialOffset, buffSize) == 0 ) + return 0; + + cursor16 = buffer.data(); + u_int16_t free_space = UINT16ARRAY(cursor16); + + while (i < buffSize - free_space ) + { + word_len = *(buffer.data() + i); + pos = *(buffer.data() + i + 1); + + char *wrd_buf = new char[word_len]; + memcpy ( wrd_buf, buffer.data() + i + 2, word_len - 1 ); + wrd_buf[word_len - 1] = 0; + + if ( pos == 0 ) + word = wrd_buf; + else + word = word.mid(0, pos) + wrd_buf; + + delete[] wrd_buf; + + if ( text <= word ) + { + cursor32 = buffer.data() + i + word_len + 1; + initialOffset = UINT32ARRAY(cursor32); + break; + } + + i += word_len + sizeof(unsigned char) + + sizeof(u_int32_t) + sizeof(u_int16_t); + } + } + + if ( initialOffset == test_offset ) + return 0; + + return initialOffset; +} + + +inline bool LCHMFileImpl::ProcessWLC (u_int64_t wlc_count, u_int64_t wlc_size, + u_int32_t wlc_offset, unsigned char ds, + unsigned char dr, unsigned char cs, + unsigned char cr, unsigned char ls, + unsigned char lr, + LCHMSearchProgressResults& results, + bool phrase_search) +{ + int wlc_bit = 7; + u_int64_t index = 0, count; + size_t length, off = 0; + QMemArray<unsigned char> buffer (wlc_size); + unsigned char *cursor32; + + unsigned char entry[TOPICS_ENTRY_LEN]; + unsigned char combuf[13]; + + if ( RetrieveObject (&m_chmFIftiMain, buffer.data(), wlc_offset, wlc_size) == 0 ) + return false; + + for ( u_int64_t i = 0; i < wlc_count; ++i ) + { + if ( wlc_bit != 7 ) + { + ++off; + wlc_bit = 7; + } + + index += sr_int (buffer.data() + off, &wlc_bit, ds, dr, length); + off += length; + + if ( RetrieveObject (&m_chmTOPICS, entry, index * 16, TOPICS_ENTRY_LEN) == 0 ) + return false; + + LCHMSearchProgressResult progres; + + cursor32 = entry + 4; + progres.titleoff = UINT32ARRAY(cursor32); + + cursor32 = entry + 8; + progres.urloff = UINT32ARRAY(cursor32); + + if ( RetrieveObject (&m_chmURLTBL, combuf, progres.urloff, 12) == 0 ) + return false; + + cursor32 = combuf + 8; + progres.urloff = UINT32ARRAY (cursor32); + + count = sr_int (buffer.data() + off, &wlc_bit, cs, cr, length); + off += length; + + if ( phrase_search ) + progres.offsets.reserve (count); + + for (u_int64_t j = 0; j < count; ++j) + { + u_int64_t lcode = sr_int (buffer.data() + off, &wlc_bit, ls, lr, length); + off += length; + + if ( phrase_search ) + progres.offsets.push_back (lcode); + } + + results.push_back (progres); + } + + return true; +} + + +bool LCHMFileImpl::getInfoFromWindows() +{ +#define WIN_HEADER_LEN 0x08 + unsigned char buffer[BUF_SIZE]; + unsigned int factor; + chmUnitInfo ui; + long size = 0; + + if ( ResolveObject("/#WINDOWS", &ui) ) + { + if ( !RetrieveObject(&ui, buffer, 0, WIN_HEADER_LEN) ) + return false; + + u_int32_t entries = get_int32_le( (u_int32_t *)(buffer) ); + u_int32_t entry_size = get_int32_le( (u_int32_t *)(buffer + 0x04) ); + + QByteArray uptr(entries * entry_size); + unsigned char* raw = (unsigned char*) uptr.data(); + + if ( !RetrieveObject (&ui, raw, 8, entries * entry_size) ) + return false; + + if( !ResolveObject ("/#STRINGS", &ui) ) + return false; + + for ( u_int32_t i = 0; i < entries; ++i ) + { + u_int32_t offset = i * entry_size; + + u_int32_t off_title = get_int32_le( (u_int32_t *)(raw + offset + 0x14) ); + u_int32_t off_home = get_int32_le( (u_int32_t *)(raw + offset + 0x68) ); + u_int32_t off_hhc = get_int32_le( (u_int32_t *)(raw + offset + 0x60) ); + u_int32_t off_hhk = get_int32_le( (u_int32_t *)(raw + offset + 0x64) ); + + factor = off_title / 4096; + + if ( size == 0 ) + size = RetrieveObject(&ui, buffer, factor * 4096, BUF_SIZE); + + if ( size && off_title ) + m_title = QString ((const char*) (buffer + off_title % 4096)); + + if ( factor != off_home / 4096) + { + factor = off_home / 4096; + size = RetrieveObject (&ui, buffer, factor * 4096, BUF_SIZE); + } + + if ( size && off_home ) + m_home = QString("/") + QString( (const char*) buffer + off_home % 4096); + + if ( factor != off_hhc / 4096) + { + factor = off_hhc / 4096; + size = RetrieveObject(&ui, buffer, factor * 4096, BUF_SIZE); + } + + if ( size && off_hhc ) + m_topicsFile = QString("/") + QString ((const char*) buffer + off_hhc % 4096); + + if ( factor != off_hhk / 4096) + { + factor = off_hhk / 4096; + size = RetrieveObject (&ui, buffer, factor * 4096, BUF_SIZE); + } + + if ( size && off_hhk ) + m_indexFile = QString("/") + QString((const char*) buffer + off_hhk % 4096); + } + } + return true; +} + + + +bool LCHMFileImpl::getInfoFromSystem() +{ + unsigned char buffer[BUF_SIZE]; + chmUnitInfo ui; + + int index = 0; + unsigned char* cursor = NULL, *p; + u_int16_t value = 0; + long size = 0; + + // Run the first loop to detect the encoding. We need this, because title could be + // already encoded in user encoding. Same for file names + if ( !ResolveObject ("/#SYSTEM", &ui) ) + return false; + + // Can we pull BUFF_SIZE bytes of the #SYSTEM file? + if ( (size = RetrieveObject (&ui, buffer, 4, BUF_SIZE)) == 0 ) + return false; + + buffer[size - 1] = 0; + + // First loop to detect the encoding + for ( index = 0; index < (size - 1 - (long)sizeof(u_int16_t)) ;) + { + cursor = buffer + index; + value = UINT16ARRAY(cursor); + + switch(value) + { + case 0: + index += 2; + cursor = buffer + index; + + if(m_topicsFile.isEmpty()) + m_topicsFile = QString("/") + QString((const char*) buffer + index + 2); + + break; + + case 1: + index += 2; + cursor = buffer + index; + + if(m_indexFile.isEmpty()) + m_indexFile = QString("/") + QString ((const char*)buffer + index + 2); + break; + + case 2: + index += 2; + cursor = buffer + index; + + if(m_home.isEmpty() || m_home == "/") + m_home = QString("/") + QString ((const char*) buffer + index + 2); + break; + + case 3: + index += 2; + cursor = buffer + index; + m_title = QString((const char*) (buffer + index + 2)); + break; + + case 4: + index += 2; + cursor = buffer + index; + + p = buffer + index + 2; + m_detectedLCID = (short) (p[0] | (p[1]<<8)); + + break; + + case 6: + index += 2; + cursor = buffer + index; + + if(m_topicsFile.isEmpty()) { + QString topicAttempt = "/", tmp; + topicAttempt += QString ((const char*) buffer +index +2); + + tmp = topicAttempt + ".hhc"; + + if ( ResolveObject( tmp, &ui) ) + m_topicsFile = tmp; + + tmp = topicAttempt + ".hhk"; + + if ( ResolveObject( tmp, &ui) ) + m_indexFile = tmp; + } + break; + + case 16: + index += 2; + cursor = buffer + index; + + m_font = QString ((const char*) buffer + index + 2); + break; + + default: + index += 2; + cursor = buffer + index; + } + + value = UINT16ARRAY(cursor); + index += value + 2; + } + + return true; +} + + +QCString LCHMFileImpl::convertSearchWord( const QString & src ) +{ + static const char * searchwordtable[128] = + { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, "s", 0, "oe", 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, "s", 0, "oe", 0, 0, "y", + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + "a", "a", "a", "a", "a", "a", "ae", "c", "e", "e", "e", "e", "i", "i", "i", "i", + "d", "n", "o", "o", "o", "o", "o", 0, "o", "u", "u", "u", "u", "y", "\xDE", "ss", + "a", "a", "a", "a", "a", "a", "ae", "c", "e", "e", "e", "e", "i", "i", "i", "i", + "o", "n", "o", "o", "o", "o", "o", 0, "o", "u", "u", "u", "u", "y", "\xFE", "y" + }; + + if ( !m_textCodec ) + return (QCString) src.lower(); + + QCString dest = m_textCodec->fromUnicode (src); + + for ( unsigned int i = 0; i < dest.size(); i++ ) + { + if ( dest[i] & 0x80 ) + { + int index = dest[i] & 0x7F; + if ( searchwordtable[index] ) + dest.replace (i, 1, searchwordtable[index]); + else + dest.remove (i, 1); + } + } + + return dest.lower(); +} + + + +void LCHMFileImpl::getSearchResults( const LCHMSearchProgressResults& tempres, + QStringList * results, + unsigned int limit_results ) +{ + unsigned char combuf [COMMON_BUF_LEN]; + QMap<u_int32_t, u_int32_t> urlsmap; // used to prevent duplicated urls + + for ( unsigned int i = 0; i < tempres.size(); i++ ) + { + if ( urlsmap.find (tempres[i].urloff) != urlsmap.end() ) + continue; + + urlsmap[tempres[i].urloff] = 1; + + if ( RetrieveObject (&m_chmURLSTR, combuf, tempres[i].urloff + 8, COMMON_BUF_LEN - 1) == 0 ) + continue; + + combuf[COMMON_BUF_LEN - 1] = 0; + results->push_back( LCHMUrlFactory::makeURLabsoluteIfNeeded( (const char*) combuf ) ); + + if ( --limit_results == 0 ) + break; + } +} + + +QString LCHMFileImpl::normalizeUrl( const QString & path ) const +{ + int pos = path.find ('#'); + QString fixedpath = pos == -1 ? path : path.left (pos); + + return LCHMUrlFactory::makeURLabsoluteIfNeeded( fixedpath ); +} + + +/* + * FIXME: <OBJECT type="text/sitemap"><param name="Merge" value="hhaxref.chm::/HHOCX_c.hhc"></OBJECT> + * (from htmlhelp.chm) +*/ +bool LCHMFileImpl::parseFileAndFillArray( const QString & file, QT34VECTOR< LCHMParsedEntry > * data, bool asIndex ) +{ + QString src; + const int MAX_NEST_DEPTH = 256; + + if ( !getFileContentAsString( &src, file ) || src.isEmpty() ) + return false; + + KCHMShowWaitCursor wc; + +/* + // Save the index for debugging purposes + QFile outfile( "parsed.htm" ); + + if ( outfile.open( IO_WriteOnly ) ) + { + QTextStream textstream( &outfile ); + textstream << src; + outfile.close(); + } +*/ + + unsigned int defaultimagenum = asIndex ? LCHMBookIcons::IMAGE_INDEX : LCHMBookIcons::IMAGE_AUTO; + int pos = 0, indent = 0, root_indent_offset = 0; + bool in_object = false, root_indent_offset_set = false; + + LCHMParsedEntry entry; + entry.imageid = defaultimagenum; + + // Split the HHC file by HTML tags + int stringlen = src.length(); + + while ( pos < stringlen && (pos = src.find ('<', pos)) != -1 ) + { + int i, word_end = 0; + + for ( i = ++pos; i < stringlen; i++ ) + { + // If a " or ' is found, skip to the next one. + if ( (src[i] == '"' || src[i] == '\'') ) + { + // find where quote ends, either by another quote, or by '>' symbol (some people don't know HTML) + int nextpos = src.find (src[i], i+1); + if ( nextpos == -1 && (nextpos = src.find ('>', i+1)) == -1 ) + { + qWarning ("LCHMFileImpl::ParseHhcAndFillTree: corrupted TOC: %s", src.mid(i).ascii()); + return false; + } + + i = nextpos; + } + else if ( src[i] == '>' ) + break; + else if ( !src[i].isLetterOrNumber() && src[i] != '/' && !word_end ) + word_end = i; + } + + QString tagword, tag = src.mid (pos, i - pos); + + if ( word_end ) + tagword = src.mid (pos, word_end - pos).lower(); + else + tagword = tag.lower(); + + //qDebug ("tag: '%s', tagword: '%s'\n", tag.ascii(), tagword.ascii()); + + // <OBJECT type="text/sitemap"> - a topic entry + if ( tagword == "object" && tag.find ("text/sitemap", 0, false) != -1 ) + in_object = true; + else if ( tagword == "/object" && in_object ) + { + // a topic entry closed. Add a tree item + if ( entry.name ) + { + if ( !root_indent_offset_set ) + { + root_indent_offset_set = true; + root_indent_offset = indent; + + if ( root_indent_offset > 1 ) + qWarning("CHM has improper index; root indent offset is %d", root_indent_offset); + } + + int real_indent = indent - root_indent_offset; + + entry.indent = real_indent; + data->push_back( entry ); + } + else + { + if ( !entry.urls.isEmpty() ) + qDebug ("LCHMFileImpl::ParseAndFillTopicsTree: <object> tag with url \"%s\" is parsed, but name is empty.", entry.urls[0].ascii()); + else + qDebug ("LCHMFileImpl::ParseAndFillTopicsTree: <object> tag is parsed, but both name and url are empty."); + } + + entry.name = QString::null; + entry.urls.clear(); + entry.imageid = defaultimagenum; + in_object = false; + } + else if ( tagword == "param" && in_object ) + { + // <param name="Name" value="First Page"> + int offset; // strlen("param ") + QString name_pattern = "name=", value_pattern = "value="; + QString pname, pvalue; + + if ( (offset = tag.find (name_pattern, 0, FALSE)) == -1 ) + qFatal ("LCHMFileImpl::ParseAndFillTopicsTree: bad <param> tag '%s': no name=\n", tag.ascii()); + + // offset+5 skips 'name=' + offset = findStringInQuotes (tag, offset + name_pattern.length(), pname, TRUE, FALSE); + pname = pname.lower(); + + if ( (offset = tag.find (value_pattern, offset, FALSE)) == -1 ) + qFatal ("LCHMFileImpl::ParseAndFillTopicsTree: bad <param> tag '%s': no value=\n", tag.ascii()); + + // offset+6 skips 'value=' + findStringInQuotes (tag, offset + value_pattern.length(), pvalue, FALSE, TRUE); + + //qDebug ("<param>: name '%s', value '%s'", pname.ascii(), pvalue.ascii()); + + if ( pname == "name" ) + { + // Some help files contain duplicate names, where the second name is empty. Work it around by keeping the first one + if ( !pvalue.isEmpty() ) + entry.name = pvalue; + } + else if ( pname == "local" ) + { + // Check for URL duplication + QString url = LCHMUrlFactory::makeURLabsoluteIfNeeded( pvalue ); + + if ( entry.urls.find( url ) == entry.urls.end() ) + entry.urls.push_back( url ); + } + else if ( pname == "see also" && asIndex && entry.name != pvalue ) + entry.urls.push_back (":" + pvalue); + else if ( pname == "imagenumber" ) + { + bool bok; + int imgnum = pvalue.toInt (&bok); + + if ( bok && imgnum >= 0 && imgnum < LCHMBookIcons::MAX_BUILTIN_ICONS ) + entry.imageid = imgnum; + } + } + else if ( tagword == "ul" ) // increase indent level + { + // Fix for buggy help files + if ( ++indent >= MAX_NEST_DEPTH ) + qFatal("LCHMFileImpl::ParseAndFillTopicsTree: max nest depth (%d) is reached, error in help file", MAX_NEST_DEPTH); + + // This intended to fix <ul><ul>, which was seen in some buggy chm files, + // and brokes rootentry[indent-1] check + } + else if ( tagword == "/ul" ) // decrease indent level + { + if ( --indent < root_indent_offset ) + indent = root_indent_offset; + + DEBUGPARSER(("</ul>: new intent is %d\n", indent - root_indent_offset)); + } + + pos = i; + } + + return true; +} + + +bool LCHMFileImpl::getFileContentAsBinary( QByteArray * data, const QString & url ) const +{ + chmUnitInfo ui; + + if( !ResolveObject( url, &ui ) ) + return false; + + data->resize( ui.length ); + + if ( RetrieveObject( &ui, (unsigned char*) data->data(), 0, ui.length ) ) + return true; + else + return false; +} + + +bool LCHMFileImpl::getFileContentAsString( QString * str, const QString & url, bool internal_encoding ) +{ + QByteArray buf; + + if ( getFileContentAsBinary( &buf, url ) ) + { + unsigned int length = buf.size(); + + if ( length > 0 ) + { + buf.resize( length + 1 ); + buf [length] = '\0'; + + *str = internal_encoding ? (QString)((const char*) buf) : encodeWithCurrentCodec((const char*) buf); + return true; + } + } + + return false; +} + + +QString LCHMFileImpl::getTopicByUrl( const QString & url ) const +{ + QMap< QString, QString >::const_iterator it = m_url2topics.find( url ); + + if ( it == m_url2topics.end() ) + return QString::null; + + return it.data(); +} + + +static int chm_enumerator_callback( struct chmFile*, struct chmUnitInfo *ui, void *context ) +{ + ((QStringList*) context)->push_back( ui->path ); + return CHM_ENUMERATOR_CONTINUE; +} + +bool LCHMFileImpl::enumerateFiles( QStringList * files ) +{ + files->clear(); + return chm_enumerate( m_chmFile, CHM_ENUMERATE_ALL, chm_enumerator_callback, files ); +} + +const QPixmap * LCHMFileImpl::getBookIconPixmap( unsigned int imagenum ) +{ + return m_imagesKeeper.getImage( imagenum ); +} + +bool LCHMFileImpl::setCurrentEncoding( const LCHMTextEncoding * encoding ) +{ + m_currentEncoding = encoding; + return changeFileEncoding( encoding->qtcodec ); +} + + +bool LCHMFileImpl::guessTextEncoding( ) +{ + const LCHMTextEncoding * enc = 0; + + if ( !m_detectedLCID || (enc = lookupByLCID (m_detectedLCID)) == 0 ) + qFatal ("Could not detect text encoding by LCID"); + + if ( changeFileEncoding (enc->qtcodec) ) + { + m_currentEncoding = enc; + return true; + } + + return false; +} + +bool LCHMFileImpl::changeFileEncoding( const char *qtencoding ) +{ + // Encoding could be either simple Qt codepage, or set like CP1251/KOI8, which allows to + // set up encodings separately for text (first) and internal files (second) + const char * p = strchr( qtencoding, '/' ); + if ( p ) + { + char buf[128]; // much bigger that any encoding possible. No DoS; all encodings are hardcoded. + strcpy( buf, qtencoding ); + buf[p - qtencoding] = '\0'; + + m_textCodec = QTextCodec::codecForName( buf ); + + if ( !m_textCodec ) + { + qWarning( "Could not set up Text Codec for encoding '%s'", buf ); + return false; + } + + m_textCodecForSpecialFiles = QTextCodec::codecForName( p + 1 ); + + if ( !m_textCodecForSpecialFiles ) + { + qWarning( "Could not set up Text Codec for encoding '%s'", p + 1 ); + return false; + } + } + else + { + m_textCodecForSpecialFiles = m_textCodec = QTextCodec::codecForName (qtencoding); + + if ( !m_textCodec ) + { + qWarning( "Could not set up Text Codec for encoding '%s'", qtencoding ); + return false; + } + } + + m_entityDecodeMap.clear(); + return true; +} + + +void LCHMFileImpl::fillTopicsUrlMap() +{ + if ( !m_lookupTablesValid ) + return; + + // Read those tables + QByteArray topics( m_chmTOPICS.length ), urltbl( m_chmURLTBL.length ), urlstr( m_chmURLSTR.length ), strings( m_chmSTRINGS.length ); + + if ( !RetrieveObject( &m_chmTOPICS, (unsigned char*) topics.data(), 0, m_chmTOPICS.length ) + || !RetrieveObject( &m_chmURLTBL, (unsigned char*) urltbl.data(), 0, m_chmURLTBL.length ) + || !RetrieveObject( &m_chmURLSTR, (unsigned char*) urlstr.data(), 0, m_chmURLSTR.length ) + || !RetrieveObject( &m_chmSTRINGS, (unsigned char*) strings.data(), 0, m_chmSTRINGS.length ) ) + return; + + for ( unsigned int i = 0; i < m_chmTOPICS.length; i += TOPICS_ENTRY_LEN ) + { + u_int32_t off_title = get_int32_le( (u_int32_t *)(topics.data() + i + 4) ); + u_int32_t off_url = get_int32_le( (u_int32_t *)(topics.data() + i + 8) ); + off_url = get_int32_le( (u_int32_t *)( urltbl.data() + off_url + 8) ) + 8; + + QString url = LCHMUrlFactory::makeURLabsoluteIfNeeded( (const char*) urlstr.data() + off_url ); + + if ( off_title < strings.size() ) + m_url2topics[url] = encodeWithCurrentCodec ( (const char*) strings.data() + off_title ); + else + m_url2topics[url] = "Untitled"; + } +} + + +bool LCHMFileImpl::getFileSize(unsigned int * size, const QString & url) +{ + chmUnitInfo ui; + + if( !ResolveObject( url, &ui ) ) + return false; + + *size = ui.length; + return true; +} |