summaryrefslogtreecommitdiffstats
path: root/src/kchmsearchengine_impl.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/kchmsearchengine_impl.cpp')
-rw-r--r--src/kchmsearchengine_impl.cpp572
1 files changed, 572 insertions, 0 deletions
diff --git a/src/kchmsearchengine_impl.cpp b/src/kchmsearchengine_impl.cpp
new file mode 100644
index 0000000..286832a
--- /dev/null
+++ b/src/kchmsearchengine_impl.cpp
@@ -0,0 +1,572 @@
+/***************************************************************************
+ * Copyright (C) 2004-2007 by Georgy Yunaev, [email protected] *
+ * Portions Copyright (C) 2000-2005 Trolltech AS. *
+ * Please do not use email address above for bug reports; see *
+ * the README file *
+ * *
+ * This program is free software; you can redistribute it and/or modify *
+ * it under the terms of the GNU General Public License as published by *
+ * the Free Software Foundation; either version 2 of the License, or *
+ * (at your option) any later version. *
+ * *
+ * This program is distributed in the hope that it will be useful, *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
+ * GNU General Public License for more details. *
+ * *
+ * You should have received a copy of the GNU General Public License *
+ * along with this program; if not, write to the *
+ * Free Software Foundation, Inc., *
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
+ ***************************************************************************/
+
+
+#include <qfile.h>
+#include <qdir.h>
+#include <qstringlist.h>
+#include <qdict.h>
+#include <qapplication.h>
+
+#include <ctype.h>
+
+#include "kchmsearchengine_impl.h"
+#include "kchmmainwindow.h"
+#include "libchmfileimpl.h"
+
+
+namespace QtAs {
+
+// Those characters are splitters (i.e. split the word), but added themselves into dictionary too.
+// This makes the dictionary MUCH larger, but ensure that for the piece of "window->print" both
+// search for "print" and "->print" will find it.
+static const char SPLIT_CHARACTERS[] = "!()*&^%#@[]{}':;,.?/|/?<>\\-+=~`";
+
+// Those characters are parts of word - for example, '_' is here, and search for _debug will find only _debug.
+static const char WORD_CHARACTERS[] = "$_";
+
+
+int TermList::compareItems( QPtrCollection::Item i1, QPtrCollection::Item i2 )
+{
+ if( ( (Term*)i1 )->frequency == ( (Term*)i2 )->frequency )
+ return 0;
+ if( ( (Term*)i1 )->frequency < ( (Term*)i2 )->frequency )
+ return -1;
+ return 1;
+}
+
+QDataStream &operator>>( QDataStream &s, Document &l )
+{
+ s >> l.docNumber;
+ s >> l.frequency;
+ return s;
+}
+
+QDataStream &operator<<( QDataStream &s, const Document &l )
+{
+ s << (Q_INT16)l.docNumber;
+ s << (Q_INT16)l.frequency;
+ return s;
+}
+
+Index::Index( const QString &dp, const QString & )
+ : QObject( 0, 0 ), dict( 8999 ), docPath( dp )
+{
+ lastWindowClosed = false;
+ connect( qApp, SIGNAL( lastWindowClosed() ),
+ this, SLOT( setLastWinClosed() ) );
+}
+
+Index::Index( const QStringList &dl, const QString & )
+ : QObject( 0, 0 ), dict( 20011 )
+{
+ docList = dl;
+ lastWindowClosed = false;
+ connect( qApp, SIGNAL( lastWindowClosed() ),
+ this, SLOT( setLastWinClosed() ) );
+}
+
+void Index::setLastWinClosed()
+{
+ lastWindowClosed = true;
+}
+
+void Index::setDictionaryFile( const QString &f )
+{
+ dictFile = f;
+}
+
+void Index::setDocListFile( const QString &f )
+{
+ docListFile = f;
+}
+
+void Index::setDocList( const QStringList &lst )
+{
+ docList = lst;
+}
+
+bool Index::makeIndex()
+{
+ if ( docList.isEmpty() )
+ return false;
+
+ QStringList::Iterator it = docList.begin();
+ int steps = docList.count() / 100;
+
+ if ( !steps )
+ steps++;
+
+ int prog = 0;
+
+ for ( int i = 0; it != docList.end(); ++it, ++i )
+ {
+ if ( lastWindowClosed )
+ return false;
+
+ parseDocument( *it, i );
+
+ if ( i%steps == 0 )
+ {
+ prog++;
+ emit indexingProgress( prog );
+ }
+ }
+
+ return true;
+}
+
+
+void Index::insertInDict( const QString &str, int docNum )
+{
+ Entry *e = 0;
+ if ( dict.count() )
+ e = dict[ str ];
+
+ if ( e )
+ {
+ if ( e->documents.first().docNumber != docNum )
+ e->documents.prepend( Document( docNum, 1 ) );
+ else
+ e->documents.first().frequency++;
+ }
+ else
+ {
+ dict.insert( str, new Entry( docNum ) );
+ }
+}
+
+
+bool Index::parseDocumentToStringlist( const QString & filename, QStringList & tokenlist )
+{
+ QString parsedbuf, parseentity;
+ QString text;
+
+ if ( !::mainWindow->chmFile()->getFileContentAsString( &text, filename ) )
+ {
+ qWarning( "Index::parseDocument: Could not retrieve the content at %s", filename.ascii() );
+ return false;
+ }
+
+ if ( text.isNull() )
+ {
+ qWarning( "Index::parseDocument: Retrieved content for %s is empty", filename.ascii() );
+ return false;
+ }
+
+ m_charssplit = SPLIT_CHARACTERS;
+ m_charsword = WORD_CHARACTERS;
+
+ tokenlist.clear();
+
+ // State machine states
+ enum state_t
+ {
+ STATE_OUTSIDE_TAGS, // outside HTML tags; parse text
+ STATE_IN_HTML_TAG, // inside HTML tags; wait for end tag
+ STATE_IN_QUOTES, // inside HTML tags and inside quotes; wait for end quote (in var QuoteChar)
+ STATE_IN_HTML_ENTITY, // inside HTML entity; parse the entity
+ };
+
+ state_t state = STATE_OUTSIDE_TAGS;
+ QChar QuoteChar; // used in STATE_IN_QUOTES
+
+ for ( unsigned int j = 0; j < text.length(); j++ )
+ {
+ QChar ch = text[j];
+
+ if ( (j % 20000) == 0 )
+ qApp->processEvents( QEventLoop::ExcludeUserInput );
+
+ if ( state == STATE_IN_HTML_TAG )
+ {
+ // We are inside HTML tag.
+ // Ignore everything until we see '>' (end of HTML tag) or quote char (quote start)
+ if ( ch == '"' || ch == '\'' )
+ {
+ state = STATE_IN_QUOTES;
+ QuoteChar = ch;
+ }
+ else if ( ch == '>' )
+ state = STATE_OUTSIDE_TAGS;
+
+ continue;
+ }
+ else if ( state == STATE_IN_QUOTES )
+ {
+ // We are inside quoted text inside HTML tag.
+ // Ignore everything until we see the quote character again
+ if ( ch == QuoteChar )
+ state = STATE_IN_HTML_TAG;
+
+ continue;
+ }
+ else if ( state == STATE_IN_HTML_ENTITY )
+ {
+ // We are inside encoded HTML entity (like &nbsp;).
+ // Collect to parsedbuf everything until we see ;
+ if ( ch.isLetterOrNumber() )
+ {
+ // get next character of this entity
+ parseentity.append( ch );
+ continue;
+ }
+
+ // The entity ended
+ state = STATE_OUTSIDE_TAGS;
+
+ // Some shitty HTML does not terminate entities correctly. Screw it.
+ if ( ch != ';' && ch != '<' )
+ {
+ if ( parseentity.isEmpty() )
+ {
+ // straight '&' symbol. Add and continue.
+ parsedbuf += "&";
+ }
+ else
+ qWarning( "Index::parseDocument: incorrectly terminated HTML entity '&%s%c', ignoring", parseentity.ascii(), ch.latin1() );
+
+ j--; // parse this character again, but in different state
+ continue;
+ }
+
+ // Don't we have a space?
+ if ( parseentity.lower() != "nbsp" )
+ {
+ QString entity = ::mainWindow->chmFile()->impl()->decodeEntity( parseentity );
+
+ if ( entity.isNull() )
+ {
+ // decodeEntity() already printed error message
+ //qWarning( "Index::parseDocument: failed to decode entity &%s;", parsedbuf.ascii() );
+ continue;
+ }
+
+ parsedbuf += entity;
+ continue;
+ }
+ else
+ ch = ' '; // We got a space, so treat it like it, and not add it to parsebuf
+ }
+
+ //
+ // Now process STATE_OUTSIDE_TAGS
+ //
+
+ // Check for start of HTML tag, and switch to STATE_IN_HTML_TAG if it is
+ if ( ch == '<' )
+ {
+ state = STATE_IN_HTML_TAG;
+ goto tokenize_buf;
+ }
+
+ // Check for start of HTML entity
+ if ( ch == '&' )
+ {
+ state = STATE_IN_HTML_ENTITY;
+ parseentity = QString::null;
+ continue;
+ }
+
+ // Replace quote by ' - quotes are used in search window to set the phrase
+ if ( ch == '"' )
+ ch = '\'';
+
+ // Ok, we have a valid character outside HTML tags, and probably some in buffer already.
+ // If it is char or letter, add it and continue
+ if ( ch.isLetterOrNumber() || m_charsword.find( ch ) != -1 )
+ {
+ parsedbuf.append( ch );
+ continue;
+ }
+
+ // If it is a split char, add the word to the dictionary, and then add the char itself.
+ if ( m_charssplit.find( ch ) != -1 )
+ {
+ if ( !parsedbuf.isEmpty() )
+ tokenlist.push_back( parsedbuf.lower() );
+
+ tokenlist.push_back( ch.lower() );
+ parsedbuf = QString::null;
+ continue;
+ }
+
+tokenize_buf:
+ // Just add the word; it is most likely a space or terminated by tokenizer.
+ if ( !parsedbuf.isEmpty() )
+ {
+ tokenlist.push_back( parsedbuf.lower() );
+ parsedbuf = QString::null;
+ }
+ }
+
+ // Add the last word if still here - for broken htmls.
+ if ( !parsedbuf.isEmpty() )
+ tokenlist.push_back( parsedbuf.lower() );
+
+ return true;
+}
+
+
+void Index::parseDocument( const QString &filename, int docNum )
+{
+ QStringList terms;
+
+ if ( !parseDocumentToStringlist( filename, terms ) )
+ return;
+
+ for ( QStringList::Iterator it = terms.begin(); it != terms.end(); ++it )
+ insertInDict( *it, docNum );
+}
+
+
+void Index::writeDict()
+{
+ QDictIterator<Entry> it( dict );
+ QFile f( dictFile );
+
+ if ( !f.open( IO_WriteOnly ) )
+ {
+ qWarning( "Index::writeDict: could not write dictionary file %s", dictFile.ascii() );
+ return;
+ }
+
+ QDataStream s( &f );
+ s << (int) 1; // version
+ s << m_charssplit;
+ s << m_charsword;
+
+ for( ; it.current(); ++it )
+ {
+ Entry *e = it.current();
+ s << it.currentKey();
+ s << e->documents;
+ }
+
+ f.close();
+ writeDocumentList();
+}
+
+void Index::writeDocumentList()
+{
+ QFile f( docListFile );
+ if ( !f.open( IO_WriteOnly ) )
+ {
+ qWarning( "Index::writeDocumentList: could not write dictionary file %s", docListFile.ascii() );
+ return;
+ }
+ QDataStream s( &f );
+ s << docList;
+}
+
+bool Index::readDict()
+{
+ QFile f( dictFile );
+ if ( !f.open( IO_ReadOnly ) )
+ return false;
+
+ dict.clear();
+ QDataStream s( &f );
+ QString key;
+ int version;
+ QValueList<Document> docs;
+
+ s >> version;
+ s >> m_charssplit;
+ s >> m_charsword;
+
+ while ( !s.atEnd() )
+ {
+ s >> key;
+ s >> docs;
+ dict.insert( key, new Entry( docs ) );
+ }
+
+ f.close();
+ return dict.size() > 0 && readDocumentList();
+}
+
+bool Index::readDocumentList()
+{
+ QFile f( docListFile );
+ if ( !f.open( IO_ReadOnly ) )
+ return false;
+ QDataStream s( &f );
+ s >> docList;
+ return true;
+}
+
+QStringList Index::query( const QStringList &terms, const QStringList &termSeq, const QStringList &seqWords )
+{
+ TermList termList;
+
+ QStringList::ConstIterator it = terms.begin();
+ for ( it = terms.begin(); it != terms.end(); ++it )
+ {
+ Entry *e = 0;
+
+ if ( dict[ *it ] )
+ {
+ e = dict[ *it ];
+ termList.append( new Term( *it, e->documents.count(), e->documents ) );
+ }
+ else
+ {
+ return QStringList();
+ }
+ }
+
+ termList.sort();
+
+ Term *minTerm = termList.first();
+
+ if ( !termList.count() )
+ return QStringList();
+
+ termList.removeFirst();
+
+ QValueList<Document> minDocs = minTerm->documents;
+ QValueList<Document>::iterator C;
+ QValueList<Document>::ConstIterator It;
+ Term *t = termList.first();
+
+ for ( ; t; t = termList.next() )
+ {
+ QValueList<Document> docs = t->documents;
+ C = minDocs.begin();
+
+ while ( C != minDocs.end() )
+ {
+ bool found = false;
+
+ for ( It = docs.begin(); It != docs.end(); ++It )
+ {
+ if ( (*C).docNumber == (*It).docNumber )
+ {
+ (*C).frequency += (*It).frequency;
+ found = true;
+ break;
+ }
+ }
+
+ if ( !found )
+ C = minDocs.remove( C );
+ else
+ ++C;
+ }
+ }
+
+ QStringList results;
+ qHeapSort( minDocs );
+
+ if ( termSeq.isEmpty() )
+ {
+ for ( C = minDocs.begin(); C != minDocs.end(); ++C )
+ results << docList[ (int)(*C).docNumber ];
+
+ return results;
+ }
+
+ QString fileName;
+
+ for ( C = minDocs.begin(); C != minDocs.end(); ++C )
+ {
+ fileName = docList[ (int)(*C).docNumber ];
+
+ if ( searchForPhrases( termSeq, seqWords, fileName ) )
+ results << fileName;
+ }
+
+ return results;
+}
+
+
+bool Index::searchForPhrases( const QStringList &phrases, const QStringList &words, const QString &filename )
+{
+ QStringList parsed_document;
+
+ if ( !parseDocumentToStringlist( filename, parsed_document ) )
+ return false;
+
+ miniDict.clear();
+
+ // Initialize the dictionary with the words in phrase(s)
+ for ( QStringList::ConstIterator cIt = words.begin(); cIt != words.end(); ++cIt )
+ miniDict.insert( *cIt, new PosEntry( 0 ) );
+
+ // Fill the dictionary with the words from the document
+ unsigned int word_offset = 3;
+ for ( QStringList::ConstIterator it = parsed_document.begin(); it != parsed_document.end(); it++, word_offset++ )
+ {
+ PosEntry * entry = miniDict[ *it ];
+
+ if ( entry )
+ entry->positions.append( word_offset );
+ }
+
+ // Dump it
+/*
+ QDictIterator<PosEntry> it( miniDict );
+ for( ; it.current(); ++it )
+ {
+ QString text( it.currentKey() );
+ QValueList<uint> pos = miniDict[text]->positions;
+ for ( unsigned int i = 1; i < pos.size(); i++ )
+ text += " " + QString::number( pos[i] );
+
+ qDebug( "%s", text.ascii());
+ }
+*/
+
+ QValueList<uint> first_word_positions;
+
+ for ( QStringList::ConstIterator phrase_it = phrases.begin(); phrase_it != phrases.end(); phrase_it++ )
+ {
+ QStringList phrasewords = QStringList::split( ' ', *phrase_it );
+ first_word_positions = miniDict[ phrasewords[0] ]->positions;
+
+ for ( unsigned int j = 1; j < phrasewords.count(); ++j )
+ {
+ QValueList<uint> next_word_it = miniDict[ phrasewords[j] ]->positions;
+ QValueList<uint>::iterator dict_it = first_word_positions.begin();
+
+ while ( dict_it != first_word_positions.end() )
+ {
+ if ( next_word_it.find( *dict_it + 1 ) != next_word_it.end() )
+ {
+ (*dict_it)++;
+ ++dict_it;
+ }
+ else
+ dict_it = first_word_positions.remove( dict_it );
+ }
+ }
+ }
+
+ if ( first_word_positions.count() )
+ return true;
+
+ return false;
+}
+
+
+};