diff options
Diffstat (limited to 'src/kchmsearchengine_impl.cpp')
-rw-r--r-- | src/kchmsearchengine_impl.cpp | 572 |
1 files changed, 572 insertions, 0 deletions
diff --git a/src/kchmsearchengine_impl.cpp b/src/kchmsearchengine_impl.cpp new file mode 100644 index 0000000..286832a --- /dev/null +++ b/src/kchmsearchengine_impl.cpp @@ -0,0 +1,572 @@ +/*************************************************************************** + * Copyright (C) 2004-2007 by Georgy Yunaev, [email protected] * + * Portions Copyright (C) 2000-2005 Trolltech AS. * + * Please do not use email address above for bug reports; see * + * the README file * + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License * + * along with this program; if not, write to the * + * Free Software Foundation, Inc., * + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. * + ***************************************************************************/ + + +#include <qfile.h> +#include <qdir.h> +#include <qstringlist.h> +#include <qdict.h> +#include <qapplication.h> + +#include <ctype.h> + +#include "kchmsearchengine_impl.h" +#include "kchmmainwindow.h" +#include "libchmfileimpl.h" + + +namespace QtAs { + +// Those characters are splitters (i.e. split the word), but added themselves into dictionary too. +// This makes the dictionary MUCH larger, but ensure that for the piece of "window->print" both +// search for "print" and "->print" will find it. +static const char SPLIT_CHARACTERS[] = "!()*&^%#@[]{}':;,.?/|/?<>\\-+=~`"; + +// Those characters are parts of word - for example, '_' is here, and search for _debug will find only _debug. +static const char WORD_CHARACTERS[] = "$_"; + + +int TermList::compareItems( QPtrCollection::Item i1, QPtrCollection::Item i2 ) +{ + if( ( (Term*)i1 )->frequency == ( (Term*)i2 )->frequency ) + return 0; + if( ( (Term*)i1 )->frequency < ( (Term*)i2 )->frequency ) + return -1; + return 1; +} + +QDataStream &operator>>( QDataStream &s, Document &l ) +{ + s >> l.docNumber; + s >> l.frequency; + return s; +} + +QDataStream &operator<<( QDataStream &s, const Document &l ) +{ + s << (Q_INT16)l.docNumber; + s << (Q_INT16)l.frequency; + return s; +} + +Index::Index( const QString &dp, const QString & ) + : QObject( 0, 0 ), dict( 8999 ), docPath( dp ) +{ + lastWindowClosed = false; + connect( qApp, SIGNAL( lastWindowClosed() ), + this, SLOT( setLastWinClosed() ) ); +} + +Index::Index( const QStringList &dl, const QString & ) + : QObject( 0, 0 ), dict( 20011 ) +{ + docList = dl; + lastWindowClosed = false; + connect( qApp, SIGNAL( lastWindowClosed() ), + this, SLOT( setLastWinClosed() ) ); +} + +void Index::setLastWinClosed() +{ + lastWindowClosed = true; +} + +void Index::setDictionaryFile( const QString &f ) +{ + dictFile = f; +} + +void Index::setDocListFile( const QString &f ) +{ + docListFile = f; +} + +void Index::setDocList( const QStringList &lst ) +{ + docList = lst; +} + +bool Index::makeIndex() +{ + if ( docList.isEmpty() ) + return false; + + QStringList::Iterator it = docList.begin(); + int steps = docList.count() / 100; + + if ( !steps ) + steps++; + + int prog = 0; + + for ( int i = 0; it != docList.end(); ++it, ++i ) + { + if ( lastWindowClosed ) + return false; + + parseDocument( *it, i ); + + if ( i%steps == 0 ) + { + prog++; + emit indexingProgress( prog ); + } + } + + return true; +} + + +void Index::insertInDict( const QString &str, int docNum ) +{ + Entry *e = 0; + if ( dict.count() ) + e = dict[ str ]; + + if ( e ) + { + if ( e->documents.first().docNumber != docNum ) + e->documents.prepend( Document( docNum, 1 ) ); + else + e->documents.first().frequency++; + } + else + { + dict.insert( str, new Entry( docNum ) ); + } +} + + +bool Index::parseDocumentToStringlist( const QString & filename, QStringList & tokenlist ) +{ + QString parsedbuf, parseentity; + QString text; + + if ( !::mainWindow->chmFile()->getFileContentAsString( &text, filename ) ) + { + qWarning( "Index::parseDocument: Could not retrieve the content at %s", filename.ascii() ); + return false; + } + + if ( text.isNull() ) + { + qWarning( "Index::parseDocument: Retrieved content for %s is empty", filename.ascii() ); + return false; + } + + m_charssplit = SPLIT_CHARACTERS; + m_charsword = WORD_CHARACTERS; + + tokenlist.clear(); + + // State machine states + enum state_t + { + STATE_OUTSIDE_TAGS, // outside HTML tags; parse text + STATE_IN_HTML_TAG, // inside HTML tags; wait for end tag + STATE_IN_QUOTES, // inside HTML tags and inside quotes; wait for end quote (in var QuoteChar) + STATE_IN_HTML_ENTITY, // inside HTML entity; parse the entity + }; + + state_t state = STATE_OUTSIDE_TAGS; + QChar QuoteChar; // used in STATE_IN_QUOTES + + for ( unsigned int j = 0; j < text.length(); j++ ) + { + QChar ch = text[j]; + + if ( (j % 20000) == 0 ) + qApp->processEvents( QEventLoop::ExcludeUserInput ); + + if ( state == STATE_IN_HTML_TAG ) + { + // We are inside HTML tag. + // Ignore everything until we see '>' (end of HTML tag) or quote char (quote start) + if ( ch == '"' || ch == '\'' ) + { + state = STATE_IN_QUOTES; + QuoteChar = ch; + } + else if ( ch == '>' ) + state = STATE_OUTSIDE_TAGS; + + continue; + } + else if ( state == STATE_IN_QUOTES ) + { + // We are inside quoted text inside HTML tag. + // Ignore everything until we see the quote character again + if ( ch == QuoteChar ) + state = STATE_IN_HTML_TAG; + + continue; + } + else if ( state == STATE_IN_HTML_ENTITY ) + { + // We are inside encoded HTML entity (like ). + // Collect to parsedbuf everything until we see ; + if ( ch.isLetterOrNumber() ) + { + // get next character of this entity + parseentity.append( ch ); + continue; + } + + // The entity ended + state = STATE_OUTSIDE_TAGS; + + // Some shitty HTML does not terminate entities correctly. Screw it. + if ( ch != ';' && ch != '<' ) + { + if ( parseentity.isEmpty() ) + { + // straight '&' symbol. Add and continue. + parsedbuf += "&"; + } + else + qWarning( "Index::parseDocument: incorrectly terminated HTML entity '&%s%c', ignoring", parseentity.ascii(), ch.latin1() ); + + j--; // parse this character again, but in different state + continue; + } + + // Don't we have a space? + if ( parseentity.lower() != "nbsp" ) + { + QString entity = ::mainWindow->chmFile()->impl()->decodeEntity( parseentity ); + + if ( entity.isNull() ) + { + // decodeEntity() already printed error message + //qWarning( "Index::parseDocument: failed to decode entity &%s;", parsedbuf.ascii() ); + continue; + } + + parsedbuf += entity; + continue; + } + else + ch = ' '; // We got a space, so treat it like it, and not add it to parsebuf + } + + // + // Now process STATE_OUTSIDE_TAGS + // + + // Check for start of HTML tag, and switch to STATE_IN_HTML_TAG if it is + if ( ch == '<' ) + { + state = STATE_IN_HTML_TAG; + goto tokenize_buf; + } + + // Check for start of HTML entity + if ( ch == '&' ) + { + state = STATE_IN_HTML_ENTITY; + parseentity = QString::null; + continue; + } + + // Replace quote by ' - quotes are used in search window to set the phrase + if ( ch == '"' ) + ch = '\''; + + // Ok, we have a valid character outside HTML tags, and probably some in buffer already. + // If it is char or letter, add it and continue + if ( ch.isLetterOrNumber() || m_charsword.find( ch ) != -1 ) + { + parsedbuf.append( ch ); + continue; + } + + // If it is a split char, add the word to the dictionary, and then add the char itself. + if ( m_charssplit.find( ch ) != -1 ) + { + if ( !parsedbuf.isEmpty() ) + tokenlist.push_back( parsedbuf.lower() ); + + tokenlist.push_back( ch.lower() ); + parsedbuf = QString::null; + continue; + } + +tokenize_buf: + // Just add the word; it is most likely a space or terminated by tokenizer. + if ( !parsedbuf.isEmpty() ) + { + tokenlist.push_back( parsedbuf.lower() ); + parsedbuf = QString::null; + } + } + + // Add the last word if still here - for broken htmls. + if ( !parsedbuf.isEmpty() ) + tokenlist.push_back( parsedbuf.lower() ); + + return true; +} + + +void Index::parseDocument( const QString &filename, int docNum ) +{ + QStringList terms; + + if ( !parseDocumentToStringlist( filename, terms ) ) + return; + + for ( QStringList::Iterator it = terms.begin(); it != terms.end(); ++it ) + insertInDict( *it, docNum ); +} + + +void Index::writeDict() +{ + QDictIterator<Entry> it( dict ); + QFile f( dictFile ); + + if ( !f.open( IO_WriteOnly ) ) + { + qWarning( "Index::writeDict: could not write dictionary file %s", dictFile.ascii() ); + return; + } + + QDataStream s( &f ); + s << (int) 1; // version + s << m_charssplit; + s << m_charsword; + + for( ; it.current(); ++it ) + { + Entry *e = it.current(); + s << it.currentKey(); + s << e->documents; + } + + f.close(); + writeDocumentList(); +} + +void Index::writeDocumentList() +{ + QFile f( docListFile ); + if ( !f.open( IO_WriteOnly ) ) + { + qWarning( "Index::writeDocumentList: could not write dictionary file %s", docListFile.ascii() ); + return; + } + QDataStream s( &f ); + s << docList; +} + +bool Index::readDict() +{ + QFile f( dictFile ); + if ( !f.open( IO_ReadOnly ) ) + return false; + + dict.clear(); + QDataStream s( &f ); + QString key; + int version; + QValueList<Document> docs; + + s >> version; + s >> m_charssplit; + s >> m_charsword; + + while ( !s.atEnd() ) + { + s >> key; + s >> docs; + dict.insert( key, new Entry( docs ) ); + } + + f.close(); + return dict.size() > 0 && readDocumentList(); +} + +bool Index::readDocumentList() +{ + QFile f( docListFile ); + if ( !f.open( IO_ReadOnly ) ) + return false; + QDataStream s( &f ); + s >> docList; + return true; +} + +QStringList Index::query( const QStringList &terms, const QStringList &termSeq, const QStringList &seqWords ) +{ + TermList termList; + + QStringList::ConstIterator it = terms.begin(); + for ( it = terms.begin(); it != terms.end(); ++it ) + { + Entry *e = 0; + + if ( dict[ *it ] ) + { + e = dict[ *it ]; + termList.append( new Term( *it, e->documents.count(), e->documents ) ); + } + else + { + return QStringList(); + } + } + + termList.sort(); + + Term *minTerm = termList.first(); + + if ( !termList.count() ) + return QStringList(); + + termList.removeFirst(); + + QValueList<Document> minDocs = minTerm->documents; + QValueList<Document>::iterator C; + QValueList<Document>::ConstIterator It; + Term *t = termList.first(); + + for ( ; t; t = termList.next() ) + { + QValueList<Document> docs = t->documents; + C = minDocs.begin(); + + while ( C != minDocs.end() ) + { + bool found = false; + + for ( It = docs.begin(); It != docs.end(); ++It ) + { + if ( (*C).docNumber == (*It).docNumber ) + { + (*C).frequency += (*It).frequency; + found = true; + break; + } + } + + if ( !found ) + C = minDocs.remove( C ); + else + ++C; + } + } + + QStringList results; + qHeapSort( minDocs ); + + if ( termSeq.isEmpty() ) + { + for ( C = minDocs.begin(); C != minDocs.end(); ++C ) + results << docList[ (int)(*C).docNumber ]; + + return results; + } + + QString fileName; + + for ( C = minDocs.begin(); C != minDocs.end(); ++C ) + { + fileName = docList[ (int)(*C).docNumber ]; + + if ( searchForPhrases( termSeq, seqWords, fileName ) ) + results << fileName; + } + + return results; +} + + +bool Index::searchForPhrases( const QStringList &phrases, const QStringList &words, const QString &filename ) +{ + QStringList parsed_document; + + if ( !parseDocumentToStringlist( filename, parsed_document ) ) + return false; + + miniDict.clear(); + + // Initialize the dictionary with the words in phrase(s) + for ( QStringList::ConstIterator cIt = words.begin(); cIt != words.end(); ++cIt ) + miniDict.insert( *cIt, new PosEntry( 0 ) ); + + // Fill the dictionary with the words from the document + unsigned int word_offset = 3; + for ( QStringList::ConstIterator it = parsed_document.begin(); it != parsed_document.end(); it++, word_offset++ ) + { + PosEntry * entry = miniDict[ *it ]; + + if ( entry ) + entry->positions.append( word_offset ); + } + + // Dump it +/* + QDictIterator<PosEntry> it( miniDict ); + for( ; it.current(); ++it ) + { + QString text( it.currentKey() ); + QValueList<uint> pos = miniDict[text]->positions; + for ( unsigned int i = 1; i < pos.size(); i++ ) + text += " " + QString::number( pos[i] ); + + qDebug( "%s", text.ascii()); + } +*/ + + QValueList<uint> first_word_positions; + + for ( QStringList::ConstIterator phrase_it = phrases.begin(); phrase_it != phrases.end(); phrase_it++ ) + { + QStringList phrasewords = QStringList::split( ' ', *phrase_it ); + first_word_positions = miniDict[ phrasewords[0] ]->positions; + + for ( unsigned int j = 1; j < phrasewords.count(); ++j ) + { + QValueList<uint> next_word_it = miniDict[ phrasewords[j] ]->positions; + QValueList<uint>::iterator dict_it = first_word_positions.begin(); + + while ( dict_it != first_word_positions.end() ) + { + if ( next_word_it.find( *dict_it + 1 ) != next_word_it.end() ) + { + (*dict_it)++; + ++dict_it; + } + else + dict_it = first_word_positions.remove( dict_it ); + } + } + } + + if ( first_word_positions.count() ) + return true; + + return false; +} + + +}; |