/********************************************************************** ** Copyright (C) 2000-2008 Trolltech ASA. All rights reserved. ** ** This file is part of the TQt Assistant. ** ** This file may be used under the terms of the GNU General ** Public License versions 2.0 or 3.0 as published by the Free ** Software Foundation and appearing in the files LICENSE.GPL2 ** and LICENSE.GPL3 included in the packaging of this file. ** Alternatively you may (at your option) use any later version ** of the GNU General Public License if such license has been ** publicly approved by Trolltech ASA (or its successors, if any) ** and the KDE Free TQt Foundation. ** ** Please review the following information to ensure GNU General ** Public Licensing requirements will be met: ** http://trolltech.com/products/qt/licenses/licensing/opensource/. ** If you are unsure which license is appropriate for your use, please ** review the following information: ** http://trolltech.com/products/qt/licenses/licensing/licensingoverview ** or contact the sales department at sales@trolltech.com. ** ** Licensees holding valid TQt Commercial licenses may use this file in ** accordance with the TQt Commercial License Agreement provided with ** the Software. ** ** This file is provided "AS IS" with NO WARRANTY OF ANY KIND, ** INCLUDING THE WARRANTIES OF DESIGN, MERCHANTABILITY AND FITNESS FOR ** A PARTICULAR PURPOSE. Trolltech reserves all rights not granted ** herein. ** **********************************************************************/ #include "index.h" #include #include #include #include #include #include int TermList::compareItems( TQPtrCollection::Item i1, TQPtrCollection::Item i2 ) { if( ( (Term*)i1 )->frequency == ( (Term*)i2 )->frequency ) return 0; if( ( (Term*)i1 )->frequency < ( (Term*)i2 )->frequency ) return -1; return 1; } TQDataStream &operator>>( TQDataStream &s, Document &l ) { s >> l.docNumber; s >> l.frequency; return s; } TQDataStream &operator<<( TQDataStream &s, const Document &l ) { s << (TQ_INT16)l.docNumber; s << (TQ_INT16)l.frequency; return s; } Index::Index( const TQString &dp, const TQString &hp ) : TQObject( 0, 0 ), dict( 8999 ), docPath( dp ) { alreadyHaveDocList = FALSE; lastWindowClosed = FALSE; connect( tqApp, TQT_SIGNAL( lastWindowClosed() ), this, TQT_SLOT( setLastWinClosed() ) ); } Index::Index( const TQStringList &dl, const TQString &hp ) : TQObject( 0, 0 ), dict( 8999 ) { docList = dl; alreadyHaveDocList = TRUE; lastWindowClosed = FALSE; connect( tqApp, TQT_SIGNAL( lastWindowClosed() ), this, TQT_SLOT( setLastWinClosed() ) ); } void Index::setLastWinClosed() { lastWindowClosed = TRUE; } void Index::setDictionaryFile( const TQString &f ) { dictFile = f; } void Index::setDocListFile( const TQString &f ) { docListFile = f; } void Index::setDocList( const TQStringList &lst ) { docList = lst; } int Index::makeIndex() { if ( !alreadyHaveDocList ) setupDocumentList(); if ( docList.isEmpty() ) return 1; TQStringList::Iterator it = docList.begin(); int steps = docList.count() / 100; if ( !steps ) steps++; int prog = 0; for ( int i = 0; it != docList.end(); ++it, ++i ) { if ( lastWindowClosed ) { return -1; } parseDocument( *it, i ); if ( i%steps == 0 ) { prog++; emit indexingProgress( prog ); } } return 0; } void Index::setupDocumentList() { TQDir d( docPath ); TQStringList lst = d.entryList( "*.html" ); TQStringList::ConstIterator it = lst.begin(); for ( ; it != lst.end(); ++it ) docList.append( docPath + "/" + *it ); } void Index::insertInDict( const TQString &str, int docNum ) { if ( strcmp( str, "amp" ) == 0 || strcmp( str, "nbsp" ) == 0 ) return; Entry *e = 0; if ( dict.count() ) e = dict[ str ]; if ( e ) { if ( e->documents.first().docNumber != docNum ) e->documents.prepend( Document( docNum, 1 ) ); else e->documents.first().frequency++; } else { dict.insert( str, new Entry( docNum ) ); } } void Index::parseDocument( const TQString &filename, int docNum ) { TQFile file( filename ); if ( !file.open( IO_ReadOnly ) ) { qWarning( "can not open file " + filename ); return; } TQTextStream s( &file ); TQString text = s.read(); if (text.isNull()) return; bool valid = TRUE; const TQChar *buf = text.tqunicode(); TQChar str[64]; TQChar c = buf[0]; int j = 0; int i = 0; while ( (uint)j < text.length() ) { if ( c == '<' || c == '&' ) { valid = FALSE; if ( i > 1 ) insertInDict( TQString(str,i), docNum ); i = 0; c = buf[++j]; continue; } if ( ( c == '>' || c == ';' ) && !valid ) { valid = TRUE; c = buf[++j]; continue; } if ( !valid ) { c = buf[++j]; continue; } if ( ( c.isLetterOrNumber() || c == '_' ) && i < 63 ) { str[i] = c.lower(); ++i; } else { if ( i > 1 ) insertInDict( TQString(str,i), docNum ); i = 0; } c = buf[++j]; } if ( i > 1 ) insertInDict( TQString(str,i), docNum ); file.close(); } void Index::writeDict() { TQDictIterator it( dict ); TQFile f( dictFile ); if ( !f.open( IO_WriteOnly ) ) return; TQDataStream s( &f ); for( ; it.current(); ++it ) { Entry *e = it.current(); s << it.currentKey(); s << e->documents; } f.close(); writeDocumentList(); } void Index::writeDocumentList() { TQFile f( docListFile ); if ( !f.open( IO_WriteOnly ) ) return; TQDataStream s( &f ); s << docList; } void Index::readDict() { TQFile f( dictFile ); if ( !f.open( IO_ReadOnly ) ) return; dict.clear(); TQDataStream s( &f ); TQString key; TQValueList docs; while ( !s.atEnd() ) { s >> key; s >> docs; dict.insert( key, new Entry( docs ) ); } f.close(); readDocumentList(); } void Index::readDocumentList() { TQFile f( docListFile ); if ( !f.open( IO_ReadOnly ) ) return; TQDataStream s( &f ); s >> docList; } TQStringList Index::query( const TQStringList &terms, const TQStringList &termSeq, const TQStringList &seqWords ) { TermList termList; TQStringList::ConstIterator it = terms.begin(); for ( it = terms.begin(); it != terms.end(); ++it ) { Entry *e = 0; if ( (*it).tqcontains( '*' ) ) { TQValueList wcts = setupDummyTerm( getWildcardTerms( *it ) ); termList.append( new Term( "dummy", wcts.count(), wcts ) ); } else if ( dict[ *it ] ) { e = dict[ *it ]; termList.append( new Term( *it, e->documents.count(), e->documents ) ); } else { return TQStringList(); } } termList.sort(); Term *minTerm = termList.first(); if ( !termList.count() ) return TQStringList(); termList.removeFirst(); TQValueList minDocs = minTerm->documents; TQValueList::iterator C; TQValueList::ConstIterator It; Term *t = termList.first(); for ( ; t; t = termList.next() ) { TQValueList docs = t->documents; C = minDocs.begin(); while ( C != minDocs.end() ) { bool found = FALSE; for ( It = docs.begin(); It != docs.end(); ++It ) { if ( (*C).docNumber == (*It).docNumber ) { (*C).frequency += (*It).frequency; found = TRUE; break; } } if ( !found ) C = minDocs.remove( C ); else ++C; } } TQStringList results; qHeapSort( minDocs ); if ( termSeq.isEmpty() ) { for ( C = minDocs.begin(); C != minDocs.end(); ++C ) results << docList[ (int)(*C).docNumber ]; return results; } TQString fileName; for ( C = minDocs.begin(); C != minDocs.end(); ++C ) { fileName = docList[ (int)(*C).docNumber ]; if ( searchForPattern( termSeq, seqWords, fileName ) ) results << fileName; } return results; } TQString Index::getDocumentTitle( const TQString &fileName ) { TQFile file( fileName ); if ( !file.open( IO_ReadOnly ) ) { qWarning( "cannot open file " + fileName ); return fileName; } TQTextStream s( &file ); TQString text = s.read(); int start = text.tqfind( "", 0, FALSE ) + 7; int end = text.tqfind( "", 0, FALSE ); TQString title = ( end - start <= 0 ? tr("Untitled") : text.mid( start, end - start ) ); return title; } TQStringList Index::getWildcardTerms( const TQString &term ) { TQStringList lst; TQStringList terms = split( term ); TQValueList::iterator iter; TQDictIterator it( dict ); for( ; it.current(); ++it ) { int index = 0; bool found = FALSE; TQString text( it.currentKey() ); for ( iter = terms.begin(); iter != terms.end(); ++iter ) { if ( *iter == "*" ) { found = TRUE; continue; } if ( iter == terms.begin() && (*iter)[0] != text[0] ) { found = FALSE; break; } index = text.tqfind( *iter, index ); if ( *iter == terms.last() && index != (int)text.length()-1 ) { index = text.tqfindRev( *iter ); if ( index != (int)text.length() - (int)(*iter).length() ) { found = FALSE; break; } } if ( index != -1 ) { found = TRUE; index += (*iter).length(); continue; } else { found = FALSE; break; } } if ( found ) lst << text; } return lst; } TQStringList Index::split( const TQString &str ) { TQStringList lst; int j = 0; int i = str.tqfind( '*', j ); while ( i != -1 ) { if ( i > j && i <= (int)str.length() ) { lst << str.mid( j, i - j ); lst << "*"; } j = i + 1; i = str.tqfind( '*', j ); } int l = str.length() - 1; if ( str.mid( j, l - j + 1 ).length() > 0 ) lst << str.mid( j, l - j + 1 ); return lst; } TQValueList Index::setupDummyTerm( const TQStringList &terms ) { TermList termList; TQStringList::ConstIterator it = terms.begin(); for ( ; it != terms.end(); ++it ) { Entry *e = 0; if ( dict[ *it ] ) { e = dict[ *it ]; termList.append( new Term( *it, e->documents.count(), e->documents ) ); } } termList.sort(); TQValueList maxList; if ( !termList.count() ) return maxList; maxList = termList.last()->documents; termList.removeLast(); TQValueList::iterator docIt; Term *t = termList.first(); while ( t ) { TQValueList docs = t->documents; for ( docIt = docs.begin(); docIt != docs.end(); ++docIt ) { if ( maxList.tqfindIndex( *docIt ) == -1 ) maxList.append( *docIt ); } t = termList.next(); } return maxList; } void Index::buildMiniDict( const TQString &str ) { if ( miniDict[ str ] ) miniDict[ str ]->positions.append( wordNum ); ++wordNum; } bool Index::searchForPattern( const TQStringList &patterns, const TQStringList &words, const TQString &fileName ) { TQFile file( fileName ); if ( !file.open( IO_ReadOnly ) ) { qWarning( "cannot open file " + fileName ); return FALSE; } wordNum = 3; miniDict.clear(); TQStringList::ConstIterator cIt = words.begin(); for ( ; cIt != words.end(); ++cIt ) miniDict.insert( *cIt, new PosEntry( 0 ) ); TQTextStream s( &file ); TQString text = s.read(); bool valid = TRUE; const TQChar *buf = text.tqunicode(); TQChar str[64]; TQChar c = buf[0]; int j = 0; int i = 0; while ( (uint)j < text.length() ) { if ( c == '<' || c == '&' ) { valid = FALSE; if ( i > 1 ) buildMiniDict( TQString(str,i) ); i = 0; c = buf[++j]; continue; } if ( ( c == '>' || c == ';' ) && !valid ) { valid = TRUE; c = buf[++j]; continue; } if ( !valid ) { c = buf[++j]; continue; } if ( ( c.isLetterOrNumber() || c == '_' ) && i < 63 ) { str[i] = c.lower(); ++i; } else { if ( i > 1 ) buildMiniDict( TQString(str,i) ); i = 0; } c = buf[++j]; } if ( i > 1 ) buildMiniDict( TQString(str,i) ); file.close(); TQStringList::ConstIterator patIt = patterns.begin(); TQStringList wordLst; TQValueList a, b; TQValueList::iterator aIt; for ( ; patIt != patterns.end(); ++patIt ) { wordLst = TQStringList::split( ' ', *patIt ); a = miniDict[ wordLst[0] ]->positions; for ( int j = 1; j < (int)wordLst.count(); ++j ) { b = miniDict[ wordLst[j] ]->positions; aIt = a.begin(); while ( aIt != a.end() ) { if ( b.tqfind( *aIt + 1 ) != b.end() ) { (*aIt)++; ++aIt; } else { aIt = a.remove( aIt ); } } } } if ( a.count() ) return TRUE; return FALSE; }