// Full-text search index over a set of HTML help documents.
//
// NOTE(review): this file was recovered from a copy in which line breaks and
// everything between angle brackets had been stripped (system header names,
// template arguments and the title-tag string literals). Those tokens were
// reconstructed from how the values are used in this file; please confirm
// against index.h and version control before merging. The spots to check are
// the five system headers below, the <TQString,Entry> / <Document> / <uint>
// template arguments, and the "<title>" literals in getDocumentTitle().

#include "index.h"

#include "kvi_file.h"
#include <tqdir.h>
#include <tqstringlist.h>
#include "kvi_pointerhashtable.h"

#include <tqapplication.h>
#include <ctype.h>
#include <tqtextstream.h>

// Three-way comparison of two terms by their frequency member:
// returns -1, 0 or 1 when p1 is less than, equal to or greater than p2.
// Presumably the hook used by TermList::sort() -- confirm in index.h.
int kvi_compare(const Term * p1,const Term * p2)
{
	if(p1->frequency == p2->frequency)
		return 0;
	if(p1->frequency < p2->frequency)
		return -1;
	return 1;
}

// Reads a Document (document number, then frequency) from a data stream.
TQDataStream &operator>>( TQDataStream &s, Document &l )
{
	s >> l.docNumber;
	s >> l.frequency;
	return s;
}

// Writes a Document to a data stream as two 16-bit integers
// (document number, then frequency).
TQDataStream &operator<<( TQDataStream &s, const Document &l )
{
	s << (TQ_INT16)l.docNumber;
	s << (TQ_INT16)l.frequency;
	return s;
}

// Creates an index whose document list will be built from the *.html files
// found in directory dp (see setupDocumentList()). hp is not used here.
Index::Index( const TQString &dp, const TQString &hp )
	: TQObject( 0, 0 ), dict( 8999 ), docPath( dp )
{
	alreadyHaveDocList = FALSE;
	lastWindowClosed = FALSE;
	connect( tqApp, TQT_SIGNAL( lastWindowClosed() ),
	         this, TQT_SLOT( setLastWinClosed() ) );
}

// Creates an index over an explicit list of document paths dl.
// hp is not used here.
Index::Index( const TQStringList &dl, const TQString &hp )
	: TQObject( 0, 0 ), dict( 8999 )
{
	docList = dl;
	alreadyHaveDocList = TRUE;
	lastWindowClosed = FALSE;
	connect( tqApp, TQT_SIGNAL( lastWindowClosed() ),
	         this, TQT_SLOT( setLastWinClosed() ) );
}

// Slot: records that the application's last window was closed so that a
// running makeIndex() stops at its next document.
void Index::setLastWinClosed()
{
	lastWindowClosed = TRUE;
}

// Sets the path of the file used by writeDict() / readDict().
void Index::setDictionaryFile( const TQString &f )
{
	dictFile = f;
}

// Sets the path of the file used by writeDocumentList() / readDocumentList().
void Index::setDocListFile( const TQString &f )
{
	docListFile = f;
}

// Rebuilds the dictionary by parsing every document in docList.
// Emits indexingProgress() roughly once per percent of the documents.
// Returns 1 when there is nothing to index, -1 when indexing was aborted
// because the last window was closed, and 0 on completion.
int Index::makeIndex()
{
	if ( !alreadyHaveDocList )
		setupDocumentList();
	if ( docList.isEmpty() )
		return 1;

	dict.clear();

	// Number of documents per progress step; never less than one so the
	// modulo below is well defined for short lists.
	int steps = docList.count() / 100;
	if ( !steps )
		steps++;

	int prog = 0;
	TQStringList::Iterator it = docList.begin();
	for ( int i = 0; it != docList.end(); ++it, ++i ) {
		if ( lastWindowClosed )
			return -1;
		parseDocument( *it, i );
		if ( i%steps == 0 ) {
			prog++;
			emit indexingProgress( prog );
		}
	}
	return 0;
}

// Fills docList with the *.html files of docPath and titleList with the
// title extracted from each of them, in the same order.
void Index::setupDocumentList()
{
	docList.clear();
	titleList.clear();

	TQDir d( docPath );
	TQString szCur;
	TQStringList lst = d.entryList( "*.html" );
	TQStringList::ConstIterator it = lst.begin();
	for ( ; it != lst.end(); ++it ) {
		szCur=docPath + "/" + *it;
		docList.append( szCur );
		titleList.append(getDocumentTitle( szCur ));
	}
}

// Records one occurrence of word str in document docNum.
// The words "amp" and "nbsp" are ignored.
void Index::insertInDict( const TQString &str, int docNum )
{
	if ( strcmp( str, "amp" ) == 0 || strcmp( str, "nbsp" ) == 0 )
		return;

	Entry *e = 0;
	if ( dict.count() )
		e = dict[ str ];

	if ( e ) {
		// The newest document is kept at the front of the list, so only
		// the first element has to be compared with docNum.
		if ( e->documents.first().docNumber != docNum )
			e->documents.prepend( Document( docNum, 1 ) );
		else
			e->documents.first().frequency++;
	} else {
		dict.insert( str, new Entry( docNum ) );
	}
}

// Tokenizes the file filename and adds every word to the dictionary under
// document number docNum. Text between '<' and '>' and between '&' and ';'
// is skipped. Words consist of letters, digits and '_', are lower-cased,
// must be at least two characters long and are cut at 63 characters.
void Index::parseDocument( const TQString &filename, int docNum )
{
	KviFile file( filename );
	if ( !file.openForReading() ) {
		tqWarning( "can not open file %s", filename.ascii() );
		return;
	}

	TQTextStream s( &file );
	TQString text = s.read();
	if (text.isNull())
		return;

	bool valid = TRUE;          // FALSE while inside a tag or an entity
	const TQChar *buf = text.unicode();
	TQChar str[64];             // current word
	int i = 0;                  // length of the current word

	// Index loop bounded by the text length: the buffer is never read at
	// or beyond text.length().
	const uint len = text.length();
	for ( uint j = 0; j < len; ++j ) {
		const TQChar c = buf[j];

		if ( c == '<' || c == '&' ) {
			// Start of a tag/entity terminates the current word.
			valid = FALSE;
			if ( i > 1 )
				insertInDict( TQString(str,i), docNum );
			i = 0;
			continue;
		}
		if ( !valid ) {
			if ( c == '>' || c == ';' )
				valid = TRUE;
			continue;
		}
		if ( ( c.isLetterOrNumber() || c == '_' ) && i < 63 ) {
			str[i] = c.lower();
			++i;
		} else {
			if ( i > 1 )
				insertInDict( TQString(str,i), docNum );
			i = 0;
		}
	}
	if ( i > 1 )
		insertInDict( TQString(str,i), docNum );

	file.close();
}

// Writes every dictionary entry (key followed by its document list) to
// dictFile, then writes the document list files.
// Does nothing when dictFile cannot be opened for writing.
void Index::writeDict()
{
	KviPointerHashTableIterator<TQString,Entry> it( dict );
	KviFile f( dictFile );
	if ( !f.openForWriting() )
		return;

	TQDataStream s( &f );
	for( ; it.current(); ++it ) {
		Entry *e = it.current();
		s << it.currentKey();
		s << e->documents;
	}
	f.close();

	writeDocumentList();
}

// Writes docList to docListFile and titleList to docListFile + ".titles",
// each joined with the separator "[#item#]".
void Index::writeDocumentList()
{
	KviFile f( docListFile );
	if ( !f.openForWriting() )
		return;
	TQTextStream s( &f );
	TQString docs = docList.join("[#item#]");
	s << docs;

	KviFile f1( docListFile+".titles" );
	if ( !f1.openForWriting() )
		return;
	TQTextStream s1( &f1 );
	docs = titleList.join("[#item#]");
	s1 << docs;
}

// Replaces the dictionary with the entries stored in dictFile, then reads
// the document list files. Does nothing when dictFile cannot be opened.
void Index::readDict()
{
	KviFile f( dictFile );
	if ( !f.openForReading() )
		return;

	dict.clear();
	TQDataStream s( &f );
	TQString key;
	KviValueList<Document> docs;
	while ( !s.atEnd() ) {
		s >> key;
		s >> docs;
		dict.insert( key, new Entry( docs ) );
	}
	f.close();

	readDocumentList();
}

// Loads docList from docListFile and titleList from docListFile + ".titles",
// splitting on the separator "[#item#]".
void Index::readDocumentList()
{
	// reading docs
	KviFile f( docListFile );
	if ( !f.openForReading() )
		return;
	TQTextStream s( &f );
	docList = TQStringList::split("[#item#]",s.read());

	// reading titles
	KviFile f1( docListFile+".titles" );
	if ( !f1.openForReading() )
		return;
	TQTextStream s1( &f1 );
	titleList = TQStringList::split("[#item#]",s1.read());
	// tqDebug(titleList);
}

// Returns the paths of the documents that contain every term in terms.
// A term containing '*' is expanded against the dictionary keys. When a
// non-wildcard term is unknown the result is empty. When termSeq is not
// empty, each candidate document must additionally pass
// searchForPattern( termSeq, seqWords, ... ).
// Results are ordered by qHeapSort() over the matching Document values
// (the ordering itself is defined by Document, declared elsewhere).
TQStringList Index::query( const TQStringList &terms, const TQStringList &termSeq, const TQStringList &seqWords )
{
	TermList termList;

	TQStringList::ConstIterator it;
	for ( it = terms.begin(); it != terms.end(); ++it ) {
		if ( (*it).contains( '*' ) ) {
			KviValueList<Document> wcts = setupDummyTerm( getWildcardTerms( *it ) );
			termList.append( new Term( "dummy", wcts.count(), wcts ) );
		} else {
			Entry *e = dict[ *it ];
			if ( !e )
				return TQStringList();
			termList.append( new Term( *it, e->documents.count(), e->documents ) );
		}
	}

	if ( !termList.count() )
		return TQStringList();

	termList.sort();

	// Start from the first term after sorting and intersect with the rest.
	// The document list is copied BEFORE the term is removed from the list,
	// so the copy stays valid even if the list deletes removed items
	// (setupDummyTerm() uses the same order).
	Term *minTerm = termList.first();
	KviValueList<Document> minDocs = minTerm->documents;
	termList.removeFirst();

	KviValueList<Document>::iterator C;
	KviValueList<Document>::ConstIterator It;
	for ( Term *t = termList.first(); t; t = termList.next() ) {
		KviValueList<Document> docs = t->documents;
		C = minDocs.begin();
		while ( C != minDocs.end() ) {
			bool found = FALSE;
			for ( It = docs.begin(); It != docs.end(); ++It ) {
				if ( (*C).docNumber == (*It).docNumber ) {
					// Accumulate the frequency over all matching terms.
					(*C).frequency += (*It).frequency;
					found = TRUE;
					break;
				}
			}
			if ( !found )
				C = minDocs.remove( C );
			else
				++C;
		}
	}

	TQStringList results;
	qHeapSort( minDocs );

	if ( termSeq.isEmpty() ) {
		for ( C = minDocs.begin(); C != minDocs.end(); ++C )
			results << docList[ (int)(*C).docNumber ];
		return results;
	}

	TQString fileName;
	for ( C = minDocs.begin(); C != minDocs.end(); ++C ) {
		fileName = docList[ (int)(*C).docNumber ];
		if ( searchForPattern( termSeq, seqWords, fileName ) )
			results << fileName;
	}
	return results;
}

// Returns the text between the first <title> and the following </title>
// of file fileName (tags matched case-insensitively). Returns
// tr("Untitled") when either tag is missing or the title is empty, and
// fileName itself when the file cannot be opened.
TQString Index::getDocumentTitle( const TQString &fileName )
{
	KviFile file( fileName );
	if ( !file.openForReading() ) {
		tqWarning( "cannot open file %s", fileName.ascii() );
		return fileName;
	}

	TQTextStream s( &file );
	TQString text = s.read();

	// NOTE(review): the tag literals were empty in the recovered source;
	// "<title>" is restored from the "+ 7" offset that followed it.
	const int open = text.find( "<title>", 0, FALSE );
	if ( open == -1 )
		return tr("Untitled");

	const int start = open + 7;   // first character after "<title>"
	const int end = text.find( "</title>", start, FALSE );
	if ( end == -1 || end - start <= 0 )
		return tr("Untitled");

	return text.mid( start, end - start );
}

// Returns every dictionary key that matches the wildcard expression term,
// where '*' stands for any run of characters.
TQStringList Index::getWildcardTerms( const TQString &term )
{
	TQStringList lst;
	TQStringList terms = split( term );
	TQStringList::Iterator iter;

	KviPointerHashTableIterator<TQString,Entry> it( dict );
	for( ; it.current(); ++it ) {
		int index = 0;
		bool found = FALSE;
		TQString text( it.currentKey() );

		for ( iter = terms.begin(); iter != terms.end(); ++iter ) {
			if ( *iter == "*" ) {
				found = TRUE;
				continue;
			}
			// A leading literal part must match from the first character.
			if ( iter == terms.begin() && (*iter)[0] != text[0] ) {
				found = FALSE;
				break;
			}
			index = text.find( *iter, index );
			// A trailing literal part must sit at the very end of the key.
			if ( *iter == terms.last() && index != (int)text.length()-1 ) {
				index = text.findRev( *iter );
				if ( index != (int)text.length() - (int)(*iter).length() ) {
					found = FALSE;
					break;
				}
			}
			if ( index != -1 ) {
				found = TRUE;
				index += (*iter).length();
				continue;
			} else {
				found = FALSE;
				break;
			}
		}
		if ( found )
			lst << text;
	}
	return lst;
}

// Splits a wildcard expression into its literal parts and "*" tokens,
// e.g. "ab*cd" -> "ab", "*", "cd" and "*cd" -> "*", "cd".
// Consecutive '*' characters produce a single token.
TQStringList Index::split( const TQString &str )
{
	TQStringList lst;
	int j = 0;
	int i = str.find( '*', j );

	// A leading wildcard has no literal part in front of it, so the loop
	// below would drop it; emit it explicitly so that "*cd" is not treated
	// like the plain prefix "cd".
	if ( i == 0 )
		lst << "*";

	while ( i != -1 ) {
		if ( i > j ) {
			lst << str.mid( j, i - j );
			lst << "*";
		}
		j = i + 1;
		i = str.find( '*', j );
	}

	// Literal text after the last wildcard, if any.
	const TQString tail = str.mid( j );
	if ( tail.length() > 0 )
		lst << tail;

	return lst;
}

// Returns the union of the document lists of all given terms that exist in
// the dictionary (used for the expansion of a wildcard term).
// Returns an empty list when none of the terms is known.
KviValueList<Document> Index::setupDummyTerm( const TQStringList &terms )
{
	TermList termList;
	TQStringList::ConstIterator it = terms.begin();
	for ( ; it != terms.end(); ++it ) {
		Entry *e = dict[ *it ];
		if ( e )
			termList.append( new Term( *it, e->documents.count(), e->documents ) );
	}
	termList.sort();

	KviValueList<Document> maxList;
	if ( !termList.count() )
		return maxList;

	// Seed with the last term after sorting, then add the documents of the
	// other terms that are not in the list yet.
	maxList = termList.last()->documents;
	termList.removeLast();

	KviValueList<Document>::iterator docIt;
	for ( Term *t = termList.first(); t; t = termList.next() ) {
		KviValueList<Document> docs = t->documents;
		for ( docIt = docs.begin(); docIt != docs.end(); ++docIt ) {
			if ( maxList.findIndex( *docIt ) == -1 )
				maxList.append( *docIt );
		}
	}
	return maxList;
}

// Appends the current word position to the mini-dictionary entry of str,
// if there is one, and advances the word counter in either case.
void Index::buildMiniDict( const TQString &str )
{
	PosEntry *entry = miniDict[ str ];
	if ( entry )
		entry->positions.append( wordNum );
	++wordNum;
}

// Checks whether the words of a pattern occur consecutively in file fileName.
// words lists every word used by the patterns; each pattern is a string of
// space-separated words. The file is tokenized exactly as in
// parseDocument(). Returns FALSE when the file cannot be opened.
// NOTE(review): as in the original code, the return value reflects only the
// LAST pattern in patterns, because each pattern overwrites the candidate
// list -- confirm whether all patterns were meant to be required.
bool Index::searchForPattern( const TQStringList &patterns, const TQStringList &words, const TQString &fileName )
{
	KviFile file( fileName );
	if ( !file.openForReading() ) {
		tqWarning( "cannot open file %s", fileName.ascii() );
		return FALSE;
	}

	// Word positions are counted from 3; every entry is created with an
	// initial position of 0.
	wordNum = 3;
	miniDict.clear();
	TQStringList::ConstIterator cIt = words.begin();
	for ( ; cIt != words.end(); ++cIt )
		miniDict.insert( *cIt, new PosEntry( 0 ) );

	TQTextStream s( &file );
	TQString text = s.read();

	bool valid = TRUE;          // FALSE while inside a tag or an entity
	const TQChar *buf = text.unicode();
	TQChar str[64];             // current word
	int i = 0;                  // length of the current word

	// Bounded index loop: the buffer is never read at or beyond
	// text.length(), which also covers a null or empty text.
	const uint len = text.length();
	for ( uint j = 0; j < len; ++j ) {
		const TQChar c = buf[j];

		if ( c == '<' || c == '&' ) {
			valid = FALSE;
			if ( i > 1 )
				buildMiniDict( TQString(str,i) );
			i = 0;
			continue;
		}
		if ( !valid ) {
			if ( c == '>' || c == ';' )
				valid = TRUE;
			continue;
		}
		if ( ( c.isLetterOrNumber() || c == '_' ) && i < 63 ) {
			str[i] = c.lower();
			++i;
		} else {
			if ( i > 1 )
				buildMiniDict( TQString(str,i) );
			i = 0;
		}
	}
	if ( i > 1 )
		buildMiniDict( TQString(str,i) );
	file.close();

	TQStringList wordLst;
	// NOTE(review): element type reconstructed; must match PosEntry::positions.
	KviValueList<uint> a, b;
	KviValueList<uint>::iterator aIt;

	TQStringList::ConstIterator patIt = patterns.begin();
	for ( ; patIt != patterns.end(); ++patIt ) {
		wordLst = TQStringList::split( ' ', *patIt );

		// A blank pattern, or one whose first word has no entry, cannot
		// match anything.
		PosEntry *head = wordLst.isEmpty() ? 0 : miniDict[ wordLst[0] ];
		if ( !head ) {
			a.clear();
			continue;
		}

		// a holds, for every candidate occurrence, the position of the
		// last word matched so far.
		a = head->positions;
		for ( int w = 1; w < (int)wordLst.count(); ++w ) {
			PosEntry *next = miniDict[ wordLst[w] ];
			if ( !next ) {
				// Word unknown to the mini-dictionary: no occurrence of
				// this pattern is possible.
				a.clear();
				break;
			}
			b = next->positions;
			aIt = a.begin();
			while ( aIt != a.end() ) {
				if ( b.find( *aIt + 1 ) != b.end() ) {
					(*aIt)++;
					++aIt;
				} else {
					aIt = a.remove( aIt );
				}
			}
		}
	}

	return a.count() > 0;
}

#include "index.moc"