diff options
Diffstat (limited to 'src/findduplicates.cpp')
-rw-r--r-- | src/findduplicates.cpp | 444 |
1 files changed, 444 insertions, 0 deletions
diff --git a/src/findduplicates.cpp b/src/findduplicates.cpp new file mode 100644 index 0000000..ddcf5a8 --- /dev/null +++ b/src/findduplicates.cpp @@ -0,0 +1,444 @@ +/*************************************************************************** + * Copyright (C) 2004-2009 by Thomas Fischer * + * [email protected] * + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License * + * along with this program; if not, write to the * + * Free Software Foundation, Inc., * + * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * + ***************************************************************************/ +#include <math.h> + +#include <qstring.h> +#include <qstringlist.h> +#include <qregexp.h> +#include <qapplication.h> + +#include <kdebug.h> +#include <klocale.h> +#include <kmessagebox.h> +#include <kprogress.h> + +#include <element.h> +#include <entry.h> +#include <file.h> +#include <preamble.h> +#include <macro.h> +#include "findduplicates.h" + +#define max(a,b) ((a)>(b)?(a):(b)) +#define min(a,b) ((a)<(b)?(a):(b)) + +namespace KBibTeX +{ + const unsigned int FindDuplicates::maxDistance = 0xffffff; + + FindDuplicates::FindDuplicates( DuplicateCliqueList &result, unsigned int sensitivity, BibTeX::File *file, QWidget *parent ) + : QObject( NULL, NULL ), m_doCancel( false ) + { + if ( file->count() < 2 ) + return; + + int len = file->count() * ( file->count() - 1 ) / 2; + unsigned int *distVector = new unsigned int[len]; + memset( distVector, 0xff, sizeof( unsigned int )*len ); + QMap<BibTeX::Element*, int> mapElementToIndex; + + QApplication::setOverrideCursor( Qt::waitCursor ); + KProgressDialog *progDlg = new KProgressDialog( parent, NULL, i18n( "Find Duplicates" ), i18n( "Searching for duplicates..." ), true ); + connect( progDlg, SIGNAL( cancelClicked() ), this, SLOT( slotCancel() ) ); + progDlg->progressBar()->setTotalSteps( len ); + + determineDistances( file, distVector, mapElementToIndex, progDlg ); + progDlg->progressBar()->setValue( len ); + + if ( !m_doCancel ) + buildClique( result, file, distVector, mapElementToIndex, sensitivity ); + + delete progDlg; + delete[] distVector; + QApplication::restoreOverrideCursor(); + } + + /** + * Determine the distance between elements either from two different BibTeX + * files (merging operation) or within the same file (find duplicates). + * Inter-element distances will be written into the distance vector. + * @param file BibTeX file + * @param distVector inter-element distance vector + * @param mapElementToIndex map from elements to indices (will be written) + * @param progDlg progress dialog to write status information to + */ + void FindDuplicates::determineDistances( BibTeX::File *file, unsigned int *distVector, QMap<BibTeX::Element*, int> &mapElementToIndex, KProgressDialog *progDlg ) + { + int progress = 0, i = 0; + for ( BibTeX::File::ElementList::ConstIterator it1 = file->constBegin(); !m_doCancel && it1 != file->constEnd(); ++it1, ++i ) + { + BibTeX::Entry *entryA = dynamic_cast<BibTeX::Entry*>( *it1 ); + if ( entryA != NULL ) + { + mapElementToIndex.insert( entryA, i ); + + int j = i + 1; + for ( BibTeX::File::ElementList::ConstIterator it2 = ++BibTeX::File::ElementList::ConstIterator( it1 ); !m_doCancel && it2 != file->constEnd(); ++it2, ++j ) + { + BibTeX::Entry *entryB = dynamic_cast<BibTeX::Entry*>( *it2 ); + if ( entryB == NULL ) continue; + + unsigned int d = entryDistance( entryA, entryB ); + distVector[arrayOffset( i, j )] = d; + + progDlg->progressBar()->setValue( ++progress ); + qApp->processEvents(); + } + } + else + { + BibTeX::Macro *macroA = dynamic_cast<BibTeX::Macro*>( *it1 ); + if ( macroA != NULL ) + { + mapElementToIndex.insert( macroA, i ); + + int j = i + 1; + for ( BibTeX::File::ElementList::ConstIterator it2 = ++BibTeX::File::ElementList::ConstIterator( it1 ); !m_doCancel && it2 != file->constEnd(); ++it2, ++j ) + { + BibTeX::Macro *macroB = dynamic_cast<BibTeX::Macro*>( *it2 ); + if ( macroB == NULL ) continue; + + distVector[arrayOffset( i, j )] = macroDistance( macroA, macroB ); + + progDlg->progressBar()->setValue( ++progress ); + qApp->processEvents(); + } + } + else + { + BibTeX::Preamble *preambleA = dynamic_cast<BibTeX::Preamble*>( *it1 ); + if ( preambleA != NULL ) + { + mapElementToIndex.insert( preambleA, i ); + + int j = i + 1; + for ( BibTeX::File::ElementList::ConstIterator it2 = ++BibTeX::File::ElementList::ConstIterator( it1 ); !m_doCancel && it2 != file->constEnd(); ++it2, ++j ) + { + BibTeX::Preamble *preambleB = dynamic_cast<BibTeX::Preamble*>( *it2 ); + if ( preambleB == NULL ) continue; + + distVector[arrayOffset( i, j )] = preambleDistance( preambleA, preambleB ); + + progDlg->progressBar()->setValue( ++progress ); + qApp->processEvents(); + } + } + } + } + } + } + + /** + * Build a list of clique of BibTeX elements with a distance below the + * sensitivity threshold. The list of cliques is added to the cliqueList + * parameter. + * @param cliqueList List of cliques found in this function + * @param file BibTeX file + * @param distVector inter-element distance vector + * @param mapElementToIndex map from elements to indices + * @param sensitivity sensitivity threshold value + */ + void FindDuplicates::buildClique( DuplicateCliqueList &cliqueList, BibTeX::File *file, unsigned int *distVector, QMap<BibTeX::Element*, int> &mapElementToIndex, unsigned int sensitivity ) + { + int usedLen = file->count(); + bool* used = new bool[usedLen]; + memset( used, false, sizeof( bool ) * usedLen ); + QValueList<BibTeX::Element*> queue; + for ( BibTeX::File::ElementList::ConstIterator it1 = file->constBegin(); it1 != file->constEnd(); ++it1 ) + { + /** current element must be either entry, preamble, or macro */ + BibTeX::Element *elem1 = dynamic_cast<BibTeX::Entry*>( *it1 ); + if ( elem1 == NULL ) + elem1 = dynamic_cast<BibTeX::Macro*>( *it1 ); + if ( elem1 == NULL ) + elem1 = dynamic_cast<BibTeX::Preamble*>( *it1 ); + /** skip element otherwise or if already used */ + if ( elem1 == NULL || used[mapElementToIndex[elem1]] ) continue; + + DuplicateClique clique; + + queue.clear(); + queue.append( elem1 ); + used[mapElementToIndex[elem1]] = true; + + while ( !queue.isEmpty() ) + { + elem1 = *( queue.begin() ); + queue.remove( queue.begin() ); + int curIndex = mapElementToIndex[elem1]; + clique.append( elem1 ); + + for ( BibTeX::File::ElementList::ConstIterator it2 = file->constBegin(); it2 != file->constEnd(); ++it2 ) + { + /** current element must be either entry, preamble, or macro */ + BibTeX::Element *elem2 = dynamic_cast<BibTeX::Entry*>( *it2 ); + int otherIndex=mapElementToIndex[elem2]; + if ( elem2 == NULL ) + elem2 = dynamic_cast<BibTeX::Macro*>( *it2 ); + if ( elem2 == NULL ) + elem2 = dynamic_cast<BibTeX::Preamble*>( *it2 ); + /** skip element otherwise or if already used */ + if ( elem2 == NULL || used[( otherIndex = mapElementToIndex[elem2] )] ) + continue; + + unsigned int distance = distVector[arrayOffset( curIndex, otherIndex )]; + + if ( distance <= sensitivity ) + { + queue.append( elem2 ); + used[otherIndex ] = true; + } + } + } + + if ( clique.size() > 1 ) + cliqueList.append( clique ); + } + delete[] used; + } + + /** + * Distance between two BibTeX entries, scaled by maxDistance. + */ + unsigned int FindDuplicates::entryDistance( BibTeX::Entry *entryA, BibTeX::Entry *entryB ) + { + double titleValue = levenshteinDistance( extractTitle( entryA ), extractTitle( entryB ) ); + double authorValue = levenshteinDistance( authorsLastName( entryA ), authorsLastName( entryB ) ); + double yearValue = extractYear( entryA ) - extractYear( entryB ); + yearValue = min( 1.0, yearValue * yearValue / 100.0 ); + unsigned int distance = ( unsigned int )( maxDistance * ( titleValue * 0.6 + authorValue * 0.3 + yearValue * 0.1 ) ); + + return distance; + } + + /** + * Distance between two BibTeX macros, scaled by maxDistance. + */ + unsigned int FindDuplicates::macroDistance( BibTeX::Macro *macroA, BibTeX::Macro *macroB ) + { + double keyValue = levenshteinDistance( extractMacroKey( macroA ), extractMacroKey( macroB ) ); + double valueValue = levenshteinDistance( extractMacroValue( macroA ), extractMacroValue( macroB ) ); + unsigned int distance = ( unsigned int )( maxDistance * ( keyValue * 0.7 + valueValue * 0.3 ) ); + + return distance; + } + + unsigned int FindDuplicates::preambleDistance( BibTeX::Preamble *preambleA, BibTeX::Preamble *preambleB ) + { + return ( unsigned int )( maxDistance * levenshteinDistance( preambleA->value()->text(), preambleB->value()->text() ) ); + } + + FindDuplicates::~FindDuplicates() + { +// nothing + } + + /** + * Determine the Levenshtein distance between two sentences, + * where each sentence is in a string (not split into single words). + * See also http://en.wikipedia.org/wiki/Levenshtein_distance + * @param s first sentence + * @param t second sentence + * @return distance between both sentences + */ + double FindDuplicates::levenshteinDistance( const QString &s, const QString &t ) + { + const QRegExp nonWordRegExp( "[^a-zA-Z']+" ); + if ( s == QString::null || t == QString::null ) return 1.0; + return levenshteinDistance( QStringList::split( nonWordRegExp, s ), QStringList::split( nonWordRegExp, t ) ); + } + + /** + * Determine the Levenshtein distance between two words. + * See also http://en.wikipedia.org/wiki/Levenshtein_distance + * @param s first word + * @param t second word + * @return distance between both words + */ + double FindDuplicates::levenshteinDistanceWord( const QString &s, const QString &t ) + { + QString mys = s.lower(), myt = t.lower(); + int m = s.length(), n = t.length(); + if ( m < 1 && n < 1 ) return 0.0; + if ( m < 1 || n < 1 ) return 1.0; + + int **d = new int*[m+1]; + for ( int i = 0; i <= m; ++i ) {d[i] = new int[n+1]; d[i][0] = i;} + for ( int i = 0; i <= n; ++i ) d[0][i] = i; + + for ( int i = 1; i <= m;++i ) + for ( int j = 1; j <= n;++j ) + { + d[i][j] = d[i-1][j] + 1; + int c = d[i][j-1] + 1; + if ( c < d[i][j] ) d[i][j] = c; + c = d[i-1][j-1] + ( mys[i-1] == myt[j-1] ? 0 : 1 ); + if ( c < d[i][j] ) d[i][j] = c; + } + + double result = d[m][n]; + for ( int i = 0; i <= m; ++i ) delete[] d[i]; + delete [] d; + + result = result / ( double )max( m, n ); + result *= result; + return result; + } + + /** + * Determine the Levenshtein distance between two sentences (list of words). + * See also http://en.wikipedia.org/wiki/Levenshtein_distance + * @param s first sentence + * @param t second sentence + * @return distance between both sentences + */ + double FindDuplicates::levenshteinDistance( const QStringList &s, const QStringList &t ) + { + int m = s.size(), n = t.size(); + if ( m < 1 && n < 1 ) return 0.0; + if ( m < 1 || n < 1 ) return 1.0; + + double **d = new double*[m+1]; + for ( int i = 0; i <= m; ++i ) {d[i] = new double[n+1]; d[i][0] = i;} + for ( int i = 0; i <= n; ++i ) d[0][i] = i; + + for ( int i = 1; i <= m;++i ) + for ( int j = 1; j <= n;++j ) + { + d[i][j] = d[i-1][j] + 1; + double c = d[i][j-1] + 1; + if ( c < d[i][j] ) d[i][j] = c; + c = d[i-1][j-1] + levenshteinDistanceWord( s[i-1], t[j-1] ); + if ( c < d[i][j] ) d[i][j] = c; + } + + double result = d[m][n]; + for ( int i = 0; i <= m; ++i ) delete[] d[i]; + delete [] d; + + result = result / ( double )max( m, n ); + + return result; + } + + /** + * Linearize a two-dimensional triangle matrix + */ + int FindDuplicates::arrayOffset( int a, int b ) + { + if ( a == b ) + return -1; + else if ( b < a ) + { + int swap = a; + a = b; + b = swap; + } + + return ( b * ( b - 1 ) / 2 + a ); + } + + /** + * Determine title for a given entry + */ + QString FindDuplicates::extractTitle( BibTeX::Entry *entry ) + { + /** retrieve field holding title information for entry */ + BibTeX::EntryField *field = entry->getField( BibTeX::EntryField::ftTitle ); + if ( field == NULL ) + return QString::null; /** no title field available */ + + /** *fetch value item holding title */ + BibTeX::ValueItem *valueItem = field->value()->items.isEmpty() ? NULL : field->value()->items.first(); + if ( valueItem == NULL ) + return QString::null; /** no value item found or is empty */ + + return valueItem->text(); // TODO: Perform some postprocessing? + } + + /** + * Determine list of authors for a given entry + */ + QStringList FindDuplicates::authorsLastName( BibTeX::Entry *entry ) + { + QStringList result; + + /** retrieve field holding authors information for entry */ + BibTeX::EntryField *field = entry->getField( BibTeX::EntryField::ftAuthor ); + if ( field == NULL ) + return result; /** no author field available */ + + /** fetch container holding list of author names */ + BibTeX::PersonContainer *personContainer = field != NULL ? dynamic_cast<BibTeX::PersonContainer*>( field->value()->items.isEmpty() ? NULL : field->value()->items.first() ) : NULL; + if ( personContainer == NULL || personContainer->persons.isEmpty() ) + return result; /** container not found or is empty */ + + /** iterate through container and fetch each author's last name */ + for ( QValueList<BibTeX::Person*>::ConstIterator it = personContainer->persons.begin(); it != personContainer->persons.end(); ++it ) + result.append(( *it )->lastName() ); + + return result; + } + + /** + * Determine year for a given entry + */ + int FindDuplicates::extractYear( BibTeX::Entry *entry ) + { + /** retrieve field holding year information for entry */ + BibTeX::EntryField *field = entry->getField( BibTeX::EntryField::ftYear ); + if ( field == NULL ) + return -1; /** no year field available */ + + /** *fetch value item holding year */ + BibTeX::ValueItem *valueItem = field != NULL ? ( field->value()->items.isEmpty() ? NULL : field->value()->items.first() ) : NULL; + if ( valueItem == NULL ) + return -1; /** no value item found or is empty */ + + /** parse value item's text */ + bool ok = FALSE; + int year = QString( valueItem->text() ).toInt( &ok ); + if ( !ok ) year = -1; + + return year; + } + + /** + * Determine key from a given macro + */ + QString FindDuplicates::extractMacroKey( BibTeX::Macro *macro ) + { + return macro->key(); + } + + /** + * Determine key from a given macro + */ + QString FindDuplicates::extractMacroValue( BibTeX::Macro *macro ) + { + return macro->value()->text(); + } + + void FindDuplicates::slotCancel() + { + m_doCancel = true; + } +} +#include "findduplicates.moc" |