/***************************************************************************
 *   Copyright (C) 2004-2009 by Thomas Fischer                             *
 *   fischer@unix-ag.uni-kl.de                                             *
 *                                                                         *
 *   This program is free software; you can redistribute it and/or modify  *
 *   it under the terms of the GNU General Public License as published by  *
 *   the Free Software Foundation; either version 2 of the License, or     *
 *   (at your option) any later version.                                   *
 *                                                                         *
 *   This program is distributed in the hope that it will be useful,       *
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
 *   GNU General Public License for more details.                          *
 *                                                                         *
 *   You should have received a copy of the GNU General Public License     *
 *   along with this program; if not, write to the                         *
 *   Free Software Foundation, Inc.,                                       *
 *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
 ***************************************************************************/
#include <math.h>

#include <tqstring.h>
#include <tqstringlist.h>
#include <tqregexp.h>
#include <tqapplication.h>

#include <kdebug.h>
#include <tdelocale.h>
#include <tdemessagebox.h>
#include <kprogress.h>

#include "element.h"
#include "entry.h"
#include "file.h"
#include "preamble.h"
#include "macro.h"
#include "findduplicates.h"

#define max(a,b) ((a)>(b)?(a):(b))
#define min(a,b) ((a)<(b)?(a):(b))

namespace KBibTeX
{
    const unsigned int FindDuplicates::maxDistance = 0xffffff;

    FindDuplicates::FindDuplicates( DuplicateCliqueList &result, unsigned int sensitivity, BibTeX::File *file, TQWidget *parent )
            : TQObject( NULL, NULL ), m_doCancel( false )
    {
        if ( file->count() < 2 )
            return;

        int len = file->count() * ( file->count() - 1 ) / 2;
        unsigned int *distVector = new unsigned int[len];
        memset( distVector, 0xff, sizeof( unsigned int )*len );
        TQMap<BibTeX::Element*, int> mapElementToIndex;

        TQApplication::setOverrideCursor( TQt::waitCursor );
        KProgressDialog *progDlg = new KProgressDialog( parent, NULL, i18n( "Find Duplicates" ), i18n( "Searching for duplicates..." ), true );
        connect( progDlg, SIGNAL( cancelClicked() ), this, SLOT( slotCancel() ) );
        progDlg->progressBar()->setTotalSteps( len );

        determineDistances( file, distVector, mapElementToIndex, progDlg );
        progDlg->progressBar()->setValue( len );

        if ( !m_doCancel )
            buildClique( result, file, distVector, mapElementToIndex, sensitivity );

        delete progDlg;
        delete[] distVector;
        TQApplication::restoreOverrideCursor();
    }

    /**
     * Determine the distance between elements either from two different BibTeX
     * files (merging operation) or within the same file (find duplicates).
     * Inter-element distances will be written into the distance vector.
     * @param file BibTeX file
     * @param distVector inter-element distance vector
     * @param mapElementToIndex map from elements to indices (will be written)
     * @param progDlg progress dialog to write status information to
     */
    void FindDuplicates::determineDistances( BibTeX::File *file, unsigned int *distVector, TQMap<BibTeX::Element*, int> &mapElementToIndex, KProgressDialog *progDlg )
    {
        int progress = 0, i = 0;
        for ( BibTeX::File::ElementList::ConstIterator it1 = file->constBegin(); !m_doCancel && it1 != file->constEnd(); ++it1, ++i )
        {
            BibTeX::Entry *entryA = dynamic_cast<BibTeX::Entry*>( *it1 );
            if ( entryA != NULL )
            {
                mapElementToIndex.insert( entryA, i );

                int j = i + 1;
                for ( BibTeX::File::ElementList::ConstIterator it2 = ++BibTeX::File::ElementList::ConstIterator( it1 ); !m_doCancel && it2 != file->constEnd(); ++it2, ++j )
                {
                    BibTeX::Entry *entryB = dynamic_cast<BibTeX::Entry*>( *it2 );
                    if ( entryB == NULL ) continue;

                    unsigned int d = entryDistance( entryA, entryB );
                    distVector[arrayOffset( i, j )] = d;

                    progDlg->progressBar()->setValue( ++progress );
                    tqApp->processEvents();
                }
            }
            else
            {
                BibTeX::Macro *macroA = dynamic_cast<BibTeX::Macro*>( *it1 );
                if ( macroA != NULL )
                {
                    mapElementToIndex.insert( macroA, i );

                    int j = i + 1;
                    for ( BibTeX::File::ElementList::ConstIterator it2 = ++BibTeX::File::ElementList::ConstIterator( it1 ); !m_doCancel && it2 != file->constEnd(); ++it2, ++j )
                    {
                        BibTeX::Macro *macroB = dynamic_cast<BibTeX::Macro*>( *it2 );
                        if ( macroB == NULL ) continue;

                        distVector[arrayOffset( i, j )] = macroDistance( macroA, macroB );

                        progDlg->progressBar()->setValue( ++progress );
                        tqApp->processEvents();
                    }
                }
                else
                {
                    BibTeX::Preamble *preambleA = dynamic_cast<BibTeX::Preamble*>( *it1 );
                    if ( preambleA != NULL )
                    {
                        mapElementToIndex.insert( preambleA, i );

                        int j = i + 1;
                        for ( BibTeX::File::ElementList::ConstIterator it2 = ++BibTeX::File::ElementList::ConstIterator( it1 ); !m_doCancel && it2 != file->constEnd(); ++it2, ++j )
                        {
                            BibTeX::Preamble *preambleB = dynamic_cast<BibTeX::Preamble*>( *it2 );
                            if ( preambleB == NULL ) continue;

                            distVector[arrayOffset( i, j )] = preambleDistance( preambleA, preambleB );

                            progDlg->progressBar()->setValue( ++progress );
                            tqApp->processEvents();
                        }
                    }
                }
            }
        }
    }

    /**
     * Build a list of clique of BibTeX elements with a distance below the
     * sensitivity threshold. The list of cliques is added to the cliqueList
     * parameter.
     * @param cliqueList List of cliques found in this function
     * @param file BibTeX file
     * @param distVector inter-element distance vector
     * @param mapElementToIndex map from elements to indices
     * @param sensitivity sensitivity threshold value
     */
    void FindDuplicates::buildClique( DuplicateCliqueList &cliqueList, BibTeX::File *file, unsigned int *distVector, TQMap<BibTeX::Element*, int> &mapElementToIndex, unsigned int sensitivity )
    {
        int usedLen = file->count();
        bool* used = new bool[usedLen];
        memset( used, false, sizeof( bool ) * usedLen );
        TQValueList<BibTeX::Element*> queue;
        for ( BibTeX::File::ElementList::ConstIterator it1 = file->constBegin(); it1 != file->constEnd(); ++it1 )
        {
            /** current element must be either entry, preamble, or macro */
            BibTeX::Element *elem1 = dynamic_cast<BibTeX::Entry*>( *it1 );
            if ( elem1 == NULL )
                elem1 = dynamic_cast<BibTeX::Macro*>( *it1 );
            if ( elem1 == NULL )
                elem1 = dynamic_cast<BibTeX::Preamble*>( *it1 );
            /** skip element otherwise or if already used */
            if ( elem1 == NULL || used[mapElementToIndex[elem1]] ) continue;

            DuplicateClique clique;

            queue.clear();
            queue.append( elem1 );
            used[mapElementToIndex[elem1]] = true;

            while ( !queue.isEmpty() )
            {
                elem1 = *( queue.begin() );
                queue.remove( queue.begin() );
                int curIndex = mapElementToIndex[elem1];
                clique.append( elem1 );

                for ( BibTeX::File::ElementList::ConstIterator it2 = file->constBegin(); it2 != file->constEnd(); ++it2 )
                {
                    /** current element must be either entry, preamble, or macro */
                    BibTeX::Element *elem2 = dynamic_cast<BibTeX::Entry*>( *it2 );
                    int otherIndex=mapElementToIndex[elem2];
                    if ( elem2 == NULL )
                        elem2 = dynamic_cast<BibTeX::Macro*>( *it2 );
                    if ( elem2 == NULL )
                        elem2 = dynamic_cast<BibTeX::Preamble*>( *it2 );
                    /** skip element otherwise or if already used */
                    if ( elem2 == NULL || used[( otherIndex = mapElementToIndex[elem2] )] )
                        continue;

                    unsigned int distance = distVector[arrayOffset( curIndex, otherIndex )];

                    if ( distance <= sensitivity )
                    {
                        queue.append( elem2 );
                        used[otherIndex ] = true;
                    }
                }
            }

            if ( clique.size() > 1 )
                cliqueList.append( clique );
        }
        delete[] used;
    }

    /**
    * Distance between two BibTeX entries, scaled by maxDistance.
    */
    unsigned int FindDuplicates::entryDistance( BibTeX::Entry *entryA, BibTeX::Entry *entryB )
    {
        double titleValue = levenshteinDistance( extractTitle( entryA ), extractTitle( entryB ) );
        double authorValue = levenshteinDistance( authorsLastName( entryA ), authorsLastName( entryB ) );
        double yearValue = extractYear( entryA ) - extractYear( entryB );
        yearValue = min( 1.0, yearValue * yearValue / 100.0 );
        unsigned int distance = ( unsigned int )( maxDistance * ( titleValue * 0.6 + authorValue * 0.3 + yearValue * 0.1 ) );

        return distance;
    }

    /**
    * Distance between two BibTeX macros, scaled by maxDistance.
    */
    unsigned int FindDuplicates::macroDistance( BibTeX::Macro *macroA, BibTeX::Macro *macroB )
    {
        double keyValue = levenshteinDistance( extractMacroKey( macroA ), extractMacroKey( macroB ) );
        double valueValue = levenshteinDistance( extractMacroValue( macroA ), extractMacroValue( macroB ) );
        unsigned int distance = ( unsigned int )( maxDistance * ( keyValue * 0.7 + valueValue * 0.3 ) );

        return distance;
    }

    unsigned int FindDuplicates::preambleDistance( BibTeX::Preamble *preambleA, BibTeX::Preamble *preambleB )
    {
        return ( unsigned int )( maxDistance * levenshteinDistance( preambleA->value()->text(), preambleB->value()->text() ) );
    }

    FindDuplicates::~FindDuplicates()
    {
// nothing
    }

    /**
     * Determine the Levenshtein distance between two sentences,
     * where each sentence is in a string (not split into single words).
     * See also http://en.wikipedia.org/wiki/Levenshtein_distance
     * @param s first sentence
     * @param t second sentence
     * @return distance between both sentences
     */
    double FindDuplicates::levenshteinDistance( const TQString &s, const TQString &t )
    {
        const TQRegExp nonWordRegExp( "[^a-zA-Z']+" );
        if ( s == TQString::null || t == TQString::null ) return 1.0;
        return levenshteinDistance( TQStringList::split( nonWordRegExp, s ), TQStringList::split( nonWordRegExp, t ) );
    }

    /**
     * Determine the Levenshtein distance between two words.
     * See also http://en.wikipedia.org/wiki/Levenshtein_distance
     * @param s first word
     * @param t second word
     * @return distance between both words
     */
    double FindDuplicates::levenshteinDistanceWord( const TQString &s, const TQString &t )
    {
        TQString mys = s.lower(), myt = t.lower();
        int m = s.length(), n = t.length();
        if ( m < 1 && n < 1 ) return 0.0;
        if ( m < 1 || n < 1 ) return 1.0;

        int **d = new int*[m+1];
        for ( int i = 0; i <= m; ++i ) {d[i] = new int[n+1]; d[i][0] = i;}
        for ( int i = 0; i <= n; ++i ) d[0][i] = i;

        for ( int i = 1; i <= m;++i )
            for ( int j = 1; j <= n;++j )
            {
                d[i][j] = d[i-1][j] + 1;
                int c = d[i][j-1] + 1;
                if ( c < d[i][j] ) d[i][j] = c;
                c = d[i-1][j-1] + ( mys[i-1] == myt[j-1] ? 0 : 1 );
                if ( c < d[i][j] ) d[i][j] = c;
            }

        double result = d[m][n];
        for ( int i = 0; i <= m; ++i ) delete[] d[i];
        delete [] d;

        result = result / ( double )max( m, n );
        result *= result;
        return result;
    }

    /**
     * Determine the Levenshtein distance between two sentences (list of words).
     * See also http://en.wikipedia.org/wiki/Levenshtein_distance
     * @param s first sentence
     * @param t second sentence
     * @return distance between both sentences
     */
    double FindDuplicates::levenshteinDistance( const TQStringList &s, const TQStringList &t )
    {
        int m = s.size(), n = t.size();
        if ( m < 1 && n < 1 ) return 0.0;
        if ( m < 1 || n < 1 ) return 1.0;

        double **d = new double*[m+1];
        for ( int i = 0; i <= m; ++i ) {d[i] = new double[n+1]; d[i][0] = i;}
        for ( int i = 0; i <= n; ++i ) d[0][i] = i;

        for ( int i = 1; i <= m;++i )
            for ( int j = 1; j <= n;++j )
            {
                d[i][j] = d[i-1][j] + 1;
                double c = d[i][j-1] + 1;
                if ( c < d[i][j] ) d[i][j] = c;
                c = d[i-1][j-1] + levenshteinDistanceWord( s[i-1], t[j-1] );
                if ( c < d[i][j] ) d[i][j] = c;
            }

        double result = d[m][n];
        for ( int i = 0; i <= m; ++i ) delete[] d[i];
        delete [] d;

        result = result / ( double )max( m, n );

        return result;
    }

    /**
    * Linearize a two-dimensional triangle matrix
    */
    int FindDuplicates::arrayOffset( int a, int b )
    {
        if ( a == b )
            return -1;
        else if ( b < a )
        {
            int swap = a;
            a = b;
            b = swap;
        }

        return ( b * ( b - 1 ) / 2 + a );
    }

    /**
    * Determine title for a given entry
    */
    TQString FindDuplicates::extractTitle( BibTeX::Entry *entry )
    {
        /** retrieve field holding title information for entry */
        BibTeX::EntryField *field = entry->getField( BibTeX::EntryField::ftTitle );
        if ( field == NULL )
            return TQString::null; /** no title field available */

        /** *fetch value item holding title */
        BibTeX::ValueItem *valueItem = field->value()->items.isEmpty() ? NULL : field->value()->items.first();
        if ( valueItem == NULL )
            return TQString::null; /** no value item found or is empty */

        return valueItem->text(); // TODO: Perform some postprocessing?
    }

    /**
    * Determine list of authors for a given entry
    */
    TQStringList FindDuplicates::authorsLastName( BibTeX::Entry *entry )
    {
        TQStringList result;

        /** retrieve field holding authors information for entry */
        BibTeX::EntryField *field = entry->getField( BibTeX::EntryField::ftAuthor );
        if ( field == NULL )
            return result; /** no author field available */

        /** fetch container holding list of author names */
        BibTeX::PersonContainer *personContainer = field != NULL ? dynamic_cast<BibTeX::PersonContainer*>( field->value()->items.isEmpty() ? NULL : field->value()->items.first() ) : NULL;
        if ( personContainer == NULL || personContainer->persons.isEmpty() )
            return result; /** container not found or is empty */

        /** iterate through container and fetch each author's last name */
        for ( TQValueList<BibTeX::Person*>::ConstIterator it = personContainer->persons.begin(); it != personContainer->persons.end(); ++it )
            result.append(( *it )->lastName() );

        return result;
    }

    /**
    * Determine year for a given entry
    */
    int FindDuplicates::extractYear( BibTeX::Entry *entry )
    {
        /** retrieve field holding year information for entry */
        BibTeX::EntryField *field = entry->getField( BibTeX::EntryField::ftYear );
        if ( field == NULL )
            return -1; /** no year field available */

        /** *fetch value item holding year */
        BibTeX::ValueItem *valueItem = field != NULL ? ( field->value()->items.isEmpty() ? NULL : field->value()->items.first() ) : NULL;
        if ( valueItem == NULL )
            return -1; /** no value item found or is empty */

        /** parse value item's text */
        bool ok = FALSE;
        int year = TQString( valueItem->text() ).toInt( &ok );
        if ( !ok ) year = -1;

        return year;
    }

    /**
    * Determine key from a given macro
    */
    TQString FindDuplicates::extractMacroKey( BibTeX::Macro *macro )
    {
        return macro->key();
    }

    /**
    * Determine key from a given macro
    */
    TQString FindDuplicates::extractMacroValue( BibTeX::Macro *macro )
    {
        return macro->value()->text();
    }

    void FindDuplicates::slotCancel()
    {
        m_doCancel = true;
    }
}
#include "findduplicates.moc"