diff options
Diffstat (limited to 'src/webqueryciteseerx.cpp')
-rw-r--r-- | src/webqueryciteseerx.cpp | 318 |
1 files changed, 318 insertions, 0 deletions
diff --git a/src/webqueryciteseerx.cpp b/src/webqueryciteseerx.cpp new file mode 100644 index 0000000..d5d244e --- /dev/null +++ b/src/webqueryciteseerx.cpp @@ -0,0 +1,318 @@ +/*************************************************************************** + * Copyright (C) 2008 by Jacob Kanev <[email protected]>, * + * Thomas Fischer <[email protected]> * + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License * + * along with this program; if not, write to the * + * Free Software Foundation, Inc., * + * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * + ***************************************************************************/ +#include <qfile.h> +#include <qregexp.h> +#include <qbuffer.h> +#include <qspinbox.h> + +#include <klocale.h> +#include <klineedit.h> +#include <kmessagebox.h> +#include <kurl.h> +#include <kdebug.h> + +#include <fileimporterbibtex.h> +#include <encoderxml.h> +#include <settings.h> +#include "webqueryciteseerx.h" + +using BibTeX::Value; +using BibTeX::Entry; +using BibTeX::EntryField; + +namespace KBibTeX +{ + + //_______________________________________________________________________________________________________________ + // Construct widget + + WebQueryCiteSeerXWidget::WebQueryCiteSeerXWidget( QWidget *parent, const char *name ) + : WebQueryWidget( parent, name ) + { + init(); + + Settings *settings = Settings::self(); + QString value = settings->getWebQueryDefault( "CiteSeerX" ); + value = value == QString::null ? "" : value; + lineEditQuery->setText( value ); + slotTextChanged( value, true ); + } + + + //_______________________________________________________________________________________________________________ + // Construct + + WebQueryCiteSeerX::WebQueryCiteSeerX( QWidget* parent ) + : WebQuery( parent ), m_citeSeerXServer( "citeseerx.ist.psu.edu" ) + { + m_widget = new WebQueryCiteSeerXWidget( parent ); + } + + + //_______________________________________________________________________________________________________________ + // Destroy + + WebQueryCiteSeerX::~WebQueryCiteSeerX() + { + delete m_widget; + } + + + //_______________________________________________________________________________________________________________ + // GUI string + + QString WebQueryCiteSeerX::title() + { + return i18n( "CiteSeerX" ); + } + + + //_______________________________________________________________________________________________________________ + // GUI info + + QString WebQueryCiteSeerX::disclaimer() + { + return i18n( "About CiteSeerX" ); + } + + + //_______________________________________________________________________________________________________________ + // URL for disclaimer + + QString WebQueryCiteSeerX::disclaimerURL() + { + return "http://citeseerx.ist.psu.edu/about/site"; + } + + + //_______________________________________________________________________________________________________________ + // return pointer to widget + + WebQueryWidget *WebQueryCiteSeerX::widget() + { + return m_widget; + } + + + //_______________________________________________________________________________________________________________ + // user has pressed "Cancel" + + void WebQueryCiteSeerX::cancelQuery() + { + m_queryQueue.clear(); + } + + + //_______________________________________________________________________________________________________________ + // main function -- collects all queries for one search + + void WebQueryCiteSeerX::query() + { + // store CiteSeerX as future default + WebQuery::query(); + Settings *settings = Settings::self(); + settings->setWebQueryDefault( "CiteSeerX", m_widget->lineEditQuery->text() ); + + // read number of desired results from GUI + m_queryQueue.clear(); + m_desiredHits = m_widget->spinBoxMaxHits->value(); + // one for each entry, and one for each page of 10 links + setNumStages( m_desiredHits + ( m_desiredHits / 10 + 1 ) ); + + // prepare search term + QString searchTerm = m_widget->lineEditQuery->text().stripWhiteSpace().replace( '$', "" ); + QStringList queryWords = QStringList::split( QRegExp( "\\s+" ), searchTerm ); + + if ( searchTerm.isEmpty() || queryWords.size() == 0 ) + { + setEndSearch( WebQuery::statusInvalidQuery ); + return; + } + + // build query from search term + QString query; + + for ( uint i = 0; i < queryWords.size(); ++i ) + { + if ( i ) query += " AND "; + + query += queryWords[i]; + } + + query = query.replace( "%", "%25" ).replace( "+", "%2B" ).replace( " ", "%20" ).replace( "#", "%23" ).replace( "&", "%26" ).replace( "?", "%3F" ); + + // schedule jobs + DataRequest dr; + dr.url = KURL( QString( "http://citeseerx.ist.psu.edu/search?q=" ).append( query ).append( "&submit=Search&sort=rel" ) ); + dr.parser = &WebQueryCiteSeerX::parseSummaryPage; + m_queryQueue.push_back( dr ); + + // start job queue + nextJob(); + } + + + //_______________________________________________________________________________________________________________ + // process results from current job + + void WebQueryCiteSeerX::parseSummaryPage( const QString& data ) + { + // regexp. for finding paper entries (example: href="/viewdoc/summary;jsessionid=12345ABCD?doi=10.1.1.108.9937") + QRegExp paperXpr( "href=\"(/viewdoc/summary[^?]*\\?doi=[^\"]+)\"" ); + + // count paper results and schedule single paper URLs + + for ( int p = paperXpr.search( data ); p >= 0; p = paperXpr.search( data, p + paperXpr.matchedLength() ) ) + { + if ( ++m_receivedHits > m_desiredHits ) + break; + + DataRequest dr; + + dr.url = KURL( QString( "http://" ) + m_citeSeerXServer + paperXpr.cap( 1 ) ); + + dr.parser = &WebQueryCiteSeerX::parsePaperPage; + + m_queryQueue.push_back( dr ); + } + + // if we haven't reached the desired number of hits, schedule the next summary page + QRegExp nextSummaryXpr( "<a href=\"([^\"]+)\">Next 10" ); + + if ( m_receivedHits < m_desiredHits ) + if ( nextSummaryXpr.search( data ) >= 0 ) + { + DataRequest dr; + dr.url = KURL( QString( "http://" ) + m_citeSeerXServer + nextSummaryXpr.cap( 1 ).replace( "&", "&" ) ); + dr.parser = &WebQueryCiteSeerX::parseSummaryPage; + m_queryQueue.push_back( dr ); + } + } + + + //_______________________________________________________________________________________________________________ + // process the result of one single paper link + + void WebQueryCiteSeerX::parsePaperPage( const QString& data ) + { + // find type and id: @XXX{ YYY + QRegExp typeIdXpr( "@(.*)\\{(.*)," ); + typeIdXpr.setMinimal( true ); + typeIdXpr.search( data ); + QString typeStr = typeIdXpr.cap( 1 ); + QString id = typeIdXpr.cap( 2 ); + + // create entry + Entry *entry = new BibTeX::Entry( typeIdXpr.cap( 1 ), typeIdXpr.cap( 2 ) ); + + // find abstract: <..>Abstract:</..> <..> XXX </..> + parseForSingleExpression( "<[^<]+>Abstract:</[^<]+>\\s*<[^<]+>([^<]+)</[^<]+>", data, entry, BibTeX::EntryField::ftAbstract ); + + // find title: title = {XXX} + parseForSingleExpression( "title = \\{([^}]+)\\}", data, entry, BibTeX::EntryField::ftTitle ); + + // find author: author = {XXX} + parseForSingleExpression( "author = \\{([^}]+)\\}", data, entry, BibTeX::EntryField::ftAuthor ); + + // find year: year = {XXX} + parseForSingleExpression( "year = \\{([^}]+)\\}", data, entry, BibTeX::EntryField::ftYear ); + + // find journal: journal = {XXX} + parseForSingleExpression( "journal = \\{([^}]+)\\}", data, entry, BibTeX::EntryField::ftJournal ); + + // find pages: pages = {XXX} + parseForSingleExpression( "pages = \\{([^}]+)\\}", data, entry, BibTeX::EntryField::ftPages ); + + // publish what we've found + emit foundEntry( entry, false ); + } + + + //_______________________________________________________________________________________________________________ + // find single bibtex field in html page and add to entry + + void WebQueryCiteSeerX::parseForSingleExpression( QString description, const QString &data, Entry *entry, BibTeX::EntryField::FieldType type ) + { + // search, and add to entry if found + QRegExp xpr( description ); + + if ( xpr.search( data ) + 1 ) + { + EntryField *field = new EntryField( type ); + field->setValue( new Value( xpr.cap( 1 ), false ) ); + entry->addField( field ); + } + } + + + //_______________________________________________________________________________________________________________ + // read data from the job and start the current parser + + void WebQueryCiteSeerX::getData( KIO::Job *job ) + { + // advance GUI progress bar + enterNextStage(); + + if ( job && !job->error() && !m_aborted ) + { + + // read data + QBuffer data; + data.open( IO_WriteOnly ); + data.writeBlock( dynamic_cast<KIO::StoredTransferJob*>( job )->data() ); + data.close(); + data.open( IO_ReadOnly ); + QTextStream ts( &data ); + QString result = ts.read(); + data.close(); + + // hand the read data over to the parser + ( this->*m_currentParser )( result ); + } + + // proceed + nextJob(); + } + + + //_______________________________________________________________________________________________________________ + // call the next job + + void WebQueryCiteSeerX::nextJob() + { + // no more requests: finished + if ( !m_queryQueue.size() ) + { + setEndSearch( WebQuery::statusSuccess ); + m_receivedHits = 0; + } + // else: take the next request from queue and start it + else if ( !m_aborted ) + { + m_currentParser = m_queryQueue.front().parser; + KIO::Job *job = KIO::storedGet( m_queryQueue.front().url, FALSE, FALSE ); + m_queryQueue.pop_front(); + connect( job, SIGNAL( result( KIO::Job * ) ), this, SLOT( getData( KIO::Job * ) ) ); + } + } + +} + +#include "webqueryciteseerx.moc" |