summaryrefslogtreecommitdiffstats
path: root/src/webqueryciteseerx.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/webqueryciteseerx.cpp')
-rw-r--r--src/webqueryciteseerx.cpp318
1 files changed, 318 insertions, 0 deletions
diff --git a/src/webqueryciteseerx.cpp b/src/webqueryciteseerx.cpp
new file mode 100644
index 0000000..d5d244e
--- /dev/null
+++ b/src/webqueryciteseerx.cpp
@@ -0,0 +1,318 @@
+/***************************************************************************
+ * Copyright (C) 2008 by Jacob Kanev <[email protected]>, *
+ * Thomas Fischer <[email protected]> *
+ * *
+ * This program is free software; you can redistribute it and/or modify *
+ * it under the terms of the GNU General Public License as published by *
+ * the Free Software Foundation; either version 2 of the License, or *
+ * (at your option) any later version. *
+ * *
+ * This program is distributed in the hope that it will be useful, *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
+ * GNU General Public License for more details. *
+ * *
+ * You should have received a copy of the GNU General Public License *
+ * along with this program; if not, write to the *
+ * Free Software Foundation, Inc., *
+ * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
+ ***************************************************************************/
+#include <qfile.h>
+#include <qregexp.h>
+#include <qbuffer.h>
+#include <qspinbox.h>
+
+#include <klocale.h>
+#include <klineedit.h>
+#include <kmessagebox.h>
+#include <kurl.h>
+#include <kdebug.h>
+
+#include <fileimporterbibtex.h>
+#include <encoderxml.h>
+#include <settings.h>
+#include "webqueryciteseerx.h"
+
+using BibTeX::Value;
+using BibTeX::Entry;
+using BibTeX::EntryField;
+
+namespace KBibTeX
+{
+
+ //_______________________________________________________________________________________________________________
+ // Construct widget
+
+ // Sets up the query widget and pre-fills the query line with the search
+ // term stored in the settings under the key "CiteSeerX".
+ WebQueryCiteSeerXWidget::WebQueryCiteSeerXWidget( QWidget *parent, const char *name )
+     : WebQueryWidget( parent, name )
+ {
+     init();
+
+     // a null string coming from the settings is shown as an empty query
+     QString storedQuery = Settings::self()->getWebQueryDefault( "CiteSeerX" );
+     if ( storedQuery == QString::null )
+         storedQuery = "";
+
+     lineEditQuery->setText( storedQuery );
+     slotTextChanged( storedQuery, true );
+ }
+
+
+ //_______________________________________________________________________________________________________________
+ // Construct
+
+ // Creates the search engine object together with its query widget.
+ // parent: widget handed to both the WebQuery base class and the query widget.
+ WebQueryCiteSeerX::WebQueryCiteSeerX( QWidget* parent )
+     : WebQuery( parent ), m_citeSeerXServer( "citeseerx.ist.psu.edu" )
+ {
+     // parseSummaryPage() evaluates "++m_receivedHits > m_desiredHits", so both
+     // counters need a defined value before the first search is started
+     m_desiredHits = 0;
+     m_receivedHits = 0;
+
+     m_widget = new WebQueryCiteSeerXWidget( parent );
+ }
+
+
+ //_______________________________________________________________________________________________________________
+ // Destroy
+
+ // Deletes the query widget that was allocated in the constructor.
+ WebQueryCiteSeerX::~WebQueryCiteSeerX()
+ {
+ delete m_widget;
+ }
+
+
+ //_______________________________________________________________________________________________________________
+ // GUI string
+
+ // Returns the name of this search engine, passed through i18n().
+ QString WebQueryCiteSeerX::title()
+ {
+     const QString engineName = i18n( "CiteSeerX" );
+     return engineName;
+ }
+
+
+ //_______________________________________________________________________________________________________________
+ // GUI info
+
+ // Returns the label text for the disclaimer, passed through i18n().
+ QString WebQueryCiteSeerX::disclaimer()
+ {
+     const QString disclaimerLabel = i18n( "About CiteSeerX" );
+     return disclaimerLabel;
+ }
+
+
+ //_______________________________________________________________________________________________________________
+ // URL for disclaimer
+
+ // Returns the address of the CiteSeerX "about" page belonging to the disclaimer.
+ QString WebQueryCiteSeerX::disclaimerURL()
+ {
+     return QString( "http://citeseerx.ist.psu.edu/about/site" );
+ }
+
+
+ //_______________________________________________________________________________________________________________
+ // return pointer to widget
+
+ // Gives access to the query widget created in the constructor.
+ WebQueryWidget *WebQueryCiteSeerX::widget()
+ {
+     WebQueryWidget *queryWidget = m_widget;
+     return queryWidget;
+ }
+
+
+ //_______________________________________________________________________________________________________________
+ // user has pressed "Cancel"
+
+ // Drops all requests that are still waiting in the queue, so that nextJob()
+ // finds nothing left to start.
+ void WebQueryCiteSeerX::cancelQuery()
+ {
+ m_queryQueue.clear();
+ }
+
+
+ //_______________________________________________________________________________________________________________
+ // main function -- collects all queries for one search
+
+ // Starts a new search: remembers the search term as future default, builds
+ // the request for the first summary page and kicks off the job queue.
+ // Ends the search with statusInvalidQuery if the search term contains no words.
+ void WebQueryCiteSeerX::query()
+ {
+     // store CiteSeerX as future default
+     WebQuery::query();
+     Settings::self()->setWebQueryDefault( "CiteSeerX", m_widget->lineEditQuery->text() );
+
+     // begin with a clean state; the hit counter must not carry over a value
+     // from an earlier search (or be read uninitialised on the first search)
+     m_queryQueue.clear();
+     m_receivedHits = 0;
+
+     // read number of desired results from GUI
+     m_desiredHits = m_widget->spinBoxMaxHits->value();
+     // one stage for each entry, and one for each page of 10 links
+     setNumStages( m_desiredHits + ( m_desiredHits / 10 + 1 ) );
+
+     // prepare search term: trim it, drop '$' and split at whitespace
+     QString searchTerm = m_widget->lineEditQuery->text().stripWhiteSpace().replace( '$', "" );
+     QStringList queryWords = QStringList::split( QRegExp( "\\s+" ), searchTerm );
+
+     if ( searchTerm.isEmpty() || queryWords.isEmpty() )
+     {
+         setEndSearch( WebQuery::statusInvalidQuery );
+         return;
+     }
+
+     // build query from search term: all words are combined with AND
+     QString query = queryWords.join( " AND " );
+
+     // escape characters with a special meaning in URLs; '%' has to be
+     // replaced first, as the other replacements introduce new '%' characters
+     query = query.replace( "%", "%25" ).replace( "+", "%2B" ).replace( " ", "%20" ).replace( "#", "%23" ).replace( "&", "%26" ).replace( "?", "%3F" );
+
+     // schedule the first summary page, using the same server name as the
+     // follow-up requests created in parseSummaryPage()
+     DataRequest dr;
+     dr.url = KURL( QString( "http://" ) + m_citeSeerXServer + "/search?q=" + query + "&submit=Search&sort=rel" );
+     dr.parser = &WebQueryCiteSeerX::parseSummaryPage;
+     m_queryQueue.push_back( dr );
+
+     // start job queue
+     nextJob();
+ }
+
+
+ //_______________________________________________________________________________________________________________
+ // process results from current job
+
+ // Parser for a page of search results: queues one request per paper link
+ // found in the page and, while more hits are wanted, a request for the
+ // following page of results.
+ // data: text of the downloaded summary page
+ void WebQueryCiteSeerX::parseSummaryPage( const QString& data )
+ {
+     // links to single papers look like
+     // href="/viewdoc/summary;jsessionid=12345ABCD?doi=10.1.1.108.9937"
+     QRegExp paperXpr( "href=\"(/viewdoc/summary[^?]*\\?doi=[^\"]+)\"" );
+
+     // walk through all paper links, counting every one of them as a hit
+     int pos = paperXpr.search( data );
+     while ( pos >= 0 )
+     {
+         // enough hits collected: ignore the remaining links
+         if ( ++m_receivedHits > m_desiredHits )
+             break;
+
+         DataRequest paperRequest;
+         paperRequest.url = KURL( QString( "http://" ) + m_citeSeerXServer + paperXpr.cap( 1 ) );
+         paperRequest.parser = &WebQueryCiteSeerX::parsePaperPage;
+         m_queryQueue.push_back( paperRequest );
+
+         pos = paperXpr.search( data, pos + paperXpr.matchedLength() );
+     }
+
+     // link to the following page of results
+     QRegExp nextSummaryXpr( "<a href=\"([^\"]+)\">Next 10" );
+
+     // still short of the desired number of hits: queue the next summary page
+     if ( m_receivedHits < m_desiredHits && nextSummaryXpr.search( data ) >= 0 )
+     {
+         DataRequest summaryRequest;
+         summaryRequest.url = KURL( QString( "http://" ) + m_citeSeerXServer + nextSummaryXpr.cap( 1 ).replace( "&amp;", "&" ) );
+         summaryRequest.parser = &WebQueryCiteSeerX::parseSummaryPage;
+         m_queryQueue.push_back( summaryRequest );
+     }
+ }
+
+
+ //_______________________________________________________________________________________________________________
+ // process the result of one single paper link
+
+ // Parser for the page of a single paper: extracts the BibTeX record shown in
+ // the page (plus the abstract) and emits it through foundEntry().
+ // Pages in which no "@type{id," header can be found are skipped.
+ // data: text of the downloaded paper page
+ void WebQueryCiteSeerX::parsePaperPage( const QString& data )
+ {
+     // find type and id: @XXX{ YYY,
+     // NOTE(review): the non-greedy match starts at the first '@' in the page,
+     // which is presumably the BibTeX record -- confirm against real pages
+     QRegExp typeIdXpr( "@(.*)\\{(.*)," );
+     typeIdXpr.setMinimal( true );
+
+     // without a BibTeX header there is nothing meaningful to report; do not
+     // emit an entry with empty type and id
+     if ( typeIdXpr.search( data ) < 0 )
+         return;
+
+     const QString typeStr = typeIdXpr.cap( 1 );
+     const QString id = typeIdXpr.cap( 2 );
+
+     // create entry
+     Entry *entry = new BibTeX::Entry( typeStr, id );
+
+     // find abstract: <..>Abstract:</..> <..> XXX </..>
+     parseForSingleExpression( "<[^<]+>Abstract:</[^<]+>\\s*<[^<]+>([^<]+)</[^<]+>", data, entry, BibTeX::EntryField::ftAbstract );
+
+     // find title: title = {XXX}
+     parseForSingleExpression( "title = \\{([^}]+)\\}", data, entry, BibTeX::EntryField::ftTitle );
+
+     // find author: author = {XXX}
+     parseForSingleExpression( "author = \\{([^}]+)\\}", data, entry, BibTeX::EntryField::ftAuthor );
+
+     // find year: year = {XXX}
+     parseForSingleExpression( "year = \\{([^}]+)\\}", data, entry, BibTeX::EntryField::ftYear );
+
+     // find journal: journal = {XXX}
+     parseForSingleExpression( "journal = \\{([^}]+)\\}", data, entry, BibTeX::EntryField::ftJournal );
+
+     // find pages: pages = {XXX}
+     parseForSingleExpression( "pages = \\{([^}]+)\\}", data, entry, BibTeX::EntryField::ftPages );
+
+     // publish what we've found
+     emit foundEntry( entry, false );
+ }
+
+
+ //_______________________________________________________________________________________________________________
+ // find single bibtex field in html page and add to entry
+
+ // Searches data for the regular expression given in description and, on a
+ // match, adds the text of the first capture group to entry as a field of the
+ // given type. Leaves entry untouched if there is no match.
+ void WebQueryCiteSeerX::parseForSingleExpression( QString description, const QString &data, Entry *entry, BibTeX::EntryField::FieldType type )
+ {
+     QRegExp fieldXpr( description );
+
+     // search() reports -1 if the expression does not occur in the page
+     if ( fieldXpr.search( data ) < 0 )
+         return;
+
+     Value *fieldValue = new Value( fieldXpr.cap( 1 ), false );
+     EntryField *newField = new EntryField( type );
+     newField->setValue( fieldValue );
+     entry->addField( newField );
+ }
+
+
+ //_______________________________________________________________________________________________________________
+ // read data from the job and start the current parser
+
+ // Slot connected to the result signal of the download job started in
+ // nextJob(): hands the downloaded text to the parser selected for this
+ // request, then starts the next request.
+ // Nothing is parsed if the job failed, is not a stored transfer job, or the
+ // search has been aborted.
+ void WebQueryCiteSeerX::getData( KIO::Job *job )
+ {
+     // advance GUI progress bar
+     enterNextStage();
+
+     // only a stored transfer job carries the downloaded data; the cast
+     // yields 0 for a null job or a job of any other type
+     KIO::StoredTransferJob *storedJob = dynamic_cast<KIO::StoredTransferJob*>( job );
+
+     if ( storedJob != 0 && !storedJob->error() && !m_aborted )
+     {
+         // read data: copy the received bytes into a buffer and decode them
+         // through a text stream
+         QBuffer data;
+         data.open( IO_WriteOnly );
+         data.writeBlock( storedJob->data() );
+         data.close();
+         data.open( IO_ReadOnly );
+         QTextStream ts( &data );
+         QString result = ts.read();
+         data.close();
+
+         // hand the read data over to the parser
+         ( this->*m_currentParser )( result );
+     }
+
+     // proceed
+     nextJob();
+ }
+
+
+ //_______________________________________________________________________________________________________________
+ // call the next job
+
+ // Starts the download for the request at the head of the queue, or ends the
+ // search once the queue has run empty. Does nothing if requests are left but
+ // the search has been aborted.
+ void WebQueryCiteSeerX::nextJob()
+ {
+     // no more requests: finished
+     if ( m_queryQueue.size() == 0 )
+     {
+         setEndSearch( WebQuery::statusSuccess );
+         m_receivedHits = 0;
+         return;
+     }
+
+     if ( m_aborted )
+         return;
+
+     // take the next request off the queue ...
+     const DataRequest request = m_queryQueue.front();
+     m_queryQueue.pop_front();
+
+     // ... remember its parser for getData() and start the download
+     m_currentParser = request.parser;
+     KIO::Job *job = KIO::storedGet( request.url, FALSE, FALSE );
+     connect( job, SIGNAL( result( KIO::Job * ) ), this, SLOT( getData( KIO::Job * ) ) );
+ }
+
+}
+
+#include "webqueryciteseerx.moc"