/***************************************************************************
    copyright            : (C) 2004-2006 by Robby Stephenson
    email                : robby@periapsis.org
 ***************************************************************************/

/***************************************************************************
 *                                                                         *
 *   This program is free software; you can redistribute it and/or modify  *
 *   it under the terms of version 2 of the GNU General Public License as  *
 *   published by the Free Software Foundation;                            *
 *                                                                         *
 ***************************************************************************/

#include "risimporter.h"
#include "../collections/bibtexcollection.h"
#include "../document.h"
#include "../entry.h"
#include "../field.h"
#include "../latin1literal.h"
#include "../progressmanager.h"
#include "../filehandler.h"
#include "../isbnvalidator.h"
#include "../tellico_debug.h"

#include <tdeapplication.h>

#include <tqdict.h>
#include <tqregexp.h>
#include <tqmap.h>

using Tellico::Import::RISImporter;

TQMap<TQString, TQString>* RISImporter::s_tagMap = 0;
TQMap<TQString, TQString>* RISImporter::s_typeMap = 0;

namespace {
  // One row of a static Latin-1 key -> value lookup table.
  struct StringPair {
    const char* key;
    const char* value;
  };

  // Default mapping from RIS tag to collection field name.
  // BT is special and is handled separately
  const StringPair risTagTable[] = {
    { "TY", "entry-type" },
    { "ID", "bibtex-key" },
    { "T1", "title" },
    { "TI", "title" },
    { "T2", "booktitle" },
    { "A1", "author" },
    { "AU", "author" },
    { "ED", "editor" },
    { "YR", "year" },
    { "PY", "year" },
    { "N1", "note" },
    { "AB", "abstract" }, // should be note?
    { "N2", "abstract" },
    { "KW", "keyword" },
    { "JF", "journal" },
    { "JO", "journal" },
    { "JA", "journal" },
    { "VL", "volume" },
    { "IS", "number" },
    { "PB", "publisher" },
    { "SN", "isbn" },
    { "AD", "address" },
    { "CY", "address" },
    { "UR", "url" },
    { "L1", "pdf" },
    { "T3", "series" },
    { "EP", "pages" }
  };

  // Mapping from RIS reference type (value of the TY tag) to entry-type name.
  // leave capitalized, except for bibtex types
  const StringPair risTypeTable[] = {
    { "ABST",   "Abstract" },
    { "ADVS",   "Audiovisual material" },
    { "ART",    "Art Work" },
    { "BILL",   "Bill/Resolution" },
    { "BOOK",   "book" }, // bibtex
    { "CASE",   "Case" },
    { "CHAP",   "inbook" }, // == "inbook" ?
    { "COMP",   "Computer program" },
    { "CONF",   "inproceedings" }, // == "conference" ?
    { "CTLG",   "Catalog" },
    { "DATA",   "Data file" },
    { "ELEC",   "Electronic Citation" },
    { "GEN",    "Generic" },
    { "HEAR",   "Hearing" },
    { "ICOMM",  "Internet Communication" },
    { "INPR",   "In Press" },
    { "JFULL",  "Journal (full)" }, // = "periodical" ?
    { "JOUR",   "article" }, // "Journal"
    { "MAP",    "Map" },
    { "MGZN",   "article" }, // bibtex
    { "MPCT",   "Motion picture" },
    { "MUSIC",  "Music score" },
    { "NEWS",   "Newspaper" },
    { "PAMP",   "Pamphlet" }, // = "booklet" ?
    { "PAT",    "Patent" },
    { "PCOMM",  "Personal communication" },
    { "RPRT",   "Report" }, // = "techreport" ?
    { "SER",    "Serial (BookMonograph)" },
    { "SLIDE",  "Slide" },
    { "SOUND",  "Sound recording" },
    { "STAT",   "Statute" },
    { "THES",   "phdthesis" }, // "mastersthesis" ?
    { "UNBILL", "Unenacted bill/resolution" },
    { "UNPB",   "unpublished" }, // bibtex
    { "VIDEO",  "Video recording" }
  };

  // Pattern for the start of an RIS line: a two-character tag, white space,
  // a hyphen, then the value.
  // technically, the spec requires a space immediately after the hyphen
  // however, at least one website (Springer) outputs RIS with no space after the final "ER -"
  // so the value's white space is stripped by the caller instead
  // also be gracious and allow any amount of space before hyphen
  const char* const risLinePattern = "^(\\w\\w)\\s+-(.*)$";

  // Allocates a new map holding the first count_ rows of table_.
  TQMap<TQString, TQString>* newMapFromTable(const StringPair* table_, uint count_) {
    TQMap<TQString, TQString>* map = new TQMap<TQString, TQString>();
    for(uint i = 0; i < count_; ++i) {
      map->insert(TQString::fromLatin1(table_[i].key), TQString::fromLatin1(table_[i].value));
    }
    return map;
  }
}

// static
// Builds the shared tag -> field-name map the first time it is needed.
void RISImporter::initTagMap() {
  if(s_tagMap) {
    return;
  }
  s_tagMap = newMapFromTable(risTagTable, sizeof(risTagTable)/sizeof(risTagTable[0]));
}

// static
// Builds the shared RIS-type -> entry-type map the first time it is needed.
void RISImporter::initTypeMap() {
  if(s_typeMap) {
    return;
  }
  s_typeMap = newMapFromTable(risTypeTable, sizeof(risTypeTable)/sizeof(risTypeTable[0]));
}

// Creates an importer for the given URLs and makes sure the shared
// lookup maps exist.
RISImporter::RISImporter(const KURL::List& urls_) : Tellico::Import::Importer(urls_),
    m_coll(0), m_cancelled(false) {
  initTagMap();
  initTypeMap();
}

// Only bibliographic (bibtex) collections can receive RIS data.
bool RISImporter::canImport(int type) const {
  return type == Data::Collection::Bibtex;
}

// Reads every URL into a new bibtex collection and returns it.
// The result is cached, so repeated calls return the same collection.
// Returns a null pointer if the import was cancelled.
Tellico::Data::CollPtr RISImporter::collection() {
  if(m_coll) {
    return m_coll;
  }

  m_coll = new Data::BibtexCollection(true);

  // fields keyed by the RIS tag stored in their "ris" property
  TQDict<Data::Field> risFields;

  // need to know if any extended properties in current collection point to RIS
  // if so, add to collection
  Data::CollPtr currColl = Data::Document::self()->collection();
  Data::FieldVec vec = currColl->fields();
  for(Data::FieldVec::Iterator it = vec.begin(); it != vec.end(); ++it) {
    // continue if property is empty
    TQString ris = it->property(TQString::fromLatin1("ris"));
    if(ris.isEmpty()) {
      continue;
    }
    // if the new collection already has a field with the same name, reuse it,
    // otherwise copy the field over; either way, set the property
    Data::FieldPtr f = m_coll->fieldByName(it->name());
    if(!f) {
      f = new Data::Field(*it);
      m_coll->addField(f);
    }
    f->setProperty(TQString::fromLatin1("ris"), ris);
    risFields.insert(ris, f);
  }

  // each URL accounts for 100 progress steps
  ProgressItem& item = ProgressManager::self()->newProgressItem(this, progressLabel(), true);
  item.setTotalSteps(urls().count() * 100);
  connect(&item, TQT_SIGNAL(signalCancelled(ProgressItem*)), TQT_SLOT(slotCancel()));
  // NOTE(review): presumably marks the progress item as done when it goes
  // out of scope -- confirm against ProgressItem::Done
  ProgressItem::Done done(this);

  int count = 0;
  KURL::List urls = this->urls();
  for(KURL::List::ConstIterator it = urls.begin(); it != urls.end() && !m_cancelled; ++it, ++count) {
    readURL(*it, count, risFields);
  }

  // a cancelled import yields no collection at all
  if(m_cancelled) {
    m_coll = 0;
  }

  return m_coll;
}

// Parses the RIS text at url_ and adds one entry per record to m_coll.
//   url_       - location of the RIS file
//   n          - zero-based index of this URL, used as the progress offset
//   risFields_ - fields that carry a custom "ris" property, keyed by RIS tag;
//                these take precedence over the default tag mapping
void RISImporter::readURL(const KURL& url_, int n, const TQDict<Data::Field>& risFields_) {
  TQString str = FileHandler::readTextFile(url_);
  if(str.isEmpty()) {
    return;
  }

  ISBNValidator isbnval(this);

  TQTextIStream t(&str);

  const uint length = str.length();
  const uint stepSize = TQMAX(s_stepSize, length/100);
  const bool showProgress = options() & ImportProgress;

  // true while the current entry holds fields that have not been added yet
  bool needToAddFinal = false;

  // start and end page of the current record; joined once both are known
  TQString sp, ep;

  // j approximates the number of characters consumed so far
  uint j = 0;
  // value of j when progress was last reported
  uint lastReported = 0;
  Data::EntryPtr entry = new Data::Entry(m_coll);
  TQRegExp rx(TQString::fromLatin1(risLinePattern));
  TQString currLine, nextLine;
  for(currLine = t.readLine(); !m_cancelled && !currLine.isNull(); currLine = nextLine, j += currLine.length()) {
    nextLine = t.readLine();
    rx.search(currLine);
    TQString tag = rx.cap(1);
    TQString value = rx.cap(2).stripWhiteSpace();
    if(tag.isEmpty()) {
      continue;
    }
//    myDebug() << tag << ": " << value << endl;
    // if the next line is not empty and does not match start regexp, append to value
    while(!nextLine.isEmpty() && nextLine.find(rx) == -1) {
      value += nextLine.stripWhiteSpace();
      nextLine = t.readLine();
    }

    // every entry ends with "ER"
    if(tag == Latin1Literal("ER")) {
      m_coll->addEntries(entry);
      entry = new Data::Entry(m_coll);
      needToAddFinal = false;
      // an unpaired start or end page must not leak into the next record,
      // where it would be joined with that record's page into a bogus range
      // NOTE(review): a record with only SP (or only EP) still ends up
      // without any pages value
      sp = TQString();
      ep = TQString();
      continue;
    } else if(tag == Latin1Literal("TY") && s_typeMap->contains(value)) {
      // for entry-type, switch it to normalized type name
      value = (*s_typeMap)[value];
    } else if(tag == Latin1Literal("SN")) {
      // test for valid isbn, sometimes the issn gets stuck here
      int pos = 0;
      if(isbnval.validate(value, pos) != ISBNValidator::Acceptable) {
        continue;
      }
    } else if(tag == Latin1Literal("SP")) {
      sp = value;
      if(!ep.isEmpty()) {
        // end page was seen first; store the joined range under the EP tag
        value = sp + '-' + ep;
        tag = TQString::fromLatin1("EP");
        sp = TQString();
        ep = TQString();
      } else {
        // nothing else to do until the end page shows up
        continue;
      }
    } else if(tag == Latin1Literal("EP")) {
      ep = value;
      if(!sp.isEmpty()) {
        value = sp + '-' + ep;
        sp = TQString();
        ep = TQString();
      } else {
        // wait for the start page
        continue;
      }
    } else if(tag == Latin1Literal("YR") || tag == Latin1Literal("PY")) {
      // for now, just grab the year
      value = value.section('/', 0, 0);
    }

    // the lookup scheme is:
    // 1. any field has an RIS property that matches the tag name
    // 2. default field mapping tag -> field name
    Data::FieldPtr f = risFields_.find(tag);
    if(!f) {
      // special case for BT
      // primary title for books, secondary for everything else
      if(tag == Latin1Literal("BT")) {
        if(entry->field(TQString::fromLatin1("entry-type")) == Latin1Literal("book")) {
          f = m_coll->fieldByName(TQString::fromLatin1("title"));
        } else {
          f = m_coll->fieldByName(TQString::fromLatin1("booktitle"));
        }
      } else {
        f = fieldByTag(tag);
      }
    }
    if(!f) {
      // no field for this tag, so drop the value
      continue;
    }
    needToAddFinal = true;

    // harmless for non-choice fields
    // for entry-type, want it in lower case
    f->addAllowed(value);
    // if the field can have multiple values, append current values to new value
    if((f->flags() & Data::Field::AllowMultiple) && !entry->field(f->name()).isEmpty()) {
      value.prepend(entry->field(f->name()) + TQString::fromLatin1("; "));
    }
    entry->setField(f, value);

    // j advances by whole line lengths, so it practically never lands on an
    // exact multiple of stepSize; report whenever at least stepSize characters
    // have gone by since the last report (this also avoids a modulo by zero)
    if(showProgress && j - lastReported >= stepSize) {
      lastReported = j;
      ProgressManager::self()->setProgress(this, n*100 + 100*j/length);
      kapp->processEvents();
    }
  }

  // the last record may be missing its "ER" line
  if(needToAddFinal) {
    m_coll->addEntries(entry);
  }
}

// Returns the collection field that the default mapping assigns to tag_,
// tagging it with the "ris" property. The "pdf" field for the L1 tag is
// created on demand. Returns a null pointer when no field applies.
Tellico::Data::FieldPtr RISImporter::fieldByTag(const TQString& tag_) {
  Data::FieldPtr f = 0;
  // check with contains() first: operator[] on a non-const map would insert
  // an empty entry into the shared map for every unknown tag
  if(s_tagMap->contains(tag_)) {
    f = m_coll->fieldByName((*s_tagMap)[tag_]);
    if(f) {
      f->setProperty(TQString::fromLatin1("ris"), tag_);
      return f;
    }
  }

  // add non-default fields if not already there
  if(tag_ == Latin1Literal("L1")) {
    f = new Data::Field(TQString::fromLatin1("pdf"), i18n("PDF"), Data::Field::URL);
    f->setProperty(TQString::fromLatin1("ris"), TQString::fromLatin1("L1"));
    f->setCategory(i18n("Miscellaneous"));
  }
  // only hand a real field to the collection
  if(f) {
    m_coll->addField(f);
  }
  return f;
}

// Slot connected to the progress item's cancel signal; the read loops
// check the flag and stop.
void RISImporter::slotCancel() {
  m_cancelled = true;
}

// Quick check whether the file at url_ looks like RIS data.
// Returns true if the first non-blank line is a valid RIS line.
bool RISImporter::maybeRIS(const KURL& url_) {
  TQString text = FileHandler::readTextFile(url_, true /*quiet*/);
  if(text.isEmpty()) {
    return false;
  }

  // bare bones check, strip white space at beginning
  // and then first text line must be valid RIS
  TQTextIStream t(&text);

  TQRegExp rx(TQString::fromLatin1(risLinePattern));
  TQString currLine = t.readLine();
  while(!currLine.isNull() && currLine.stripWhiteSpace().isEmpty()) {
    currLine = t.readLine();
  }
  // currLine is null here if the text held nothing but blank lines
  return rx.exactMatch(currLine);
}

#include "risimporter.moc"