author | tpearson <tpearson@283d02a7-25f6-0310-bc7c-ecb5cbfe19da> | 2010-03-01 19:17:32 +0000
---|---|---
committer | tpearson <tpearson@283d02a7-25f6-0310-bc7c-ecb5cbfe19da> | 2010-03-01 19:17:32 +0000
commit | e38d2351b83fa65c66ccde443777647ef5cb6cff (patch) |
tree | 1897fc20e9f73a81c520a5b9f76f8ed042124883 /src/fetch/scripts |
Added KDE3 version of Tellico
git-svn-id: svn://anonsvn.kde.org/home/kde/branches/trinity/applications/tellico@1097620 283d02a7-25f6-0310-bc7c-ecb5cbfe19da
Diffstat (limited to 'src/fetch/scripts')
Mode | File | Lines
---|---|---
-rw-r--r-- | src/fetch/scripts/Makefile.am | 30
-rw-r--r-- | src/fetch/scripts/boardgamegeek.rb | 235
-rw-r--r-- | src/fetch/scripts/boardgamegeek.rb.spec | 7
-rw-r--r-- | src/fetch/scripts/dark_horse_comics.py | 399
-rw-r--r-- | src/fetch/scripts/dark_horse_comics.py.spec | 7
-rwxr-xr-x | src/fetch/scripts/fr.allocine.py | 335
-rw-r--r-- | src/fetch/scripts/fr.allocine.py.spec | 7
-rw-r--r-- | src/fetch/scripts/ministerio_de_cultura.py | 595
-rw-r--r-- | src/fetch/scripts/ministerio_de_cultura.py.spec | 7
9 files changed, 1622 insertions, 0 deletions
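
All of the fetcher scripts added below follow the same external data-source contract described in their docstrings: Tellico invokes the script with a search term, and the script prints a complete Tellico XML collection on stdout. As a reading aid, here is a minimal sketch of that output shape, assuming Python 3 (the committed scripts themselves target Python 2 and Ruby) and reusing the Tellico V9 DTD header that the Python fetchers embed; the collection title and entry values are placeholders, not data from this commit.

```python
# Minimal sketch of the stdout contract the fetcher scripts honour.
# Python 3 is used for illustration only; the committed scripts are
# Python 2 / Ruby. Field values and the collection title are placeholders.
import xml.dom.minidom

XML_HEADER = """<?xml version="1.0" encoding="UTF-8"?>"""
DOCTYPE = """<!DOCTYPE tellico PUBLIC "-//Robby Stephenson/DTD Tellico V9.0//EN" "http://periapsis.org/tellico/dtd/v9/tellico.dtd">"""

def build_minimal_collection(title, entry_title):
    doc = xml.dom.minidom.Document()
    root = doc.createElement('tellico')
    root.setAttribute('xmlns', 'http://periapsis.org/tellico/')
    root.setAttribute('syntaxVersion', '9')

    collection = doc.createElement('collection')
    collection.setAttribute('title', title)
    # Collection types used by these scripts: 2 = books, 3 = movies,
    # 6 = comics, 13 = board games.
    collection.setAttribute('type', '2')

    entry = doc.createElement('entry')
    entry.setAttribute('id', '0')
    title_node = doc.createElement('title')
    title_node.appendChild(doc.createTextNode(entry_title))
    entry.appendChild(title_node)

    collection.appendChild(entry)
    root.appendChild(collection)
    doc.appendChild(root)
    return doc

if __name__ == '__main__':
    print(XML_HEADER)
    print(DOCTYPE)
    print(build_minimal_collection('My Books', 'Example Title').toxml())
```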
diff --git a/src/fetch/scripts/Makefile.am b/src/fetch/scripts/Makefile.am new file mode 100644 index 0000000..050c460 --- /dev/null +++ b/src/fetch/scripts/Makefile.am @@ -0,0 +1,30 @@ +####### kdevelop will overwrite this part!!! (begin)########## + +EXTRA_DIST = \ +fr.allocine.py fr.allocine.py.spec \ +ministerio_de_cultura.py ministerio_de_cultura.py.spec \ +dark_horse_comics.py dark_horse_comics.py.spec \ +boardgamegeek.rb boardgamegeek.rb.spec + +####### kdevelop will overwrite this part!!! (end)############ + +scriptdir = $(kde_datadir)/tellico/data-sources +script_SCRIPTS = \ +fr.allocine.py \ +ministerio_de_cultura.py \ +dark_horse_comics.py \ +boardgamegeek.rb + +script_DATA = \ +fr.allocine.py.spec \ +ministerio_de_cultura.py.spec \ +dark_horse_comics.py.spec \ +boardgamegeek.rb.spec + +KDE_OPTIONS = noautodist + +CLEANFILES = *~ + +# probably a better way to do this +uninstall-hook: + -if [ -d $(scriptdir) ]; then rmdir $(scriptdir); fi diff --git a/src/fetch/scripts/boardgamegeek.rb b/src/fetch/scripts/boardgamegeek.rb new file mode 100644 index 0000000..b3cf4f3 --- /dev/null +++ b/src/fetch/scripts/boardgamegeek.rb @@ -0,0 +1,235 @@ +#!/usr/bin/env ruby +# +# *************************************************************************** +# copyright : (C) 2006 by Steve Beattie +# : (C) 2008 by Sven Werlen +# email : [email protected] +# : [email protected] +# *************************************************************************** +# +# *************************************************************************** +# * * +# * This program is free software; you can redistribute it and/or modify * +# * it under the terms of version 2 of the GNU General Public License as * +# * published by the Free Software Foundation; * +# * * +# *************************************************************************** + +# $Id: boardgamegeek.rb 313 2006-10-02 22:17:11Z steve $ + +# This program is expected to be invoked from tellico +# (http://periapsis.org/tellico) as an external data source. It provides +# searches for boardgames from the boardgamegeek.com website, via +# boardgamegeek's xmlapi interface +# (http://www.boardgamegeek.com/xmlapi/) +# +# It only allows searches via name; the boardgamegeek xmlapi is not yet +# rich enough to support queries by designer, publisher, category, or +# mechanism. I'd like to add support for querying by boardgamegeek id, +# but that needs additional support in tellico. +# +# Sven Werlen: 03 Feb 2008: script has been extended to retrieve cover +# images (/thumbnail from xmlapi). Images are retrieved from the website +# and base64 is generated on-the-fly. 
+# +require 'rexml/document' +require 'net/http' +require 'cgi' +require "base64" +include REXML + +$my_version = '$Rev: 313 $' + +class Game + attr_writer :year + attr_writer :description + attr_writer :cover + attr_writer :image + + def initialize(name, id) + @name = name + @id = id + @publishers = [] + @designers = [] + @players = [] + end + + def add_publisher(publisher) + @publishers << publisher + end + + def add_designer(designer) + @designers << designer + end + + def add_players(players) + @players << players + end + + def to_s() + "@name (#@id #@publishers #@year)" + end + + def toXML() + element = Element.new 'entry' + element.add_element Element.new('title').add_text(@name) + element.add_element Element.new('description').add_text(@description) if @description + element.add_element Element.new('year').add_text(@year) if @year + element.add_element Element.new('boardgamegeek-link').add_text("http://www.boardgamegeek/game/#{@id}") if @id + element.add_element Element.new('bggid').add_text(@id) if @id + element.add_element Element.new('cover').add_text(@cover) if @cover + + if @publishers.length > 0 + pub_elements = Element.new('publishers') + @publishers.each {|p| pub_elements.add_element Element.new('publisher').add_text(p)} + element.add_element pub_elements + end + if @designers.length > 0 + des_elements = Element.new('designers') + @designers.each {|d| des_elements.add_element Element.new('designer').add_text(d)} + element.add_element des_elements + end + if @players.length > 0 + players_elements = Element.new('num-players') + @players.each {|n| players_elements.add_element Element.new('num-player').add_text(n.to_s)} + element.add_element players_elements + end + return element + end + + def image() + image = Element.new 'image' + image.add_attribute('format', 'JPEG') + image.add_attribute('id', @id + ".jpg") + image.add_text(@image) + return image + end +end + +def getGameList(query) + #puts("Query is #{query}") + + search_result = nil + Net::HTTP.start('www.boardgamegeek.com', 80) do + |http| search_result = (http.get("/xmlapi/search?search=#{CGI.escape(query)}", + {"User-Agent" => "BoardGameGeek plugin for Tellico #{$my_version}"}).body) + http.finish + end + doc = REXML::Document.new(search_result) + + games = XPath.match(doc, "//game") + #games.each {|g| puts g.elements['name'].text+g.attributes['gameid']} + ids = [] + games.each {|g| ids << g.attributes['gameid']} + return ids +end + +def getGameDetails(ids) + #ids.each {|id| puts id} + + query = "/xmlapi/game/#{ids.join(',')}" + #puts query + search_result = nil + Net::HTTP.start('www.boardgamegeek.com', 80) do |http| + search_result = http.get(query, {"User-Agent" => "BoardGameGeek plugin for Tellico #{$my_version}"}) + http.finish + end + games = [] + case search_result + when Net::HTTPOK then + doc = REXML::Document.new(search_result.body) + + games_xml = XPath.match(doc, "//game") + games_xml.each do |g| + if( g.elements['name'] != nil ) + game = Game.new(g.elements['name'].text, g.attributes['gameid']) + game.year = g.elements['yearpublished'].text + game.description = g.elements['description'].text + g.elements.each('publisher'){|p| game.add_publisher p.elements['name'].text} + g.elements.each('designer'){|d| game.add_designer d.elements['name'].text} + minp = Integer(g.elements['minplayers'].text) + maxp = Integer(g.elements['maxplayers'].text) + minp.upto(maxp) {|n| game.add_players(n)} + + # retrieve cover + coverurl = g.elements['thumbnail'] != nil ? 
g.elements['thumbnail'].text : nil + if( coverurl =~ /files.boardgamegeek.com(.*)$/ ) + # puts "downloading... " + $1 + cover = nil + Net::HTTP.start('files.boardgamegeek.com', 80) do |http| + cover = (http.get($1, {"User-Agent" => "BoardGameGeek plugin for Tellico #{$my_version}"})) + end + case cover + when Net::HTTPOK then + game.cover = g.attributes['gameid'] + ".jpg"; + game.image = Base64.encode64(cover.body); + end + else + # puts "invalid cover: " + coverurl + end + games << game + end + end + end + return games +end + +def listToXML(gameList) + doc = REXML::Document.new + doc << REXML::DocType.new('tellico PUBLIC', '"-//Robby Stephenson/DTD Tellico V10.0//EN" "http://periapsis.org/tellico/dtd/v10/tellico.dtd"') + doc << XMLDecl.new + tellico = Element.new 'tellico' + tellico.add_attribute('xmlns', 'http://periapsis.org/tellico/') + tellico.add_attribute('syntaxVersion', '10') + collection = Element.new 'collection' + collection.add_attribute('title', 'My Collection') + collection.add_attribute('type', '13') + + fields = Element.new 'fields' + field = Element.new 'field' + field.add_attribute('name', '_default') + fields.add_element(field) + field = Element.new 'field' + field.add_attribute('name', 'bggid') + field.add_attribute('title', 'BoardGameGeek ID') + field.add_attribute('category', 'General') + field.add_attribute('flags', '0') + field.add_attribute('format', '4') + field.add_attribute('type', '6') + field.add_attribute('i18n', 'true') + fields.add_element(field) + collection.add_element(fields) + + images = Element.new 'images' + + id = 0 + gameList.each do + |g| element = g.toXML() + element.add_attribute('id', id) + id = id + 1 + collection.add_element(element) + images.add_element(g.image()); + end + collection.add_element(images); + tellico.add_element(collection) + doc.add_element(tellico) + doc.write($stdout, 0) + puts "" +end + +if __FILE__ == $0 + + def showUsage + warn "usage: #{__FILE__} game_query" + exit 1 + end + + showUsage unless ARGV.length == 1 + + idList = getGameList(ARGV.shift) + if idList + gameList = getGameDetails(idList) + end + + listToXML(gameList) +end diff --git a/src/fetch/scripts/boardgamegeek.rb.spec b/src/fetch/scripts/boardgamegeek.rb.spec new file mode 100644 index 0000000..6e0aab0 --- /dev/null +++ b/src/fetch/scripts/boardgamegeek.rb.spec @@ -0,0 +1,7 @@ +Name=BoardGameGeek +Type=data-source +ArgumentKeys=1 +Arguments=%1 +CollectionType=13 +FormatType=0 +UpdateArgs=%{title} diff --git a/src/fetch/scripts/dark_horse_comics.py b/src/fetch/scripts/dark_horse_comics.py new file mode 100644 index 0000000..4f3b651 --- /dev/null +++ b/src/fetch/scripts/dark_horse_comics.py @@ -0,0 +1,399 @@ +#!/usr/bin/env python +# -*- coding: iso-8859-1 -*- + +# *************************************************************************** +# copyright : (C) 2006 by Mathias Monnerville +# email : [email protected] +# *************************************************************************** +# +# *************************************************************************** +# * * +# * This program is free software; you can redistribute it and/or modify * +# * it under the terms of version 2 of the GNU General Public License as * +# * published by the Free Software Foundation; * +# * * +# *************************************************************************** + +# $Id: comics_darkhorsecomics.py 123 2006-03-24 08:47:48Z mathias $ + +""" +This script has to be used with tellico (http://periapsis.org/tellico) as an external data source program. 
+It allows searching through the Dark Horse Comics web database. + +Related info and cover are fetched automatically. It takes only one argument (comic title). + +Tellico data source setup: +- source name: Dark Horse Comics (US) (or whatever you want :) +- Collection type: comics collection +- Result type: tellico +- Path: /path/to/script/comics_darkhorsecomics.py +- Arguments: +Title (checked) = %1 +Update (checked) = %{title} +""" + +import sys, os, re, md5, random, string +import urllib, urllib2, time, base64 +import xml.dom.minidom + +XML_HEADER = """<?xml version="1.0" encoding="UTF-8"?>""" +DOCTYPE = """<!DOCTYPE tellico PUBLIC "-//Robby Stephenson/DTD Tellico V9.0//EN" "http://periapsis.org/tellico/dtd/v9/tellico.dtd">""" +NULLSTRING = '' + +VERSION = "0.2" + + +def genMD5(): + """ + Generates and returns a random md5 string. Its main purpose is to allow random + image file name generation. + """ + obj = md5.new() + float = random.random() + obj.update(str(float)) + return obj.hexdigest() + +class BasicTellicoDOM: + """ + This class manages tellico's XML data model (DOM) + """ + def __init__(self): + self.__doc = xml.dom.minidom.Document() + self.__root = self.__doc.createElement('tellico') + self.__root.setAttribute('xmlns', 'http://periapsis.org/tellico/') + self.__root.setAttribute('syntaxVersion', '9') + + self.__collection = self.__doc.createElement('collection') + self.__collection.setAttribute('title', 'My Comics') + self.__collection.setAttribute('type', '6') + + self.__images = self.__doc.createElement('images') + + self.__root.appendChild(self.__collection) + self.__doc.appendChild(self.__root) + + # Current movie id. See entry's id attribute in self.addEntry() + self.__currentId = 0 + + + def addEntry(self, movieData): + """ + Add a comic entry. 
+ Returns an entry node instance + """ + d = movieData + entryNode = self.__doc.createElement('entry') + entryNode.setAttribute('id', str(self.__currentId)) + + titleNode = self.__doc.createElement('title') + titleNode.appendChild(self.__doc.createTextNode(unicode(d['title'], 'latin-1').encode('utf-8'))) + + yearNode = self.__doc.createElement('pub_year') + yearNode.appendChild(self.__doc.createTextNode(d['pub_year'])) + + countryNode = self.__doc.createElement('country') + countryNode.appendChild(self.__doc.createTextNode(d['country'])) + pubNode = self.__doc.createElement('publisher') + pubNode.appendChild(self.__doc.createTextNode(d['publisher'])) + langNode = self.__doc.createElement('language') + langNode.appendChild(self.__doc.createTextNode(d['language'])) + + writersNode = self.__doc.createElement('writers') + for g in d['writer']: + writerNode = self.__doc.createElement('writer') + writerNode.appendChild(self.__doc.createTextNode(unicode(g, 'latin-1').encode('utf-8'))) + writersNode.appendChild(writerNode) + + genresNode = self.__doc.createElement('genres') + for g in d['genre']: + genreNode = self.__doc.createElement('genre') + genreNode.appendChild(self.__doc.createTextNode(unicode(g, 'latin-1').encode('utf-8'))) + genresNode.appendChild(genreNode) + + commentsNode = self.__doc.createElement('comments') + #for g in d['comments']: + # commentsNode.appendChild(self.__doc.createTextNode(unicode("%s\n\n" % g, 'latin-1').encode('utf-8'))) + commentsData = string.join(d['comments'], '\n\n') + commentsNode.appendChild(self.__doc.createTextNode(unicode(commentsData, 'latin-1').encode('utf-8'))) + + artistsNode = self.__doc.createElement('artists') + for k, v in d['artist'].iteritems(): + artistNode = self.__doc.createElement('artist') + artistNode.appendChild(self.__doc.createTextNode(unicode(v, 'latin-1').encode('utf-8'))) + artistsNode.appendChild(artistNode) + + pagesNode = self.__doc.createElement('pages') + pagesNode.appendChild(self.__doc.createTextNode(d['pages'])) + + issueNode = self.__doc.createElement('issue') + issueNode.appendChild(self.__doc.createTextNode(d['issue'])) + + if d['image']: + imageNode = self.__doc.createElement('image') + imageNode.setAttribute('format', 'JPEG') + imageNode.setAttribute('id', d['image'][0]) + imageNode.appendChild(self.__doc.createTextNode(unicode(d['image'][1], 'latin-1').encode('utf-8'))) + + coverNode = self.__doc.createElement('cover') + coverNode.appendChild(self.__doc.createTextNode(d['image'][0])) + + for name in ( 'writersNode', 'genresNode', 'artistsNode', 'pagesNode', 'yearNode', + 'titleNode', 'issueNode', 'commentsNode', 'pubNode', 'langNode', + 'countryNode' ): + entryNode.appendChild(eval(name)) + + if d['image']: + entryNode.appendChild(coverNode) + self.__images.appendChild(imageNode) + + self.__collection.appendChild(entryNode) + + self.__currentId += 1 + return entryNode + + def printEntry(self, nEntry): + """ + Prints entry's XML content to stdout + """ + try: + print nEntry.toxml() + except: + print sys.stderr, "Error while outputing XML content from entry to Tellico" + + def printXMLTree(self): + """ + Outputs XML content to stdout + """ + self.__collection.appendChild(self.__images) + print XML_HEADER; print DOCTYPE + print self.__root.toxml() + + +class DarkHorseParser: + def __init__(self): + self.__baseURL = 'http://www.darkhorse.com' + self.__basePath = '/profile/profile.php?sku=' + self.__searchURL = '/search/search.php?frompage=userinput&sstring=%s&x=0&y=0' + self.__coverPath = 
'http://images.darkhorse.com/covers/' + self.__movieURL = self.__baseURL + self.__basePath + + # Define some regexps + self.__regExps = { 'title' : '<font size="\+2"><b>(?P<title>.*?)</b></font>', + 'pub_date' : '<b>Pub.* Date:</b> *<a.*>(?P<pub_date>.*)</a>', + 'desc' : '<p>(?P<desc>.*?)<br>', + 'writer' : '<b>Writer: *</b> *<a.*?>(?P<writer>.*)</a>', + 'cover_artist' : '<b>Cover Artist: *</b> *<a.*>(?P<cover_artist>.*)</a>', + 'penciller' : '<b>Penciller: *</b> *<a.*>(?P<penciller>.*)</a>', + 'inker' : '<b>Inker: *</b> *<a.*>(?P<inker>.*)</a>', + 'letterer' : '<b>Letterer: *</b> *<a.*>(?P<letterer>.*)</a>', + 'colorist' : '<b>Colorist: *</b> *<a.*>(?P<colorist>.*)</a>', + 'genre' : '<b>Genre: *</b> *<a.*?>(?P<genre>.*?)</a><br>', + 'format' : '<b>Format: *</b> *(?P<format>.*?)<br>', + } + + # Compile patterns objects + self.__regExpsPO = {} + for k, pattern in self.__regExps.iteritems(): + self.__regExpsPO[k] = re.compile(pattern) + + self.__domTree = BasicTellicoDOM() + + def run(self, title): + """ + Runs the allocine.fr parser: fetch movie related links, then fills and prints the DOM tree + to stdout (in tellico format) so that tellico can use it. + """ + self.__getMovie(title) + # Print results to stdout + self.__domTree.printXMLTree() + + def __getHTMLContent(self, url): + """ + Fetch HTML data from url + """ + u = urllib2.urlopen(url) + self.__data = u.read() + u.close() + + def __fetchMovieLinks(self): + """ + Retrieve all links related to the search. self.__data contains HTML content fetched by self.__getHTMLContent() + that need to be parsed. + """ + matchList = re.findall("""<a *href="%s(?P<page>.*?)">(?P<title>.*?)</a>""" % self.__basePath.replace('?', '\?'), self.__data) + if not matchList: return None + + return matchList + + def __fetchCover(self, path, delete = True): + """ + Fetch cover to /tmp. Returns base64 encoding of data. + The image is deleted if delete is True + """ + md5 = genMD5() + imObj = urllib2.urlopen(path.strip()) + img = imObj.read() + imObj.close() + imgPath = "/tmp/%s.jpeg" % md5 + try: + f = open(imgPath, 'w') + f.write(img) + f.close() + except: + print sys.stderr, "Error: could not write image into /tmp" + + b64data = (md5 + '.jpeg', base64.encodestring(img)) + + # Delete temporary image + if delete: + try: + os.remove(imgPath) + except: + print sys.stderr, "Error: could not delete temporary image /tmp/%s.jpeg" % md5 + + return b64data + + def __fetchMovieInfo(self, url): + """ + Looks for movie information + """ + self.__getHTMLContent(url) + + # First grab picture data + imgMatch = re.search("""<img src="%s(?P<imgpath>.*?)".*>""" % self.__coverPath, self.__data) + if imgMatch: + imgPath = self.__coverPath + imgMatch.group('imgpath') + # Fetch cover and gets its base64 encoded data + b64img = self.__fetchCover(imgPath) + else: + b64img = None + + # Now isolate data between <div class="bodytext">...</div> elements + # re.S sets DOTALL; it makes the "." 
special character match any character at all, including a newline + m = re.search("""<div class="bodytext">(?P<part>.*)</div>""", self.__data, re.S) + self.__data = m.group('part') + + matches = {} + data = {} + data['comments'] = [] + data['artist'] = {} + + # Default values + data['publisher'] = 'Dark Horse Comics' + data['language'] = 'English' + data['country'] = 'USA' + + data['image'] = b64img + data['pub_year'] = NULLSTRING + + for name, po in self.__regExpsPO.iteritems(): + data[name] = NULLSTRING + if name == 'desc': + matches[name] = re.findall(self.__regExps[name], self.__data, re.S | re.I) + else: + matches[name] = po.search(self.__data) + + if matches[name]: + if name == 'title': + title = matches[name].group('title').strip() + data[name] = title + # Look for issue information + m = re.search("#(?P<issue>[0-9]+)", title) + if m: + data['issue'] = m.group('issue') + else: + data['issue'] = '' + + elif name == 'pub_date': + pub_date = matches[name].group('pub_date').strip() + data['pub_year'] = pub_date[-4:] + # Add this to comments field + data['comments'].insert(0, "Pub. Date: %s" % pub_date) + + elif name == 'desc': + # Find biggest size + max = 0 + for i in range(len(matches[name])): + if len(matches[name][i]) > len(matches[name][max]): + max = i + data['comments'].append(matches[name][max].strip()) + + elif name == 'writer': + # We may find several writers + data[name] = [] + writersList = re.sub('</?a.*?>', '', matches[name].group('writer')).split(',') + for d in writersList: + data[name].append(d.strip()) + + elif name == 'cover_artist': + data['artist']['Cover Artist'] = matches[name].group('cover_artist').strip() + + elif name == 'penciller': + data['artist']['Penciller'] = matches[name].group('penciller').strip() + + elif name == 'inker': + data['artist']['Inker'] = matches[name].group('inker').strip() + + elif name == 'colorist': + data['artist']['Colorist'] = matches[name].group('colorist').strip() + + elif name == 'letterer': + data['artist']['Letterer'] = matches[name].group('letterer').strip() + + elif name == 'genre': + # We may find several genres + data[name] = [] + genresList = re.sub('</?a.*?>', '', matches[name].group('genre')).split(',') + for d in genresList: + data[name].append(d.strip()) + + elif name == 'format': + format = matches[name].group('format').strip() + data['comments'].insert(1, format) + m = re.search("(?P<pages>[0-9]+)", format) + if m: + data['pages'] = m.group('pages') + else: + data['pages'] = '' + + return data + + + def __getMovie(self, title): + if not len(title): return + + self.__title = title + self.__getHTMLContent("%s%s" % (self.__baseURL, self.__searchURL % urllib.quote(self.__title))) + + # Get all links + links = self.__fetchMovieLinks() + + # Now retrieve infos + if links: + for entry in links: + data = self.__fetchMovieInfo( url = self.__movieURL + entry[0] ) + # Add DC link (custom field) + data['darkhorse'] = "%s%s" % (self.__movieURL, entry[0]) + node = self.__domTree.addEntry(data) + # Print entries on-the-fly + #self.__domTree.printEntry(node) + else: + return None + +def halt(): + print "HALT." 
+ sys.exit(0) + +def showUsage(): + print "Usage: %s comic" % sys.argv[0] + sys.exit(1) + +def main(): + if len(sys.argv) < 2: + showUsage() + + parser = DarkHorseParser() + parser.run(sys.argv[1]) + +if __name__ == '__main__': + main() diff --git a/src/fetch/scripts/dark_horse_comics.py.spec b/src/fetch/scripts/dark_horse_comics.py.spec new file mode 100644 index 0000000..9481dc8 --- /dev/null +++ b/src/fetch/scripts/dark_horse_comics.py.spec @@ -0,0 +1,7 @@ +Name=Dark Horse Comics +Type=data-source +ArgumentKeys=1 +Arguments=%1 +CollectionType=6 +FormatType=0 +UpdateArgs=%{title} diff --git a/src/fetch/scripts/fr.allocine.py b/src/fetch/scripts/fr.allocine.py new file mode 100755 index 0000000..97a2247 --- /dev/null +++ b/src/fetch/scripts/fr.allocine.py @@ -0,0 +1,335 @@ +#!/usr/bin/env python +# -*- coding: iso-8859-1 -*- + +# *************************************************************************** +# copyright : (C) 2006 by Mathias Monnerville +# email : [email protected] +# *************************************************************************** +# +# *************************************************************************** +# * * +# * This program is free software; you can redistribute it and/or modify * +# * it under the terms of version 2 of the GNU General Public License as * +# * published by the Free Software Foundation; * +# * * +# *************************************************************************** + +# Version 0.4: 2007-08-27 +# * Fixed parsing errors: some fields in allocine's HTML pages have changed recently. Multiple actors and genres +# could not be retrieved. Fixed bad http request error due to some changes in HTML code. +# +# Version 0.3: +# * Fixed parsing: some fields in allocine's HTML pages have changed. Movie's image could not be fetched anymore. Fixed. +# +# Version 0.2: +# * Fixed parsing: allocine's HTML pages have changed. Movie's image could not be fetched anymore. +# +# Version 0.1: +# * Initial release. 
+ +import sys, os, re, md5, random +import urllib, urllib2, time, base64 +import xml.dom.minidom + +XML_HEADER = """<?xml version="1.0" encoding="UTF-8"?>""" +DOCTYPE = """<!DOCTYPE tellico PUBLIC "-//Robby Stephenson/DTD Tellico V9.0//EN" "http://periapsis.org/tellico/dtd/v9/tellico.dtd">""" + +VERSION = "0.4" + +def genMD5(): + obj = md5.new() + float = random.random() + obj.update(str(float)) + return obj.hexdigest() + +class BasicTellicoDOM: + def __init__(self): + self.__doc = xml.dom.minidom.Document() + self.__root = self.__doc.createElement('tellico') + self.__root.setAttribute('xmlns', 'http://periapsis.org/tellico/') + self.__root.setAttribute('syntaxVersion', '9') + + self.__collection = self.__doc.createElement('collection') + self.__collection.setAttribute('title', 'My Movies') + self.__collection.setAttribute('type', '3') + + self.__fields = self.__doc.createElement('fields') + # Add all default (standard) fields + self.__dfltField = self.__doc.createElement('field') + self.__dfltField.setAttribute('name', '_default') + + # Add a custom 'Collection' field + self.__customField = self.__doc.createElement('field') + self.__customField.setAttribute('name', 'titre-original') + self.__customField.setAttribute('title', 'Original Title') + self.__customField.setAttribute('flags', '8') + self.__customField.setAttribute('category', 'General') + self.__customField.setAttribute('format', '1') + self.__customField.setAttribute('type', '1') + self.__customField.setAttribute('i18n', 'yes') + + self.__fields.appendChild(self.__dfltField) + self.__fields.appendChild(self.__customField) + self.__collection.appendChild(self.__fields) + + self.__images = self.__doc.createElement('images') + + self.__root.appendChild(self.__collection) + self.__doc.appendChild(self.__root) + + # Current movie id + self.__currentId = 0 + + + def addEntry(self, movieData): + """ + Add a movie entry + """ + d = movieData + entryNode = self.__doc.createElement('entry') + entryNode.setAttribute('id', str(self.__currentId)) + + titleNode = self.__doc.createElement('title') + titleNode.appendChild(self.__doc.createTextNode(unicode(d['title'], 'latin-1').encode('utf-8'))) + + otitleNode = self.__doc.createElement('titre-original') + otitleNode.appendChild(self.__doc.createTextNode(unicode(d['otitle'], 'latin-1').encode('utf-8'))) + + yearNode = self.__doc.createElement('year') + yearNode.appendChild(self.__doc.createTextNode(unicode(d['year'], 'latin-1').encode('utf-8'))) + + genresNode = self.__doc.createElement('genres') + for g in d['genres']: + genreNode = self.__doc.createElement('genre') + genreNode.appendChild(self.__doc.createTextNode(unicode(g, 'latin-1').encode('utf-8'))) + genresNode.appendChild(genreNode) + + natsNode = self.__doc.createElement('nationalitys') + natNode = self.__doc.createElement('nat') + natNode.appendChild(self.__doc.createTextNode(unicode(d['nat'], 'latin-1').encode('utf-8'))) + natsNode.appendChild(natNode) + + castsNode = self.__doc.createElement('casts') + for g in d['actors']: + castNode = self.__doc.createElement('cast') + col1Node = self.__doc.createElement('column') + col2Node = self.__doc.createElement('column') + col1Node.appendChild(self.__doc.createTextNode(unicode(g, 'latin-1').encode('utf-8'))) + castNode.appendChild(col1Node) + castNode.appendChild(col2Node) + castsNode.appendChild(castNode) + + dirsNode = self.__doc.createElement('directors') + for g in d['dirs']: + dirNode = self.__doc.createElement('director') + dirNode.appendChild(self.__doc.createTextNode(unicode(g, 
'latin-1').encode('utf-8'))) + dirsNode.appendChild(dirNode) + + timeNode = self.__doc.createElement('running-time') + timeNode.appendChild(self.__doc.createTextNode(unicode(d['time'], 'latin-1').encode('utf-8'))) + + allocineNode = self.__doc.createElement(unicode('allocin�-link', 'latin-1').encode('utf-8')) + allocineNode.appendChild(self.__doc.createTextNode(unicode(d['allocine'], 'latin-1').encode('utf-8'))) + + plotNode = self.__doc.createElement('plot') + plotNode.appendChild(self.__doc.createTextNode(unicode(d['plot'], 'latin-1').encode('utf-8'))) + + if d['image']: + imageNode = self.__doc.createElement('image') + imageNode.setAttribute('format', 'JPEG') + imageNode.setAttribute('id', d['image'][0]) + imageNode.setAttribute('width', '120') + imageNode.setAttribute('height', '160') + imageNode.appendChild(self.__doc.createTextNode(unicode(d['image'][1], 'latin-1').encode('utf-8'))) + + coverNode = self.__doc.createElement('cover') + coverNode.appendChild(self.__doc.createTextNode(d['image'][0])) + + for name in ( 'titleNode', 'otitleNode', 'yearNode', 'genresNode', 'natsNode', + 'castsNode', 'dirsNode', 'timeNode', 'allocineNode', 'plotNode' ): + entryNode.appendChild(eval(name)) + + if d['image']: + entryNode.appendChild(coverNode) + self.__images.appendChild(imageNode) + + self.__collection.appendChild(entryNode) + + self.__currentId += 1 + + def printXML(self): + """ + Outputs XML content to stdout + """ + self.__collection.appendChild(self.__images) + print XML_HEADER; print DOCTYPE + print self.__root.toxml() + + +class AlloCineParser: + def __init__(self): + self.__baseURL = 'http://www.allocine.fr' + self.__basePath = '/film/fichefilm_gen_cfilm' + self.__searchURL= 'http://www.allocine.fr/recherche/?motcle=%s&f=3&rub=1' + self.__movieURL = self.__baseURL + self.__basePath + + # Define some regexps + self.__regExps = { 'title' : '<title>(?P<title>.+?)</title>', + 'dirs' : 'R�alis� par <a.*?>(?P<step1>.+?)</a>.*?</h4>', + 'actors' : '<h4>Avec *<a.*?>(?P<step1>.+)</a> ', + 'nat' : '<h4>Film *(?P<nat>.+?)[,\.]', + 'genres' : '<h4>Genre *: *<a.*?>(?P<step1>.+?)</a></h4>', + 'time' : '<h4>Dur�e *: *(?P<hours>[0-9])?h *(?P<mins>[0-9]{1,2})min', + 'year' : 'Ann�e de production *: *(?P<year>[0-9]{4})', + # Original movie title + 'otitle' : 'Titre original *: *<i>(?P<otitle>.+?)</i>', + 'plot' : """(?s)<td valign="top" style="padding:10 0 0 0"><div align="justify"><h4> *(?P<plot>.+?) *</h4>""", + 'image' : """<td valign="top" width="120".*?<img src="(?P<image>.+?)" border"""} + + + self.__domTree = BasicTellicoDOM() + + def run(self, title): + """ + Runs the allocine.fr parser: fetch movie related links, then fills and prints the DOM tree + to stdout (in tellico format) so that tellico can use it. 
+ """ + self.__getMovie(title) + # Print results to stdout + self.__domTree.printXML() + + def __getHTMLContent(self, url): + """ + Fetch HTML data from url + """ + + u = urllib2.urlopen(url) + self.__data = u.read() + u.close() + + def __fetchMovieLinks(self): + """ + Retrieve all links related to movie + """ + matchList = re.findall("""<h4><a *href="%s=(?P<page>.*?\.html?)" *class="link1">(?P<title>.*?)</a>""" % self.__basePath, self.__data) + if not matchList: return None + + return matchList + + def __fetchMovieInfo(self, url): + """ + Looks for movie information + """ + self.__getHTMLContent(url) + + matches = data = {} + + for name, regexp in self.__regExps.iteritems(): + if name == 'image': + matches[name] = re.findall(self.__regExps[name], self.__data, re.S | re.I) + else: + matches[name] = re.search(regexp, self.__data) + + if matches[name]: + if name == 'title': + data[name] = matches[name].group('title').strip() + elif name == 'dirs': + dirsList = re.sub('</?a.*?>', '', matches[name].group('step1')).split(',') + data[name] = [] + for d in dirsList: + data[name].append(d.strip()) + + elif name == 'actors': + actorsList = re.sub('</?a.*?>', '', matches[name].group('step1')).split(',') + data[name] = [] + for d in actorsList: + data[name].append(d.strip()) + + elif name == 'nat': + data[name] = matches[name].group('nat').strip() + + elif name == 'genres': + genresList = re.sub('</?a.*?>', '', matches[name].group('step1')).split(',') + data[name] = [] + for d in genresList: + data[name].append(d.strip()) + + elif name == 'time': + h, m = matches[name].group('hours'), matches[name].group('mins') + totmin = int(h)*60+int(m) + data[name] = str(totmin) + + elif name == 'year': + data[name] = matches[name].group('year').strip() + + elif name == 'otitle': + data[name] = matches[name].group('otitle').strip() + + elif name == 'plot': + data[name] = matches[name].group('plot').strip() + + # Image path + elif name == 'image': + # Save image to a temporary folder + md5 = genMD5() + imObj = urllib2.urlopen(matches[name][0].strip()) + img = imObj.read() + imObj.close() + imgPath = "/tmp/%s.jpeg" % md5 + try: + f = open(imgPath, 'w') + f.write(img) + f.close() + except: + # Could be great if we can pass exit code and some message + # to tellico in case of failure... + pass + + data[name] = (md5 + '.jpeg', base64.encodestring(img)) + # Delete temporary image + try: + os.remove(imgPath) + except: + # Could be great if we can pass exit code and some msg + # to tellico in case of failure... 
+ pass + else: + matches[name] = '' + + return data + + + def __getMovie(self, title): + if not len(title): return + + self.__title = title + self.__getHTMLContent(self.__searchURL % urllib.quote(self.__title)) + + # Get all links + links = self.__fetchMovieLinks() + + # Now retrieve infos + if links: + for entry in links: + data = self.__fetchMovieInfo( url = "%s=%s" % (self.__movieURL, entry[0]) ) + # Add allocine link (custom field) + data['allocine'] = "%s=%s" % (self.__movieURL, entry[0]) + self.__domTree.addEntry(data) + else: + return None + + + +def showUsage(): + print "Usage: %s movietitle" % sys.argv[0] + sys.exit(1) + +def main(): + if len(sys.argv) < 2: + showUsage() + + parser = AlloCineParser() + parser.run(sys.argv[1]) + +if __name__ == '__main__': + main() diff --git a/src/fetch/scripts/fr.allocine.py.spec b/src/fetch/scripts/fr.allocine.py.spec new file mode 100644 index 0000000..773b951 --- /dev/null +++ b/src/fetch/scripts/fr.allocine.py.spec @@ -0,0 +1,7 @@ +Name=Allocine.fr +Type=data-source +ArgumentKeys=1 +Arguments=%1 +CollectionType=3 +FormatType=0 +UpdateArgs=%{title} diff --git a/src/fetch/scripts/ministerio_de_cultura.py b/src/fetch/scripts/ministerio_de_cultura.py new file mode 100644 index 0000000..8a768f9 --- /dev/null +++ b/src/fetch/scripts/ministerio_de_cultura.py @@ -0,0 +1,595 @@ +#!/usr/bin/env python +# -*- coding: iso-8859-1 -*- + +# *************************************************************************** +# copyright : (C) 2006-2008 by Mathias Monnerville +# email : [email protected] +# *************************************************************************** +# +# *************************************************************************** +# * * +# * This program is free software; you can redistribute it and/or modify * +# * it under the terms of version 2 of the GNU General Public License as * +# * published by the Free Software Foundation; * +# * * +# *************************************************************************** + +# $Id: books_ministerio_de_cultura.py 428 2007-03-07 13:17:17Z mathias $ + +""" +This script has to be used with tellico (http://periapsis.org/tellico) as an external data source program. +It allows searching for books in Spanish Ministry of Culture's database (at http://www.mcu.es/bases/spa/isbn/ISBN.html). + +Multiple ISBN/UPC searching is supported through the -m option: + ./books_ministerio_de_cultura.py -m filename +where filename holds one ISBN or UPC per line. + +Tellico data source setup: +- Source type: External Application +- Source name: Ministerio de Cultura (ES) (or whatever you want :) +- Collection type: Book Collection +- Result type: Tellico +- Path: /path/to/script/books_ministerio_de_cultura.py +- Arguments: +Title (checked) = -t %1 +Person (checked) = -a %1 +ISBN (checked) = -i %1 +UPC (checked) = -i %1 +Update (checked) = %{title} + +** Please note that this script is also part of the Tellico's distribution. +** You will always find the latest version in the SVN trunk of Tellico + +SVN Version: + * Removes translators for Authors List + * Adds translators to translator field + * Change from "Collection" to "Series" + * Process "Series Number" + * Adds in comments "ed.lit." 
authors + * If there isn't connection to Spanish Ministry of Culture + shows a nice error message (timeout: 5 seconds) + * Removed "translated from/to" from Comments field as already + exists in "Publishing" field + * Removed "Collection" field as I moved to Series/Series Number + +Version 0.3.2: + * Now find 'notas' field related information + * search URL modified to fetch information of exhausted books too + +Version 0.3.1: +Bug Fixes: + * The 'tr.' string does not appear among authors anymore + * Fixed an AttributeError exception related to a regexp matching the number of pages + +Version 0.3: +Bug Fixes: + * URL of the search engine has changed: + http://www.mcu.es/bases/spa/isbn/ISBN.html is now http://www.mcu.es/comun/bases/isbn/ISBN.html + * All the regexps have been rewritten to match the new site's content + +Version 0.2: +New features: + * Support for multiple ISBN/UPC searching (support from command line with -m option) + * Default books collection enhanced with a new custom field 'Collection' + * Search extended for both available and exhausted books + * Hyphens are stripped out in the ISBN (or UPC) search + +Bug Fixes: + * Publication year now holds only the year + * ISBN regexp fix + * Fix for publisher field (values were inverted) + * -i parameter works for both ISBN and UPC based search + +Version 0.1: + * Initial Release +""" + +import sys, os, re, md5, random, string +import urllib, urllib2, time, base64 +import xml.dom.minidom, types +import socket + +XML_HEADER = """<?xml version="1.0" encoding="UTF-8"?>""" +DOCTYPE = """<!DOCTYPE tellico PUBLIC "-//Robby Stephenson/DTD Tellico V9.0//EN" "http://periapsis.org/tellico/dtd/v9/tellico.dtd">""" +NULLSTRING = '' + +VERSION = "0.3.2" + +ISBN, AUTHOR, TITLE = range(3) + +TRANSLATOR_STR = "tr." +EDLIT_STR = "ed. lit." + +class EngineError(Exception): pass + +class BasicTellicoDOM: + """ + This class manages tellico's XML data model (DOM) + """ + def __init__(self): + self.__doc = xml.dom.minidom.Document() + self.__root = self.__doc.createElement('tellico') + self.__root.setAttribute('xmlns', 'http://periapsis.org/tellico/') + self.__root.setAttribute('syntaxVersion', '9') + + self.__collection = self.__doc.createElement('collection') + self.__collection.setAttribute('title', 'My Books') + self.__collection.setAttribute('type', '2') + + self.__fields = self.__doc.createElement('fields') + # Add all default (standard) fields + self.__dfltField = self.__doc.createElement('field') + self.__dfltField.setAttribute('name', '_default') + + # Add a custom 'Collection' field (Left by reference for + # the future) + #self.__customCollectionField = self.__doc.createElement('field') + #self.__customCollectionField.setAttribute('name', 'book_collection') + #self.__customCollectionField.setAttribute('title', 'Collection') + #self.__customCollectionField.setAttribute('flags', '7') + #self.__customCollectionField.setAttribute('category', 'Classification') + #self.__customCollectionField.setAttribute('format', '0') + #self.__customCollectionField.setAttribute('type', '1') + #self.__customCollectionField.setAttribute('i18n', 'yes') + + + self.__fields.appendChild(self.__dfltField) + #self.__fields.appendChild(self.__customCollectionField) + self.__collection.appendChild(self.__fields) + + self.__root.appendChild(self.__collection) + self.__doc.appendChild(self.__root) + + # Current movie id. See entry's id attribute in self.addEntry() + self.__currentId = 0 + + + def addEntry(self, movieData): + """ + Add a comic entry. 
+ Returns an entry node instance + """ + + d = movieData + + # Convert all strings to UTF-8 + for i in d.keys(): + if type(d[i]) == types.ListType: + d[i] = [unicode(d[i][j], 'latin-1').encode('utf-8') for j in range(len(d[i]))] + elif type(d[i]) == types.StringType: + d[i] = unicode(d[i], 'latin-1').encode('utf-8') + + entryNode = self.__doc.createElement('entry') + entryNode.setAttribute('id', str(self.__currentId)) + + titleNode = self.__doc.createElement('title') + titleNode.appendChild(self.__doc.createTextNode(d['title'])) + + yearNode = self.__doc.createElement('pub_year') + yearNode.appendChild(self.__doc.createTextNode(d['pub_year'])) + + pubNode = self.__doc.createElement('publisher') + pubNode.appendChild(self.__doc.createTextNode(d['publisher'])) + + langsNode = self.__doc.createElement('languages') + for l in d['language']: + langNode = self.__doc.createElement('language') + langNode.appendChild(self.__doc.createTextNode(l)) + langsNode.appendChild(langNode) + + keywordsNode = self.__doc.createElement('keywords') + keywordNode = self.__doc.createElement('keyword') + keywordNode.appendChild(self.__doc.createTextNode(d['keyword'])) + keywordsNode.appendChild(keywordNode) + + edNode = self.__doc.createElement('edition') + edNode.appendChild(self.__doc.createTextNode(d['edition'])) + + writersNode = self.__doc.createElement('authors') + for g in d['author']: + writerNode = self.__doc.createElement('author') + writerNode.appendChild(self.__doc.createTextNode(g)) + writersNode.appendChild(writerNode) + + commentsNode = self.__doc.createElement('comments') + commentsData = string.join(d['comments'], '<br/>') + commentsNode.appendChild(self.__doc.createTextNode(commentsData)) + + pagesNode = self.__doc.createElement('pages') + pagesNode.appendChild(self.__doc.createTextNode(d['pages'])) + + isbnNode = self.__doc.createElement('isbn') + isbnNode.appendChild(self.__doc.createTextNode(d['isbn'])) + + priceNode = self.__doc.createElement('pur_price') + priceNode.appendChild(self.__doc.createTextNode(d['pur_price'])) + + seriesNode = self.__doc.createElement('series') + seriesNode.appendChild(self.__doc.createTextNode(d['series'])) + + seriesNumNode = self.__doc.createElement('series_num') + seriesNumNode.appendChild(self.__doc.createTextNode(d['series_num'])) + + translatorNode = self.__doc.createElement('translator') + translatorNode.appendChild(self.__doc.createTextNode(d['translator'])) + + for name in ( 'title', 'year', 'pub', 'langs', 'keyword', 'ed', 'writers', + 'comments', 'pages', 'isbn', 'price', 'series', 'seriesNum', 'translator' ): + entryNode.appendChild(eval(name + 'Node')) + + self.__collection.appendChild(entryNode) + self.__currentId += 1 + + return entryNode + + def printEntry(self, nEntry): + """ + Prints entry's XML content to stdout + """ + + try: + print nEntry.toxml() + except: + print sys.stderr, "Error while outputing XML content from entry to Tellico" + + def printXMLTree(self): + """ + Outputs XML content to stdout + """ + + print XML_HEADER; print DOCTYPE + print self.__root.toxml() + + +class MinisterioCulturaParser: + def __init__(self): + # Search form is at http://www.mcu.es/comun/bases/isbn/ISBN.html + self.__baseURL = 'http://www.mcu.es' + self.__searchURL = '/cgi-brs/BasesHTML/isbn/BRSCGI?CMD=VERLST&BASE=ISBN&DOCS=1-15&CONF=AEISPA.cnf&OPDEF=AND&SEPARADOR=' + \ + '&WDIS-C=DISPONIBLE+or+AGOTADO&WGEN-C=&WISB-C=%s&WAUT-C=%s&WTIT-C=%s&WMAT-C=&WEDI-C=&' + + self.__suffixURL = 'WFEP-C=&%40T353-GE=&%40T353-LE=&WSER-C=&WLUG-C=&WLEN-C=&WCLA-C=&WSOP-C=' + + # 
Define some regexps + self.__regExps = { 'author' : '<th scope="row">Autor:.*?<td>(?P<author>.*?)</td>', + 'isbn' : '<span class="cabTitulo">ISBN.*?<strong>(?P<isbn>.*?)</strong>', # Matches ISBN 13 + 'title' : '<th scope="row">Título:.*?<td>(?P<title>.*?)</td>', + 'language' : '<th scope="row">Lengua:.*?<td>(?P<language>.*?)</td>', + 'edition' : '<th scope="row">Edición:.*?<td>.*?<span>(?P<edition>.*?)</span>', + 'pur_price' : '<th scope="row">Precio:.*?<td>.*?<span>(?P<pur_price>.*?)€</span>', + 'desc' : '<th scope="row">Descripción:.*?<td>.*?<span>(?P<desc>.*?)</span>', + 'publication' : '<th scope="row">Publicación:.*?<td>.*?<span>(?P<publication>.*?)</span>', + 'keyword' : '<th scope="row">Materias:.*?<td>.*?<span>(?P<keywords>.*?)</span>', + 'notas' : '<th scope="row">Notas:.*?<td>.*?<span>(?P<notas>.*?)</span>', + 'cdu' : '<th scope="row">CDU:.*?<td><span>(?P<cdu>.*?)</span></td>', + 'encuadernacion': '<th scope="row">Encuadernación:.*?<td>.*?<span>(?P<encuadernacion>.*?)</span>', + 'series' : '<th scope="row">Colección:.*?<td>.*?<span>(?P<series>.*?)</span>' + } + + # Compile patterns objects + self.__regExpsPO = {} + for k, pattern in self.__regExps.iteritems(): + self.__regExpsPO[k] = re.compile(pattern) + + self.__domTree = BasicTellicoDOM() + + def run(self, criteria, kind): + """ + Runs the parser: fetch book related links, then fills and prints the DOM tree + to stdout (in tellico format) so that tellico can use it. + """ + + # Strip out hyphens if kind is ISBN + if kind == ISBN: + criteria = criteria.replace('-', NULLSTRING) + # Support for multiple search + isbnList = criteria.split(';') + for n in isbnList: + self.__getBook(n, kind) + else: + self.__getBook(criteria, kind) + + # Print results to stdout + self.__domTree.printXMLTree() + + def __getHTMLContent(self, url): + """ + Fetch HTML data from url + """ + + try: + u = urllib2.urlopen(url) + except Exception, e: + u.close() + sys.exit(""" +Network error while getting HTML content. +Tellico cannot connect to: http://www.mcu.es/comun/bases/isbn/ISBN.htm webpage: +'%s'""" % e) + + + self.__data = u.read() + u.close() + + def __fetchBookLinks(self): + """ + Retrieve all links related to the search. self.__data contains HTML content fetched by self.__getHTMLContent() + that need to be parsed. 
+ """ + + matchList = re.findall("""<div class="isbnResDescripcion">.*?<p>.*?<A target="_top" HREF="(?P<url>.*?)">""", self.__data, re.S) + + if not matchList: return None + return matchList + + def __fetchBookInfo(self, url): + """ + Looks for book information + """ + + self.__getHTMLContent(url) + + matches = {} + data = {} + + data['comments'] = [] + # Empty string if series not available + data['series_num'] = NULLSTRING + data['translator'] = NULLSTRING + + for name, po in self.__regExpsPO.iteritems(): + data[name] = NULLSTRING + matches[name] = re.search(self.__regExps[name], self.__data, re.S | re.I) + + + if matches[name]: + if name == 'title': + d = matches[name].group('title').strip() + d = re.sub('<.?strong>', NULLSTRING, d) + d = re.sub('\n', NULLSTRING, d) + data['title'] = d + + elif name == 'isbn': + data['isbn'] = matches[name].group('isbn').strip() + + elif name == 'edition': + data['edition'] = matches[name].group('edition').strip() + + elif name == 'pur_price': + d = matches[name].group('pur_price') + data['pur_price'] = d.strip() + ' EUR' + + elif name == 'publication': + d = matches[name].group('publication') + for p in ('</?[Aa].*?>', ' ', ':', ','): + d = re.sub(p, NULLSTRING, d) + + d = d.split('\n') + # d[1] is an empty string + data['publisher'] = "%s (%s)" % (d[2], d[0]) + data['pub_year'] = re.sub('\d{2}\/', NULLSTRING, d[3]) + del data['publication'] + + elif name == 'desc': + d = matches[name].group('desc') + m = re.search('\d+ ', d) + # When not available + data['pages'] = NULLSTRING + if m: + data['pages'] = m.group(0).strip() + m = re.search('; (?P<format>.*cm)', d) + if m: + data['comments'].append('Format: ' + m.group('format').strip()) + del data['desc'] + + elif name == 'encuadernacion': + data['comments'].append(matches[name].group('encuadernacion').strip()) + + elif name == 'keyword': + d = matches[name].group('keywords') + d = re.sub('</?[Aa].*?>', NULLSTRING, d) + data['keyword'] = d.strip() + + elif name == 'cdu': + data['comments'].append('CDU: ' + matches[name].group('cdu').strip()) + + elif name == 'notas': + data['comments'].append(matches[name].group('notas').strip()) + + elif name == 'series': + d = matches[name].group('series').strip() + d = re.sub(' ', ' ', d) + data[name] = d + # data[name] can contain something like 'Byblos, 162/24' + + # Maybe better to add the reg exp to get seriesNum in self.__regExps + p = re.compile('[0-9]+$') + s = re.search(p, data[name]) + + if s: + # if series ends with a number, it seems that is a + # number of the book inside the series. 
We save in seriesNum + data['series_num'] = s.group() + + # it removes lasts digits (plus one because is space or /) from + # data['series'] + l = len(data['series_num']) + 1 + data[name] = data[name][0:-l] + data[name] = data[name].rstrip(",") # remove the , between series and series_num + + elif name == 'author': + # We may find several authors + data[name] = [] + authorsList = re.findall('<a.*?>(?P<author>.*?)</a>', matches[name].group('author'), re.S | re.I) + if not authorsList: + # No href links + authors = re.search('<li>(?P<author>.*?)</li>', matches[name].group('author'), re.S | re.I) + try: + results = authors.group('author').strip().split(',') + except AttributeError: + results = [] + results = [r.strip() for r in results] + data[name] = results + else: + for d in authorsList: + # Sometimes, the search engine outputs some image between a elements + if d.strip()[:4] != '<img': + data[name].append(d.strip()) + + # Move tr authors (translators) to translators list + translator = self.__getSpecialRol(data[name], TRANSLATOR_STR) + edlit = self.__getSpecialRol(data[name], EDLIT_STR) + data[name] = self.__removeSpecialsFromAuthors(data[name], translator, TRANSLATOR_STR) + data[name] = self.__removeSpecialsFromAuthors(data[name], edlit, EDLIT_STR) + + if len(translator) > 0: + data['translator'] = self.__formatSpecials(translator, NULLSTRING) + + if len(edlit) > 0: + data['comments'].append(self.__formatSpecials(edlit, "Editor Literario: ")) + + elif name == 'language': + # We may find several languages + d = matches[name].group('language') + d = re.sub('\n', NULLSTRING, d) + d = d.split('<span>') + a = [] + for lg in d: + if len(lg): + lg = re.sub('</span>', NULLSTRING, lg) + # Because HTML is not interpreted in the 'language' field of Tellico + lg = re.sub('ó', 'o', lg) + a.append(lg.strip()) + # Removes that word so that only the language name remains. + a[0] = re.sub('publicacion: ', NULLSTRING, a[0]) + data['language'] = a + # Add other language related info to the 'comments' field too + #for lg in a[1:]: + #data['comments'].append(lg) + + return data + + + def __getBook(self, data, kind = ISBN): + if not len(data): + raise EngineError, "No data given. Unable to proceed." + + if kind == ISBN: + self.__getHTMLContent("%s%s%s" % (self.__baseURL, self.__searchURL % \ + (urllib.quote(data), # ISBN + NULLSTRING, # AUTHOR + NULLSTRING), # TITLE + self.__suffixURL) + ) + elif kind == AUTHOR: + self.__getHTMLContent("%s%s%s" % (self.__baseURL, self.__searchURL % \ + (NULLSTRING, # ISBN + urllib.quote(data), # AUTHOR + NULLSTRING), # TITLE + self.__suffixURL) + ) + + elif kind == TITLE: + self.__getHTMLContent("%s%s%s" % (self.__baseURL, self.__searchURL % \ + (NULLSTRING, # ISBN + NULLSTRING, # AUTHOR + urllib.quote(data)), # TITLE + self.__suffixURL) + ) + + # Get all links + links = self.__fetchBookLinks() + + # Now retrieve infos + if links: + for entry in links: + data = self.__fetchBookInfo( url = self.__baseURL + entry.replace(' ', '%20') ) + node = self.__domTree.addEntry(data) + else: + return None + + def __getSpecialRol(self, authors, special): + """ + Receives a list like ['Stephen King','Lorenzo Cortina','tr.', + 'Rosal�a V�zquez','tr.'] and returns a list with special names + """ + + j = 0; max = len(authors) + special_rol = [] + while j < max: + if authors[j] == special: + special_rol.append(authors[j-1]) + j += 1 + + return special_rol + + def __removeSpecialsFromAuthors(self, authors, specials, string): + """ + Receives a list with authors+translators and removes 'tr.' 
and + authors from there. Example: + authors: ['Stephen King','Lorenzo Cortina','tr.','Rosal�a V�zquez','tr.'] + translators: ['Lorenzo Cortina','Rosal�a V�zquez'] + returns: ['Stephen King'] + + (We could also guess string value because is the next position + in authors list) + """ + + newauthors = authors[:] + + for t in specials: + newauthors.remove(t) + newauthors.remove(string) + + return newauthors + + def __formatSpecials(self, translators, prefix): + """ + Receives a list with translators and returns a string + (authors are handled different: each author in a different node) + """ + + return prefix + string.join(translators, '; ') + +def halt(): + print "HALT." + sys.exit(0) + +def showUsage(): + print """Usage: %s options +Where options are: + -t title + -i (ISBN|UPC) + -a author + -m filename (support for multiple ISBN/UPC search)""" % sys.argv[0] + sys.exit(1) + +def main(): + if len(sys.argv) < 3: + showUsage() + + socket.setdefaulttimeout(5) + + # ;-separated ISBNs string + isbnStringList = NULLSTRING + + opts = {'-t' : TITLE, '-i' : ISBN, '-a' : AUTHOR, '-m' : isbnStringList} + if sys.argv[1] not in opts.keys(): + showUsage() + + if sys.argv[1] == '-m': + try: + f = open(sys.argv[2], 'r') + data = f.readlines() + # remove trailing \n + sys.argv[2] = string.join([d[:-1] for d in data], ';') + sys.argv[1] = '-i' + f.close() + except IOError, e: + print "Error: %s" % e + sys.exit(1) + + parser = MinisterioCulturaParser() + parser.run(sys.argv[2], opts[sys.argv[1]]) + +if __name__ == '__main__': + main() diff --git a/src/fetch/scripts/ministerio_de_cultura.py.spec b/src/fetch/scripts/ministerio_de_cultura.py.spec new file mode 100644 index 0000000..ef24ac5 --- /dev/null +++ b/src/fetch/scripts/ministerio_de_cultura.py.spec @@ -0,0 +1,7 @@ +Name=Spanish Ministry of Culture +Type=data-source +ArgumentKeys=1,2,3,4 +Arguments=-t %1,-a %1,-i %1,-i %1 +CollectionType=2 +FormatType=0 +UpdateArgs=-t %{title} |