summary | refs | log | tree | commit | diff | stats
path: root/src/fetch/scripts/fr.allocine.py
diff options
context:
space:
mode:
Diffstat (limited to 'src/fetch/scripts/fr.allocine.py')
-rwxr-xr-x src/fetch/scripts/fr.allocine.py 335
1 files changed, 335 insertions, 0 deletions
diff --git a/src/fetch/scripts/fr.allocine.py b/src/fetch/scripts/fr.allocine.py
new file mode 100755
index 0000000..97a2247
--- /dev/null
+++ b/src/fetch/scripts/fr.allocine.py
@@ -0,0 +1,335 @@
+#!/usr/bin/env python
+# -*- coding: iso-8859-1 -*-
+
+# ***************************************************************************
+# copyright : (C) 2006 by Mathias Monnerville
+# ***************************************************************************
+#
+# ***************************************************************************
+# * *
+# * This program is free software; you can redistribute it and/or modify *
+# * it under the terms of version 2 of the GNU General Public License as *
+# * published by the Free Software Foundation; *
+# * *
+# ***************************************************************************
+
+# Version 0.4: 2007-08-27
+# * Fixed parsing errors: some fields in allocine's HTML pages have changed recently. Multiple actors and genres
+# could not be retrieved. Fixed bad http request error due to some changes in HTML code.
+#
+# Version 0.3:
+# * Fixed parsing: some fields in allocine's HTML pages have changed. Movie's image could not be fetched anymore. Fixed.
+#
+# Version 0.2:
+# * Fixed parsing: allocine's HTML pages have changed. Movie's image could not be fetched anymore.
+#
+# Version 0.1:
+# * Initial release.
+
+import sys, os, re, md5, random
+import urllib, urllib2, time, base64
+import xml.dom.minidom
+
# Prolog lines printed verbatim ahead of the generated Tellico document.
XML_HEADER = '<?xml version="1.0" encoding="UTF-8"?>'
DOCTYPE = (
    '<!DOCTYPE tellico PUBLIC "-//Robby Stephenson/DTD Tellico V9.0//EN" '
    '"http://periapsis.org/tellico/dtd/v9/tellico.dtd">'
)

# Release number of this script (see the changelog in the file header).
VERSION = '0.4'
+
def genMD5():
    """
    Return the MD5 hex digest (32 lowercase hex characters) of a random number.

    The digest only serves as a quasi-unique identifier for fetched images;
    it is not meant to be cryptographically strong.
    """
    # hashlib replaces the deprecated 'md5' module; imported locally so the
    # function does not depend on the file-level import list.
    import hashlib

    seed = str(random.random())
    # The seed is plain ASCII digits; encoding keeps hashlib happy on
    # interpreters where str is a text type.
    return hashlib.md5(seed.encode('ascii')).hexdigest()
+
class BasicTellicoDOM:
    """
    Minimal builder for a Tellico (syntaxVersion 9) movie collection document.

    Movies are added one at a time with addEntry(); printXML() writes the
    finished document to stdout.
    """

    def __init__(self):
        self.__doc = xml.dom.minidom.Document()
        self.__root = self.__doc.createElement('tellico')
        self.__root.setAttribute('xmlns', 'http://periapsis.org/tellico/')
        self.__root.setAttribute('syntaxVersion', '9')

        self.__collection = self.__doc.createElement('collection')
        self.__collection.setAttribute('title', 'My Movies')
        self.__collection.setAttribute('type', '3')

        self.__fields = self.__doc.createElement('fields')
        # Add all default (standard) fields
        self.__dfltField = self.__doc.createElement('field')
        self.__dfltField.setAttribute('name', '_default')

        # Add a custom field holding the movie's original title
        self.__customField = self.__doc.createElement('field')
        self.__customField.setAttribute('name', 'titre-original')
        self.__customField.setAttribute('title', 'Original Title')
        self.__customField.setAttribute('flags', '8')
        self.__customField.setAttribute('category', 'General')
        self.__customField.setAttribute('format', '1')
        self.__customField.setAttribute('type', '1')
        self.__customField.setAttribute('i18n', 'yes')

        self.__fields.appendChild(self.__dfltField)
        self.__fields.appendChild(self.__customField)
        self.__collection.appendChild(self.__fields)

        # Image data is collected here and attached to the collection only
        # when the document is printed, so it ends up after all entries.
        self.__images = self.__doc.createElement('images')

        self.__root.appendChild(self.__collection)
        self.__doc.appendChild(self.__root)

        # Current movie id
        self.__currentId = 0

    def __utf8(self, value):
        """
        Re-encode a latin-1 byte string as a UTF-8 byte string.
        """
        return unicode(value, 'latin-1').encode('utf-8')

    def __textElement(self, tag, value):
        """
        Create <tag> containing value (latin-1 input, stored as UTF-8).
        """
        node = self.__doc.createElement(tag)
        node.appendChild(self.__doc.createTextNode(self.__utf8(value)))
        return node

    def __listElement(self, groupTag, itemTag, values):
        """
        Create <groupTag> holding one <itemTag> text element per value.
        """
        groupNode = self.__doc.createElement(groupTag)
        for value in values:
            groupNode.appendChild(self.__textElement(itemTag, value))
        return groupNode

    def addEntry(self, movieData):
        """
        Add a movie entry.

        movieData is a dict with these keys:
          'title', 'otitle', 'year', 'nat', 'time', 'allocine', 'plot':
              latin-1 byte strings
          'genres', 'actors', 'dirs': iterables of latin-1 byte strings
          'image': a false value, or a pair (image id, base64 image data)

        A missing key raises KeyError. Each call uses the next entry id,
        starting from 0.
        """
        d = movieData
        entryNode = self.__doc.createElement('entry')
        entryNode.setAttribute('id', str(self.__currentId))

        titleNode = self.__textElement('title', d['title'])
        otitleNode = self.__textElement('titre-original', d['otitle'])
        yearNode = self.__textElement('year', d['year'])
        genresNode = self.__listElement('genres', 'genre', d['genres'])

        natsNode = self.__doc.createElement('nationalitys')
        natsNode.appendChild(self.__textElement('nat', d['nat']))

        # Each cast row has two columns; only the first (the actor's name)
        # is filled in, the second is left empty.
        castsNode = self.__doc.createElement('casts')
        for actor in d['actors']:
            castNode = self.__doc.createElement('cast')
            castNode.appendChild(self.__textElement('column', actor))
            castNode.appendChild(self.__doc.createElement('column'))
            castsNode.appendChild(castNode)

        dirsNode = self.__listElement('directors', 'director', d['dirs'])
        timeNode = self.__textElement('running-time', d['time'])

        # The element name is 'allocine-link' spelled with an e-acute; the
        # \xe9 escape yields the latin-1 byte whatever encoding this file is
        # saved in.
        allocineNode = self.__textElement(self.__utf8('allocin\xe9-link'), d['allocine'])
        plotNode = self.__textElement('plot', d['plot'])

        image = d['image']
        if image:
            imageId, imageData = image[0], image[1]
            imageNode = self.__textElement('image', imageData)
            imageNode.setAttribute('format', 'JPEG')
            imageNode.setAttribute('id', imageId)
            imageNode.setAttribute('width', '120')
            imageNode.setAttribute('height', '160')

            coverNode = self.__doc.createElement('cover')
            coverNode.appendChild(self.__doc.createTextNode(imageId))

        # Append the nodes directly instead of looking them up by name
        # through eval().
        for node in (titleNode, otitleNode, yearNode, genresNode, natsNode,
                     castsNode, dirsNode, timeNode, allocineNode, plotNode):
            entryNode.appendChild(node)

        if image:
            entryNode.appendChild(coverNode)
            self.__images.appendChild(imageNode)

        self.__collection.appendChild(entryNode)

        self.__currentId += 1

    def printXML(self):
        """
        Outputs XML content to stdout
        """
        self.__collection.appendChild(self.__images)
        # Single-argument parenthesised print behaves the same under the
        # Python 2 print statement and the Python 3 print function.
        print(XML_HEADER)
        print(DOCTYPE)
        print(self.__root.toxml())
+
+
class AlloCineParser:
    """
    Searches allocine.fr for a movie title, scrapes every matching movie page
    with regular expressions and collects the results in a BasicTellicoDOM.
    """

    def __init__(self):
        self.__baseURL = 'http://www.allocine.fr'
        self.__basePath = '/film/fichefilm_gen_cfilm'
        self.__searchURL = 'http://www.allocine.fr/recherche/?motcle=%s&f=3&rub=1'
        self.__movieURL = self.__baseURL + self.__basePath

        # Define some regexps.
        # Accented letters are written as \xe9 (latin-1 e-acute) so the
        # patterns do not depend on the encoding this file is saved in.
        self.__regExps = {
            'title': '<title>(?P<title>.+?)</title>',
            'dirs': 'R\xe9alis\xe9 par <a.*?>(?P<step1>.+?)</a>.*?</h4>',
            'actors': '<h4>Avec *<a.*?>(?P<step1>.+)</a> &nbsp;',
            'nat': r'<h4>Film *(?P<nat>.+?)[,\.]',
            'genres': '<h4>Genre *: *<a.*?>(?P<step1>.+?)</a></h4>',
            'time': '<h4>Dur\xe9e *: *(?P<hours>[0-9])?h *(?P<mins>[0-9]{1,2})min',
            'year': 'Ann\xe9e de production *: *(?P<year>[0-9]{4})',
            # Original movie title
            'otitle': 'Titre original *: *<i>(?P<otitle>.+?)</i>',
            'plot': """(?s)<td valign="top" style="padding:10 0 0 0"><div align="justify"><h4> *(?P<plot>.+?) *</h4>""",
            'image': """<td valign="top" width="120".*?<img src="(?P<image>.+?)" border""",
        }

        self.__domTree = BasicTellicoDOM()

    def run(self, title):
        """
        Runs the allocine.fr parser: fetch movie related links, then fills and prints the DOM tree
        to stdout (in tellico format) so that tellico can use it.
        """
        self.__getMovie(title)
        # Print results to stdout
        self.__domTree.printXML()

    def __getHTMLContent(self, url):
        """
        Fetch HTML data from url and keep it in self.__data
        """
        u = urllib2.urlopen(url)
        try:
            self.__data = u.read()
        finally:
            # Release the connection even if reading fails.
            u.close()

    def __fetchMovieLinks(self):
        """
        Retrieve all links related to movie.

        Returns a list of (page, title) tuples found in the last fetched
        page, or None when there is no match.
        """
        pattern = r"""<h4><a *href="%s=(?P<page>.*?\.html?)" *class="link1">(?P<title>.*?)</a>""" % self.__basePath
        matchList = re.findall(pattern, self.__data)
        return matchList or None

    def __splitLinkList(self, html):
        """
        Strip <a> tags from a comma-separated list of links and return the
        trimmed items.
        """
        return [item.strip() for item in re.sub('</?a.*?>', '', html).split(',')]

    def __fetchImage(self, url):
        """
        Download the image at url.

        Returns a pair (generated image id, base64-encoded image data). The
        data is encoded straight from memory; no temporary file is involved.
        """
        imObj = urllib2.urlopen(url)
        try:
            img = imObj.read()
        finally:
            imObj.close()
        return (genMD5() + '.jpeg', base64.encodestring(img))

    def __fetchMovieInfo(self, url):
        """
        Looks for movie information.

        Fetches url and returns a dict with one key per entry of the regexp
        table. A field whose regexp does not match is set to '' (which also
        acts as an empty list for the multi-valued fields and as "no image").
        """
        self.__getHTMLContent(url)

        data = {}

        for name, regexp in self.__regExps.items():
            if name == 'image':
                found = re.findall(regexp, self.__data, re.S | re.I)
            else:
                found = re.search(regexp, self.__data)

            if not found:
                data[name] = ''
                continue

            if name in ('dirs', 'actors', 'genres'):
                data[name] = self.__splitLinkList(found.group('step1'))

            elif name == 'time':
                # The hours group is optional in the regexp; treat a missing
                # value as zero hours instead of failing on int(None).
                hours = int(found.group('hours') or 0)
                mins = int(found.group('mins'))
                data[name] = str(hours * 60 + mins)

            elif name == 'image':
                # Only the first image found on the page is used.
                data[name] = self.__fetchImage(found[0].strip())

            else:
                # 'title', 'nat', 'year', 'otitle' and 'plot': the capture
                # group has the same name as the field.
                data[name] = found.group(name).strip()

        return data

    def __getMovie(self, title):
        """
        Search for title and add one DOM entry per movie link found.
        Does nothing for an empty title or when the search has no result.
        """
        if not title:
            return

        self.__title = title
        self.__getHTMLContent(self.__searchURL % urllib.quote(self.__title))

        # Get all links
        links = self.__fetchMovieLinks()
        if not links:
            return None

        # Now retrieve infos
        for entry in links:
            movieURL = "%s=%s" % (self.__movieURL, entry[0])
            data = self.__fetchMovieInfo(url=movieURL)
            # Add allocine link (custom field)
            data['allocine'] = movieURL
            self.__domTree.addEntry(data)
+
+
+
def showUsage():
    """
    Write the command-line synopsis to stdout, then exit with status 1.
    """
    usage = "Usage: %s movietitle" % sys.argv[0]
    sys.stdout.write(usage + "\n")
    raise SystemExit(1)
+
def main():
    """
    Script entry point: look up the movie title given as the first
    command-line argument; show the usage text and exit when it is missing.
    """
    args = sys.argv[1:]
    if not args:
        showUsage()

    AlloCineParser().run(args[0])


if __name__ == '__main__':
    main()