#!/usr/bin/env python
# -*- coding: iso-8859-1 -*-

# ***************************************************************************
#    copyright            : (C) 2006 by Mathias Monnerville
#    email                : tellico@monnerville.com
# ***************************************************************************
#
# ***************************************************************************
# *                                                                         *
# *   This program is free software; you can redistribute it and/or modify  *
# *   it under the terms of version 2 of the GNU General Public License as  *
# *   published by the Free Software Foundation;                            *
# *                                                                         *
# ***************************************************************************

# Version 0.4: 2007-08-27
# * Fixed parsing errors: some fields in allocine's HTML pages have changed recently. Multiple actors and genres
# could not be retrieved. Fixed bad http request error due to some changes in HTML code.
#
# Version 0.3:
# * Fixed parsing: some fields in allocine's HTML pages have changed. Movie's image could not be fetched anymore. Fixed.
#
# Version 0.2:
# * Fixed parsing: allocine's HTML pages have changed. Movie's image could not be fetched anymore.
#
# Version 0.1:
# * Initial release.
import sys, os, re, md5, random import urllib, urllib2, time, base64 import xml.dom.minidom XML_HEADER = """""" DOCTYPE = """""" VERSION = "0.4" def genMD5(): obj = md5.new() float = random.random() obj.update(str(float)) return obj.hexdigest() class BasicTellicoDOM: def __init__(self): self.__doc = xml.dom.minidom.Document() self.__root = self.__doc.createElement('tellico') self.__root.setAttribute('xmlns', 'http://periapsis.org/tellico/') self.__root.setAttribute('syntaxVersion', '9') self.__collection = self.__doc.createElement('collection') self.__collection.setAttribute('title', 'My Movies') self.__collection.setAttribute('type', '3') self.__fields = self.__doc.createElement('fields') # Add all default (standard) fields self.__dfltField = self.__doc.createElement('field') self.__dfltField.setAttribute('name', '_default') # Add a custom 'Collection' field self.__customField = self.__doc.createElement('field') self.__customField.setAttribute('name', 'titre-original') self.__customField.setAttribute('title', 'Original Title') self.__customField.setAttribute('flags', '8') self.__customField.setAttribute('category', 'General') self.__customField.setAttribute('format', '1') self.__customField.setAttribute('type', '1') self.__customField.setAttribute('i18n', 'yes') self.__fields.appendChild(self.__dfltField) self.__fields.appendChild(self.__customField) self.__collection.appendChild(self.__fields) self.__images = self.__doc.createElement('images') self.__root.appendChild(self.__collection) self.__doc.appendChild(self.__root) # Current movie id self.__currentId = 0 def addEntry(self, movieData): """ Add a movie entry """ d = movieData entryNode = self.__doc.createElement('entry') entryNode.setAttribute('id', str(self.__currentId)) titleNode = self.__doc.createElement('title') titleNode.appendChild(self.__doc.createTextNode(unicode(d['title'], 'latin-1').encode('utf-8'))) otitleNode = self.__doc.createElement('titre-original') 
otitleNode.appendChild(self.__doc.createTextNode(unicode(d['otitle'], 'latin-1').encode('utf-8'))) yearNode = self.__doc.createElement('year') yearNode.appendChild(self.__doc.createTextNode(unicode(d['year'], 'latin-1').encode('utf-8'))) genresNode = self.__doc.createElement('genres') for g in d['genres']: genreNode = self.__doc.createElement('genre') genreNode.appendChild(self.__doc.createTextNode(unicode(g, 'latin-1').encode('utf-8'))) genresNode.appendChild(genreNode) natsNode = self.__doc.createElement('nationalitys') natNode = self.__doc.createElement('nat') natNode.appendChild(self.__doc.createTextNode(unicode(d['nat'], 'latin-1').encode('utf-8'))) natsNode.appendChild(natNode) castsNode = self.__doc.createElement('casts') for g in d['actors']: castNode = self.__doc.createElement('cast') col1Node = self.__doc.createElement('column') col2Node = self.__doc.createElement('column') col1Node.appendChild(self.__doc.createTextNode(unicode(g, 'latin-1').encode('utf-8'))) castNode.appendChild(col1Node) castNode.appendChild(col2Node) castsNode.appendChild(castNode) dirsNode = self.__doc.createElement('directors') for g in d['dirs']: dirNode = self.__doc.createElement('director') dirNode.appendChild(self.__doc.createTextNode(unicode(g, 'latin-1').encode('utf-8'))) dirsNode.appendChild(dirNode) timeNode = self.__doc.createElement('running-time') timeNode.appendChild(self.__doc.createTextNode(unicode(d['time'], 'latin-1').encode('utf-8'))) allocineNode = self.__doc.createElement(unicode('allociné-link', 'latin-1').encode('utf-8')) allocineNode.appendChild(self.__doc.createTextNode(unicode(d['allocine'], 'latin-1').encode('utf-8'))) plotNode = self.__doc.createElement('plot') plotNode.appendChild(self.__doc.createTextNode(unicode(d['plot'], 'latin-1').encode('utf-8'))) if d['image']: imageNode = self.__doc.createElement('image') imageNode.setAttribute('format', 'JPEG') imageNode.setAttribute('id', d['image'][0]) imageNode.setAttribute('width', '120') 
imageNode.setAttribute('height', '160') imageNode.appendChild(self.__doc.createTextNode(unicode(d['image'][1], 'latin-1').encode('utf-8'))) coverNode = self.__doc.createElement('cover') coverNode.appendChild(self.__doc.createTextNode(d['image'][0])) for name in ( 'titleNode', 'otitleNode', 'yearNode', 'genresNode', 'natsNode', 'castsNode', 'dirsNode', 'timeNode', 'allocineNode', 'plotNode' ): entryNode.appendChild(eval(name)) if d['image']: entryNode.appendChild(coverNode) self.__images.appendChild(imageNode) self.__collection.appendChild(entryNode) self.__currentId += 1 def printXML(self): """ Outputs XML content to stdout """ self.__collection.appendChild(self.__images) print XML_HEADER; print DOCTYPE print self.__root.toxml() class AlloCineParser: def __init__(self): self.__baseURL = 'http://www.allocine.fr' self.__basePath = '/film/fichefilm_gen_cfilm' self.__searchURL= 'http://www.allocine.fr/recherche/?motcle=%s&f=3&rub=1' self.__movieURL = self.__baseURL + self.__basePath # Define some regexps self.__regExps = { 'title' : '