# -*- coding: utf-8 -*-

# parser.py
# Version : 0.
# This file is part of Qarte+7
#
# Author : Vincent Vande Vyvre
# Copyright: 2009-2010 Adrien Beudin beudbeud@gmail.com
# Copyright: 2011 Vincent Vande Vyvre
# Licence: GPL3
# Home page : https://code.launchpad.net/~vincent-vandevyvre/+junk/Qarte+7
#

import os
import re
import sys
import time
import urllib2
import xml.dom.minidom

import BeautifulSoup as BS


def unescape_html(text):
    """Return `text` with its HTML entities converted to characters.

    The text is parsed with BeautifulStoneSoup and the first node of the
    parsed document is returned; indexing that first node raises if the
    parser produced no node at all.
    """
    soup = BS.BeautifulStoneSoup(
        text, convertEntities=BS.BeautifulStoneSoup.HTML_ENTITIES)
    return soup.contents[0]


# A broadcast time such as "20h15" or "20:15".
time_re = re.compile(r"^\d\d[h:]\d\d$")

# Month-name prefixes: entries 0-11 and entries 12-23 each cover January to
# December (French spellings first, then German ones).  The order matters
# because matching uses startswith() and stops at the first hit.
monthes = [u"ja", u"f", u"mar", u"av", u"mai", u"juin", u"juil", u"ao",
           u"se", u"oc", u"no", u"d",
           u"Ja", u"F", u"Mä", u"Ap", u"Mai", u"Jun", u"Jul", u"Au",
           u"Se", u"Ok", u"No", u"De"]


def parse_date(date_str):
    """Convert a date label into the string "YYYY MM DD, <time>".

    `date_str` is split on commas.  The last field must be a time matching
    `time_re`.  The first field may be u"Aujourd'hui"/u"Heute" (today) or
    u"Hier"/u"Gestern" (yesterday); otherwise the second field is read as
    "<day> <month name> <year>".

    Returns an empty string when the label does not end with a time or when
    the date part cannot be split into day, month and year.  A month name
    that matches no entry of `monthes` is kept verbatim.
    """
    if not date_str:
        return ""
    fields = date_str.split(",")
    time_ = fields[-1].strip()
    if time_re.search(time_) is None:
        return ""

    keyword = fields[0].strip()
    if keyword in (u"Aujourd'hui", u"Heute"):
        date_ = time.strftime("%Y %m %d")
    elif keyword in (u"Hier", u"Gestern"):
        yesterday = time.localtime(time.time() - (24 * 60 * 60))
        date_ = time.strftime("%Y %m %d", yesterday)
    else:
        # A label holding only a time (no date field) used to raise
        # IndexError here; report it as unparseable instead.
        if len(fields) < 2:
            return ""
        parts = fields[1].split()
        if len(parts) < 3:
            return ""
        day = parts[0].strip(".")
        if day.isdigit():
            # Zero-pad so the result has the same shape as the "%d" output
            # of the today/yesterday branches.
            day = day.zfill(2)
        month = parts[1]
        for idx, prefix in enumerate(monthes):
            if month.startswith(prefix):
                # Both halves of `monthes` map onto months 1..12.
                month = "%02d" % (idx % 12 + 1)
                break
        year = parts[2]
        date_ = "%s %s %s" % (year, month, day)
    return date_ + ", " + time_


class Parser(object):
    """Scrape the video catalogue from the videos.arte.tv web pages.

    Instantiating the class downloads and parses the pages.  Afterwards
    `videos` holds one dict per video found and `error` is either False or
    a message describing the stage that failed.
    """

    # Constants
    ARTE_WEB_ROOT = 'http://videos.arte.tv'
    INDEX_TAG = 'index'
    TITLE_TAG = 'bigTitle'
    DATE_TAG = 'startDate'
    URL_TAG = 'targetURL'
    MMS_TAG = 'mmsURL'
    RESUME_TAG = 'resume'
    IMAGE_TAG = 'previewPictureURL'

    # Catalogue content.  Kept at class level for backward compatibility;
    # each instance gets its own list in __init__.
    videos = []

    # Regular expressions used to search the web pages
    xmlRE = re.compile(r'xmlURL", "(.*\.xml)"')
    wmvRE = re.compile(r'availableFormats.*=.*"(.*HQ.*wmv.*)"')
    mmsRE = re.compile(r'"(mms.*)"')
    # NOTE(review): in the reviewed copy this literal was broken across
    # lines (a line break and a blank line follow the opening quote), which
    # suggests an HTML tag was stripped from the pattern -- confirm the
    # intended text against the upstream file.
    resumeRE = re.compile('\n\n([^<]*)<')

    def __init__(self, l):
        """Build the catalogue for one language.

        Args:
            l: language code inserted in the site URLs as "/<l>/"
               (presumably "fr" or "de" -- confirm against the caller).

        Exceptions raised while downloading the first page propagate to
        the caller; later failures are reported through `self.error`.
        """
        lang = "/%s/" % l
        max_video_displayed = 200
        self.error = False
        # Own list per instance: appending to the class-level list made
        # every Parser share and accumulate the same catalogue.
        self.videos = []

        # Stage 1: load the base page in order to get the page URL with
        # the correct index.
        base_page_url = self.ARTE_WEB_ROOT + lang + "videos/"
        html_content = urllib2.urlopen(base_page_url).read()
        soup = self._parse_tolerant(html_content)
        if soup is None:
            return

        # Stage 2: look for the URL of the thumbnail view.
        self.error = False
        try:
            wall_path = self._find_wall_path(soup)
            if wall_path is not None:
                base_page_url = self.ARTE_WEB_ROOT + wall_path \
                    + "?hash=" + lang.replace('/', '') + "/thumb///1/" \
                    + str(max_video_displayed) + "/"
        except Exception as why:
            self.error = "Error stage 2, URL not found : %s" % why
            return

        # Stage 3: read the video list.  When stage 2 found nothing the
        # base page itself is parsed.
        try:
            html_content = urllib2.urlopen(base_page_url).read()
            soup = BS.BeautifulSoup(html_content)
            for block in soup.findAll('div', {"class": "video"}):
                try:
                    video = self._extract_video(block, lang)
                except Exception:
                    # Best effort: a video block that cannot be read is
                    # skipped, the others are still collected.
                    continue
                self.videos.append(video)
        except Exception as why:
            self.error = "Error in stage 3 : %s" % why

    def _parse_tolerant(self, html_content):
        """Parse a page, removing the lines the parser reports as malformed.

        Returns the parsed document, or None (with `self.error` set) after
        30 failures, on an error that is not a "malformed" one, or when the
        offending line number cannot be read from the error message.
        """
        count = 0
        while True:
            try:
                return BS.BeautifulSoup(html_content)
            except Exception as why:
                self.error = ("Error in stage 1 : BeautifulSoup can't read "
                              "the page\n %s" % why)
                count += 1
                print("%s error(s) : %s" % (count, why))
                # 30 badly formatted lines, or an unrelated error: abort.
                if count == 30 or "malformed" not in str(why):
                    return None
                try:
                    # The line number is the last word of the second to
                    # last comma-separated field of the message.
                    fields = str(why).split(",")
                    line_no = int(fields[-2].split(" ")[-1])
                    bad_line = html_content.split("\n")[line_no - 1]
                except (IndexError, ValueError):
                    return None
                # Badly formatted line, remove it and try again.
                html_content = html_content.replace(bad_line, "")

    @staticmethod
    def _find_wall_path(soup):
        """Return the thumbnail-view path found in the page scripts.

        Searches the script content mentioning "videowallSettings" for the
        first word containing "asThumbnail" and returns it without double
        quotes, or None when there is no such word.
        """
        for script in soup.findAll('script'):
            for text in script:
                if "videowallSettings" not in text:
                    continue
                # There are 4 different URLs, we want the one that
                # displays thumbnails.
                for word in text.split():
                    if "asThumbnail" in word:
                        return word.replace('"', '')
        return None

    def _extract_video(self, block, lang):
        """Return the dict describing the video of one div of class "video".

        Only the keys whose source element exists in `block` are set.
        """
        video = dict()
        for heading in block.findAll('h2'):
            for link in heading.findAll('a'):
                target = self.ARTE_WEB_ROOT + link['href']
                video[self.URL_TAG] = target.replace("/fr/", lang)
                video[self.TITLE_TAG] = unescape_html(link.string)

        for para in block.findAll('p'):
            text = para.string
            # `'class' in para` would look among the children of the tag,
            # not its attributes, so the attribute is read with get().
            if para.get('class') == 'teaserText':
                # NOTE(review): stored under 'summary' although RESUME_TAG
                # is 'resume' -- check which key the consumers read.
                video['summary'] = text
            elif text and not text.endswith(("vues", "Aufrufe")):
                # `text` is None when the paragraph holds nested markup;
                # paragraphs ending with "vues"/"Aufrufe" are skipped too.
                video[self.DATE_TAG] = parse_date(text)

        # Get thumbnail image
        for thumb in block.findAll('img', {"class": "thumbnail"}):
            picture = self.ARTE_WEB_ROOT + thumb['src']
            video[self.IMAGE_TAG] = picture.replace("/fr/", lang)
        return video


# Debugging aid kept from the original file (was a disabled string block at
# the end of Parser.__init__):
#
# with open('datas.txt', 'w') as dat:
#     for idx, video in enumerate(self.videos):
#         print type(video['bigTitle'])
#         txt = "%s %s %s\n" % (idx, video['bigTitle'], video['startDate'])
#         txt = txt + u"\tThumbnail: %s\n" % video['previewPictureURL']
#         txt = txt + u"\tVideo: %s\n\n" % video['targetURL']
#         dat.write(txt.encode('utf-8', 'ignore'))