X-Git-Url: http://git.cielonegro.org/gitweb.cgi?a=blobdiff_plain;f=youtube_dl%2FInfoExtractors.py;h=0fc39163ee2a74131ed84442c68f388a72fad0f1;hb=f36cd076850faf4b2859a168fcb740dfccb9eed6;hp=c9c563599ea1782b313fbb69dcd60a998fbe4583;hpb=d77c3dfd027e9af4d44fc7109fac0012451268c2;p=youtube-dl.git diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index c9c563599..0fc39163e 100644 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -12,29 +12,15 @@ import time import urllib import urllib2 import email.utils +import xml.etree.ElementTree +from urlparse import parse_qs try: import cStringIO as StringIO except ImportError: import StringIO -# parse_qs was moved from the cgi module to the urlparse module recently. -try: - from urlparse import parse_qs -except ImportError: - from cgi import parse_qs - -try: - import lxml.etree -except ImportError: - pass # Handled below - -try: - import xml.etree.ElementTree -except ImportError: # Python<2.5: Not officially supported, but let it slip - warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.') - -from Utils import * +from utils import * class InfoExtractor(object): @@ -53,7 +39,6 @@ class InfoExtractor(object): url: Final video URL. uploader: Nickname of the video uploader. title: Literal title. - stitle: Simplified title. ext: Video filename extension. format: Video format. player_url: SWF Player URL (may be None). @@ -117,8 +102,8 @@ class YoutubeIE(InfoExtractor): _NEXT_URL_RE = r'[\?&]next_url=([^&]+)' _NETRC_MACHINE = 'youtube' # Listed in order of quality - _available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13'] - _available_formats_prefer_free = ['38', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13'] + _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13'] + _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13'] _video_extensions = { '13': '3gp', '17': 'mp4', @@ -129,6 +114,7 @@ class YoutubeIE(InfoExtractor): '43': 'webm', '44': 'webm', '45': 'webm', + '46': 'webm', } _video_dimensions = { '5': '240x400', @@ -144,6 +130,7 @@ class YoutubeIE(InfoExtractor): '43': '360x640', '44': '480x854', '45': '720x1280', + '46': '1080x1920', } IE_NAME = u'youtube' @@ -193,8 +180,8 @@ class YoutubeIE(InfoExtractor): end = start + float(dur) start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000) end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000) - caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption) - caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption) # double cycle, inentional + caption = unescapeHTML(caption) + caption = unescapeHTML(caption) # double cycle, inentional srt += str(n) + '\n' srt += start + ' --> ' + end + '\n' srt += caption + '\n\n' @@ -339,10 +326,6 @@ class YoutubeIE(InfoExtractor): return video_title = urllib.unquote_plus(video_info['title'][0]) video_title = video_title.decode('utf-8') - video_title = sanitize_title(video_title) - - # simplified title - simple_title = simplify_title(video_title) # thumbnail image if 'thumbnail_url' not in video_info: @@ -364,49 +347,39 @@ class YoutubeIE(InfoExtractor): pass # description - try: - lxml.etree - except NameError: - video_description = u'No description available.' - mobj = re.search(r'', video_webpage) - if mobj is not None: - video_description = mobj.group(1).decode('utf-8') - else: - html_parser = lxml.etree.HTMLParser(encoding='utf-8') - vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser) - video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()')) - # TODO use another parser + video_description = get_element_by_id("eow-description", video_webpage.decode('utf8')) + if video_description: video_description = clean_html(video_description) + else: video_description = '' # closed captions video_subtitles = None if self._downloader.params.get('writesubtitles', False): - self.report_video_subtitles_download(video_id) - request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id) try: - srt_list = urllib2.urlopen(request).read() - except (urllib2.URLError, httplib.HTTPException, socket.error), err: - self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err)) - else: + self.report_video_subtitles_download(video_id) + request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id) + try: + srt_list = urllib2.urlopen(request).read() + except (urllib2.URLError, httplib.HTTPException, socket.error), err: + raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err)) srt_lang_list = re.findall(r'lang_code="([\w\-]+)"', srt_list) - if srt_lang_list: - if self._downloader.params.get('subtitleslang', False): - srt_lang = self._downloader.params.get('subtitleslang') - elif 'en' in srt_lang_list: - srt_lang = 'en' - else: - srt_lang = srt_lang_list[0] - if not srt_lang in srt_lang_list: - self._downloader.trouble(u'WARNING: no closed captions found in the specified language') - else: - request = urllib2.Request('http://video.google.com/timedtext?hl=en&lang=%s&v=%s' % (srt_lang, video_id)) - try: - srt_xml = urllib2.urlopen(request).read() - except (urllib2.URLError, httplib.HTTPException, socket.error), err: - self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err)) - else: - video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8')) + if not srt_lang_list: + raise Trouble(u'WARNING: video has no closed captions') + if self._downloader.params.get('subtitleslang', False): + srt_lang = self._downloader.params.get('subtitleslang') + elif 'en' in srt_lang_list: + srt_lang = 'en' else: - self._downloader.trouble(u'WARNING: video has no closed captions') + srt_lang = srt_lang_list[0] + if not srt_lang in srt_lang_list: + raise Trouble(u'WARNING: no closed captions found in the specified language') + request = urllib2.Request('http://video.google.com/timedtext?hl=en&lang=%s&v=%s' % (srt_lang, video_id)) + try: + srt_xml = urllib2.urlopen(request).read() + except (urllib2.URLError, httplib.HTTPException, socket.error), err: + raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err)) + video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8')) + except Trouble as trouble: + self._downloader.trouble(trouble[0]) # token video_token = urllib.unquote_plus(video_info['token'][0]) @@ -458,31 +431,25 @@ class YoutubeIE(InfoExtractor): self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info') return + results = [] for format_param, video_real_url in video_url_list: - # At this point we have a new video - self._downloader.increment_downloads() - # Extension video_extension = self._video_extensions.get(format_param, 'flv') - try: - # Process video information - self._downloader.process_info({ - 'id': video_id.decode('utf-8'), - 'url': video_real_url.decode('utf-8'), - 'uploader': video_uploader.decode('utf-8'), - 'upload_date': upload_date, - 'title': video_title, - 'stitle': simple_title, - 'ext': video_extension.decode('utf-8'), - 'format': (format_param is None and u'NA' or format_param.decode('utf-8')), - 'thumbnail': video_thumbnail.decode('utf-8'), - 'description': video_description, - 'player_url': player_url, - 'subtitles': video_subtitles - }) - except UnavailableVideoError, err: - self._downloader.trouble(u'\nERROR: unable to download video') + results.append({ + 'id': video_id.decode('utf-8'), + 'url': video_real_url.decode('utf-8'), + 'uploader': video_uploader.decode('utf-8'), + 'upload_date': upload_date, + 'title': video_title, + 'ext': video_extension.decode('utf-8'), + 'format': (format_param is None and u'NA' or format_param.decode('utf-8')), + 'thumbnail': video_thumbnail.decode('utf-8'), + 'description': video_description, + 'player_url': player_url, + 'subtitles': video_subtitles + }) + return results class MetacafeIE(InfoExtractor): @@ -491,12 +458,10 @@ class MetacafeIE(InfoExtractor): _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*' _DISCLAIMER = 'http://www.metacafe.com/family_filter/' _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user' - _youtube_ie = None IE_NAME = u'metacafe' - def __init__(self, youtube_ie, downloader=None): + def __init__(self, downloader=None): InfoExtractor.__init__(self, downloader) - self._youtube_ie = youtube_ie def report_disclaimer(self): """Report disclaimer retrieval.""" @@ -549,14 +514,9 @@ class MetacafeIE(InfoExtractor): # Check if video comes from YouTube mobj2 = re.match(r'^yt-(.*)$', video_id) if mobj2 is not None: - self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1)) + self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)]) return - # At this point we have a new video - self._downloader.increment_downloads() - - simple_title = mobj.group(2).decode('utf-8') - # Retrieve video webpage to extract further information request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id) try: @@ -602,7 +562,6 @@ class MetacafeIE(InfoExtractor): self._downloader.trouble(u'ERROR: unable to extract title') return video_title = mobj.group(1).decode('utf-8') - video_title = sanitize_title(video_title) mobj = re.search(r'(?ms)By:\s*(.+?)<', webpage) if mobj is None: @@ -610,21 +569,16 @@ class MetacafeIE(InfoExtractor): return video_uploader = mobj.group(1) - try: - # Process video information - self._downloader.process_info({ - 'id': video_id.decode('utf-8'), - 'url': video_url.decode('utf-8'), - 'uploader': video_uploader.decode('utf-8'), - 'upload_date': u'NA', - 'title': video_title, - 'stitle': simple_title, - 'ext': video_extension.decode('utf-8'), - 'format': u'NA', - 'player_url': None, - }) - except UnavailableVideoError: - self._downloader.trouble(u'\nERROR: unable to download video') + return [{ + 'id': video_id.decode('utf-8'), + 'url': video_url.decode('utf-8'), + 'uploader': video_uploader.decode('utf-8'), + 'upload_date': u'NA', + 'title': video_title, + 'ext': video_extension.decode('utf-8'), + 'format': u'NA', + 'player_url': None, + }] class DailymotionIE(InfoExtractor): @@ -651,8 +605,6 @@ class DailymotionIE(InfoExtractor): self._downloader.trouble(u'ERROR: invalid URL: %s' % url) return - # At this point we have a new video - self._downloader.increment_downloads() video_id = mobj.group(1) video_extension = 'flv' @@ -689,8 +641,6 @@ class DailymotionIE(InfoExtractor): self._downloader.trouble(u'ERROR: unable to extract title') return video_title = unescapeHTML(mobj.group('title').decode('utf-8')) - video_title = sanitize_title(video_title) - simple_title = simplify_title(video_title) mobj = re.search(r'(?im)[^<]+?]+?>([^<]+?)', webpage) if mobj is None: @@ -698,21 +648,16 @@ class DailymotionIE(InfoExtractor): return video_uploader = mobj.group(1) - try: - # Process video information - self._downloader.process_info({ - 'id': video_id.decode('utf-8'), - 'url': video_url.decode('utf-8'), - 'uploader': video_uploader.decode('utf-8'), - 'upload_date': u'NA', - 'title': video_title, - 'stitle': simple_title, - 'ext': video_extension.decode('utf-8'), - 'format': u'NA', - 'player_url': None, - }) - except UnavailableVideoError: - self._downloader.trouble(u'\nERROR: unable to download video') + return [{ + 'id': video_id.decode('utf-8'), + 'url': video_url.decode('utf-8'), + 'uploader': video_uploader.decode('utf-8'), + 'upload_date': u'NA', + 'title': video_title, + 'ext': video_extension.decode('utf-8'), + 'format': u'NA', + 'player_url': None, + }] class GoogleIE(InfoExtractor): @@ -739,8 +684,6 @@ class GoogleIE(InfoExtractor): self._downloader.trouble(u'ERROR: Invalid URL: %s' % url) return - # At this point we have a new video - self._downloader.increment_downloads() video_id = mobj.group(1) video_extension = 'mp4' @@ -774,8 +717,6 @@ class GoogleIE(InfoExtractor): self._downloader.trouble(u'ERROR: unable to extract title') return video_title = mobj.group(1).decode('utf-8') - video_title = sanitize_title(video_title) - simple_title = simplify_title(video_title) # Extract video description mobj = re.search(r'([^<]*)', webpage) @@ -802,21 +743,16 @@ class GoogleIE(InfoExtractor): else: # we need something to pass to process_info video_thumbnail = '' - try: - # Process video information - self._downloader.process_info({ - 'id': video_id.decode('utf-8'), - 'url': video_url.decode('utf-8'), - 'uploader': u'NA', - 'upload_date': u'NA', - 'title': video_title, - 'stitle': simple_title, - 'ext': video_extension.decode('utf-8'), - 'format': u'NA', - 'player_url': None, - }) - except UnavailableVideoError: - self._downloader.trouble(u'\nERROR: unable to download video') + return [{ + 'id': video_id.decode('utf-8'), + 'url': video_url.decode('utf-8'), + 'uploader': u'NA', + 'upload_date': u'NA', + 'title': video_title, + 'ext': video_extension.decode('utf-8'), + 'format': u'NA', + 'player_url': None, + }] class PhotobucketIE(InfoExtractor): @@ -843,8 +779,6 @@ class PhotobucketIE(InfoExtractor): self._downloader.trouble(u'ERROR: Invalid URL: %s' % url) return - # At this point we have a new video - self._downloader.increment_downloads() video_id = mobj.group(1) video_extension = 'flv' @@ -873,26 +807,19 @@ class PhotobucketIE(InfoExtractor): self._downloader.trouble(u'ERROR: unable to extract title') return video_title = mobj.group(1).decode('utf-8') - video_title = sanitize_title(video_title) - simple_title = simplify_title(video_title) video_uploader = mobj.group(2).decode('utf-8') - try: - # Process video information - self._downloader.process_info({ - 'id': video_id.decode('utf-8'), - 'url': video_url.decode('utf-8'), - 'uploader': video_uploader, - 'upload_date': u'NA', - 'title': video_title, - 'stitle': simple_title, - 'ext': video_extension.decode('utf-8'), - 'format': u'NA', - 'player_url': None, - }) - except UnavailableVideoError: - self._downloader.trouble(u'\nERROR: unable to download video') + return [{ + 'id': video_id.decode('utf-8'), + 'url': video_url.decode('utf-8'), + 'uploader': video_uploader, + 'upload_date': u'NA', + 'title': video_title, + 'ext': video_extension.decode('utf-8'), + 'format': u'NA', + 'player_url': None, + }] class YahooIE(InfoExtractor): @@ -922,8 +849,6 @@ class YahooIE(InfoExtractor): self._downloader.trouble(u'ERROR: Invalid URL: %s' % url) return - # At this point we have a new video - self._downloader.increment_downloads() video_id = mobj.group(2) video_extension = 'flv' @@ -968,7 +893,6 @@ class YahooIE(InfoExtractor): self._downloader.trouble(u'ERROR: unable to extract video title') return video_title = mobj.group(1).decode('utf-8') - simple_title = simplify_title(video_title) mobj = re.search(r'

(.*)

', webpage) if mobj is None: @@ -1026,25 +950,20 @@ class YahooIE(InfoExtractor): self._downloader.trouble(u'ERROR: Unable to extract media URL') return video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8') - video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url) - - try: - # Process video information - self._downloader.process_info({ - 'id': video_id.decode('utf-8'), - 'url': video_url, - 'uploader': video_uploader, - 'upload_date': u'NA', - 'title': video_title, - 'stitle': simple_title, - 'ext': video_extension.decode('utf-8'), - 'thumbnail': video_thumbnail.decode('utf-8'), - 'description': video_description, - 'thumbnail': video_thumbnail, - 'player_url': None, - }) - except UnavailableVideoError: - self._downloader.trouble(u'\nERROR: unable to download video') + video_url = unescapeHTML(video_url) + + return [{ + 'id': video_id.decode('utf-8'), + 'url': video_url, + 'uploader': video_uploader, + 'upload_date': u'NA', + 'title': video_title, + 'ext': video_extension.decode('utf-8'), + 'thumbnail': video_thumbnail.decode('utf-8'), + 'description': video_description, + 'thumbnail': video_thumbnail, + 'player_url': None, + }] class VimeoIE(InfoExtractor): @@ -1072,8 +991,6 @@ class VimeoIE(InfoExtractor): self._downloader.trouble(u'ERROR: Invalid URL: %s' % url) return - # At this point we have a new video - self._downloader.increment_downloads() video_id = mobj.group(1) # Retrieve video webpage to extract further information @@ -1100,7 +1017,6 @@ class VimeoIE(InfoExtractor): # Extract title video_title = config["video"]["title"] - simple_title = simplify_title(video_title) # Extract uploader video_uploader = config["video"]["owner"]["name"] @@ -1109,18 +1025,9 @@ class VimeoIE(InfoExtractor): video_thumbnail = config["video"]["thumbnail"] # Extract video description - try: - lxml.etree - except NameError: - video_description = u'No description available.' - mobj = re.search(r'', webpage, re.MULTILINE) - if mobj is not None: - video_description = mobj.group(1) - else: - html_parser = lxml.etree.HTMLParser() - vwebpage_doc = lxml.etree.parse(StringIO.StringIO(webpage), html_parser) - video_description = u''.join(vwebpage_doc.xpath('id("description")//text()')).strip() - # TODO use another parser + video_description = get_element_by_id("description", webpage.decode('utf8')) + if video_description: video_description = clean_html(video_description) + else: video_description = '' # Extract upload date video_upload_date = u'NA' @@ -1149,22 +1056,17 @@ class VimeoIE(InfoExtractor): video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \ %(video_id, sig, timestamp, quality, video_codec.upper()) - try: - # Process video information - self._downloader.process_info({ - 'id': video_id, - 'url': video_url, - 'uploader': video_uploader, - 'upload_date': video_upload_date, - 'title': video_title, - 'stitle': simple_title, - 'ext': video_extension, - 'thumbnail': video_thumbnail, - 'description': video_description, - 'player_url': None, - }) - except UnavailableVideoError: - self._downloader.trouble(u'ERROR: unable to download video') + return [{ + 'id': video_id, + 'url': video_url, + 'uploader': video_uploader, + 'upload_date': video_upload_date, + 'title': video_title, + 'ext': video_extension, + 'thumbnail': video_thumbnail, + 'description': video_description, + 'player_url': None, + }] class GenericIE(InfoExtractor): @@ -1202,16 +1104,16 @@ class GenericIE(InfoExtractor): """ def redirect_request(self, req, fp, code, msg, headers, newurl): if code in (301, 302, 303, 307): - newurl = newurl.replace(' ', '%20') - newheaders = dict((k,v) for k,v in req.headers.items() - if k.lower() not in ("content-length", "content-type")) - return HeadRequest(newurl, - headers=newheaders, - origin_req_host=req.get_origin_req_host(), - unverifiable=True) + newurl = newurl.replace(' ', '%20') + newheaders = dict((k,v) for k,v in req.headers.items() + if k.lower() not in ("content-length", "content-type")) + return HeadRequest(newurl, + headers=newheaders, + origin_req_host=req.get_origin_req_host(), + unverifiable=True) else: - raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp) - + raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp) + class HTTPMethodFallback(urllib2.BaseHandler): """ Fallback to GET if HEAD is not allowed (405 HTTP error) @@ -1221,17 +1123,17 @@ class GenericIE(InfoExtractor): fp.close() newheaders = dict((k,v) for k,v in req.headers.items() - if k.lower() not in ("content-length", "content-type")) + if k.lower() not in ("content-length", "content-type")) return self.parent.open(urllib2.Request(req.get_full_url(), - headers=newheaders, - origin_req_host=req.get_origin_req_host(), - unverifiable=True)) + headers=newheaders, + origin_req_host=req.get_origin_req_host(), + unverifiable=True)) # Build our opener opener = urllib2.OpenerDirector() for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler, - HTTPMethodFallback, HEADRedirectHandler, - urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]: + HTTPMethodFallback, HEADRedirectHandler, + urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]: opener.add_handler(handler()) response = opener.open(HeadRequest(url)) @@ -1245,9 +1147,6 @@ class GenericIE(InfoExtractor): def _real_extract(self, url): if self._test_redirect(url): return - - # At this point we have a new video - self._downloader.increment_downloads() video_id = url.split('/')[-1] request = urllib2.Request(url) @@ -1297,8 +1196,6 @@ class GenericIE(InfoExtractor): self._downloader.trouble(u'ERROR: unable to extract title') return video_title = mobj.group(1).decode('utf-8') - video_title = sanitize_title(video_title) - simple_title = simplify_title(video_title) # video uploader is domain name mobj = re.match(r'(?:https?://)?([^/]*)/.*', url) @@ -1307,43 +1204,33 @@ class GenericIE(InfoExtractor): return video_uploader = mobj.group(1).decode('utf-8') - try: - # Process video information - self._downloader.process_info({ - 'id': video_id.decode('utf-8'), - 'url': video_url.decode('utf-8'), - 'uploader': video_uploader, - 'upload_date': u'NA', - 'title': video_title, - 'stitle': simple_title, - 'ext': video_extension.decode('utf-8'), - 'format': u'NA', - 'player_url': None, - }) - except UnavailableVideoError, err: - self._downloader.trouble(u'\nERROR: unable to download video') + return [{ + 'id': video_id.decode('utf-8'), + 'url': video_url.decode('utf-8'), + 'uploader': video_uploader, + 'upload_date': u'NA', + 'title': video_title, + 'ext': video_extension.decode('utf-8'), + 'format': u'NA', + 'player_url': None, + }] class YoutubeSearchIE(InfoExtractor): """Information Extractor for YouTube search queries.""" _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+' _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc' - _youtube_ie = None _max_youtube_results = 1000 IE_NAME = u'youtube:search' - def __init__(self, youtube_ie, downloader=None): + def __init__(self, downloader=None): InfoExtractor.__init__(self, downloader) - self._youtube_ie = youtube_ie def report_download_page(self, query, pagenum): """Report attempt to download playlist page with given number.""" query = query.decode(preferredencoding()) self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum)) - def _real_initialize(self): - self._youtube_ie.initialize() - def _real_extract(self, query): mobj = re.match(self._VALID_URL, query) if mobj is None: @@ -1401,7 +1288,7 @@ class YoutubeSearchIE(InfoExtractor): if len(video_ids) > n: video_ids = video_ids[:n] for id in video_ids: - self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id) + self._downloader.download(['http://www.youtube.com/watch?v=%s' % id]) return @@ -1411,22 +1298,17 @@ class GoogleSearchIE(InfoExtractor): _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en' _VIDEO_INDICATOR = r'\s*Next\s*' - _youtube_ie = None IE_NAME = u'youtube:playlist' - def __init__(self, youtube_ie, downloader=None): + def __init__(self, downloader=None): InfoExtractor.__init__(self, downloader) - self._youtube_ie = youtube_ie def report_download_page(self, playlist_id, pagenum): """Report attempt to download playlist page with given number.""" self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum)) - def _real_initialize(self): - self._youtube_ie.initialize() - def _real_extract(self, url): # Extract playlist id mobj = re.match(self._VALID_URL, url) @@ -1611,7 +1483,7 @@ class YoutubePlaylistIE(InfoExtractor): # Single video case if mobj.group(3) is not None: - self._youtube_ie.extract(mobj.group(3)) + self._downloader.download([mobj.group(3)]) return # Download playlist pages @@ -1655,7 +1527,7 @@ class YoutubePlaylistIE(InfoExtractor): video_ids = video_ids[playliststart:playlistend] for id in video_ids: - self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id) + self._downloader.download(['http://www.youtube.com/watch?v=%s' % id]) return @@ -1667,21 +1539,16 @@ class YoutubeUserIE(InfoExtractor): _GDATA_PAGE_SIZE = 50 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d' _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]' - _youtube_ie = None IE_NAME = u'youtube:user' - def __init__(self, youtube_ie, downloader=None): + def __init__(self, downloader=None): InfoExtractor.__init__(self, downloader) - self._youtube_ie = youtube_ie def report_download_page(self, username, start_index): """Report attempt to download user page.""" self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' % (username, start_index, start_index + self._GDATA_PAGE_SIZE)) - def _real_initialize(self): - self._youtube_ie.initialize() - def _real_extract(self, url): # Extract username mobj = re.match(self._VALID_URL, url) @@ -1744,7 +1611,7 @@ class YoutubeUserIE(InfoExtractor): (username, all_ids_count, len(video_ids))) for video_id in video_ids: - self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id) + self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id]) class DepositFilesIE(InfoExtractor): @@ -1765,9 +1632,6 @@ class DepositFilesIE(InfoExtractor): self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id) def _real_extract(self, url): - # At this point we have a new file - self._downloader.increment_downloads() - file_id = url.split('/')[-1] # Rebuild url in english locale url = 'http://depositfiles.com/en/files/' + file_id @@ -1804,21 +1668,16 @@ class DepositFilesIE(InfoExtractor): return file_title = mobj.group(1).decode('utf-8') - try: - # Process file information - self._downloader.process_info({ - 'id': file_id.decode('utf-8'), - 'url': file_url.decode('utf-8'), - 'uploader': u'NA', - 'upload_date': u'NA', - 'title': file_title, - 'stitle': file_title, - 'ext': file_extension.decode('utf-8'), - 'format': u'NA', - 'player_url': None, - }) - except UnavailableVideoError, err: - self._downloader.trouble(u'ERROR: unable to download file') + return [{ + 'id': file_id.decode('utf-8'), + 'url': file_url.decode('utf-8'), + 'uploader': u'NA', + 'upload_date': u'NA', + 'title': file_title, + 'ext': file_extension.decode('utf-8'), + 'format': u'NA', + 'player_url': None, + }] class FacebookIE(InfoExtractor): @@ -1959,9 +1818,6 @@ class FacebookIE(InfoExtractor): return video_title = video_info['title'] video_title = video_title.decode('utf-8') - video_title = sanitize_title(video_title) - - simple_title = simplify_title(video_title) # thumbnail image if 'thumbnail' not in video_info: @@ -2011,31 +1867,24 @@ class FacebookIE(InfoExtractor): return video_url_list = [(req_format, url_map[req_format])] # Specific format + results = [] for format_param, video_real_url in video_url_list: - - # At this point we have a new video - self._downloader.increment_downloads() - # Extension video_extension = self._video_extensions.get(format_param, 'mp4') - try: - # Process video information - self._downloader.process_info({ - 'id': video_id.decode('utf-8'), - 'url': video_real_url.decode('utf-8'), - 'uploader': video_uploader.decode('utf-8'), - 'upload_date': upload_date, - 'title': video_title, - 'stitle': simple_title, - 'ext': video_extension.decode('utf-8'), - 'format': (format_param is None and u'NA' or format_param.decode('utf-8')), - 'thumbnail': video_thumbnail.decode('utf-8'), - 'description': video_description.decode('utf-8'), - 'player_url': None, - }) - except UnavailableVideoError, err: - self._downloader.trouble(u'\nERROR: unable to download video') + results.append({ + 'id': video_id.decode('utf-8'), + 'url': video_real_url.decode('utf-8'), + 'uploader': video_uploader.decode('utf-8'), + 'upload_date': upload_date, + 'title': video_title, + 'ext': video_extension.decode('utf-8'), + 'format': (format_param is None and u'NA' or format_param.decode('utf-8')), + 'thumbnail': video_thumbnail.decode('utf-8'), + 'description': video_description.decode('utf-8'), + 'player_url': None, + }) + return results class BlipTVIE(InfoExtractor): """Information extractor for blip.tv""" @@ -2078,7 +1927,6 @@ class BlipTVIE(InfoExtractor): 'id': title, 'url': url, 'title': title, - 'stitle': simplify_title(title), 'ext': ext, 'urlhandle': urlh } @@ -2098,21 +1946,20 @@ class BlipTVIE(InfoExtractor): data = json_data['Post'] else: data = json_data - + upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d') video_url = data['media']['url'] umobj = re.match(self._URL_EXT, video_url) if umobj is None: raise ValueError('Can not determine filename extension') ext = umobj.group(1) - + info = { 'id': data['item_id'], 'url': video_url, 'uploader': data['display_name'], 'upload_date': upload_date, 'title': data['title'], - 'stitle': simplify_title(data['title']), 'ext': ext, 'format': data['media']['mimeType'], 'thumbnail': data['thumbnailUrl'], @@ -2123,12 +1970,7 @@ class BlipTVIE(InfoExtractor): self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err)) return - self._downloader.increment_downloads() - - try: - self._downloader.process_info(info) - except UnavailableVideoError, err: - self._downloader.trouble(u'\nERROR: unable to download video') + return [info] class MyVideoIE(InfoExtractor): @@ -2179,24 +2021,17 @@ class MyVideoIE(InfoExtractor): return video_title = mobj.group(1) - video_title = sanitize_title(video_title) - simple_title = simplify_title(video_title) - - try: - self._downloader.process_info({ - 'id': video_id, - 'url': video_url, - 'uploader': u'NA', - 'upload_date': u'NA', - 'title': video_title, - 'stitle': simple_title, - 'ext': u'flv', - 'format': u'NA', - 'player_url': None, - }) - except UnavailableVideoError: - self._downloader.trouble(u'\nERROR: Unable to download video') + return [{ + 'id': video_id, + 'url': video_url, + 'uploader': u'NA', + 'upload_date': u'NA', + 'title': video_title, + 'ext': u'flv', + 'format': u'NA', + 'player_url': None, + }] class ComedyCentralIE(InfoExtractor): """Information extractor for The Daily Show and Colbert Report """ @@ -2206,7 +2041,7 @@ class ComedyCentralIE(InfoExtractor): def report_extraction(self, episode_id): self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id) - + def report_config_download(self, episode_id): self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id) @@ -2278,6 +2113,8 @@ class ComedyCentralIE(InfoExtractor): self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err)) return + results = [] + idoc = xml.etree.ElementTree.fromstring(indexXml) itemEls = idoc.findall('.//item') for itemEl in itemEls: @@ -2310,8 +2147,6 @@ class ComedyCentralIE(InfoExtractor): # For now, just pick the highest bitrate format,video_url = turls[-1] - self._downloader.increment_downloads() - effTitle = showId + u'-' + epTitle info = { 'id': shortMediaId, @@ -2319,7 +2154,6 @@ class ComedyCentralIE(InfoExtractor): 'uploader': showId, 'upload_date': officialDate, 'title': effTitle, - 'stitle': simplify_title(effTitle), 'ext': 'mp4', 'format': format, 'thumbnail': None, @@ -2327,11 +2161,9 @@ class ComedyCentralIE(InfoExtractor): 'player_url': playerUrl } - try: - self._downloader.process_info(info) - except UnavailableVideoError, err: - self._downloader.trouble(u'\nERROR: unable to download ' + mediaId) - continue + results.append(info) + + return results class EscapistIE(InfoExtractor): @@ -2347,8 +2179,6 @@ class EscapistIE(InfoExtractor): self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName) def _real_extract(self, url): - htmlParser = HTMLParser.HTMLParser() - mobj = re.match(self._VALID_URL, url) if mobj is None: self._downloader.trouble(u'ERROR: invalid URL: %s' % url) @@ -2358,17 +2188,18 @@ class EscapistIE(InfoExtractor): self.report_extraction(showName) try: - webPage = urllib2.urlopen(url).read() + webPageBytes = urllib2.urlopen(url).read() except (urllib2.URLError, httplib.HTTPException, socket.error), err: self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err)) return + webPage = webPageBytes.decode('utf-8') descMatch = re.search('([^<]+)', coursepage) if m: @@ -2946,13 +2728,13 @@ class StanfordOpenClassroomIE(InfoExtractor): 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage), } for vpage in links] - + results = [] for entry in info['list']: assert entry['type'] == 'reference' - self.extract(entry['url']) + results += self.extract(entry['url']) + return results + else: # Root page - unescapeHTML = HTMLParser.HTMLParser().unescape - info = { 'id': 'Stanford OpenClassroom', 'type': 'playlist', @@ -2967,7 +2749,6 @@ class StanfordOpenClassroomIE(InfoExtractor): return info['title'] = info['id'] - info['stitle'] = simplify_title(info['title']) links = orderedSet(re.findall('', rootpage)) info['list'] = [ @@ -2977,9 +2758,11 @@ class StanfordOpenClassroomIE(InfoExtractor): } for cpage in links] + results = [] for entry in info['list']: assert entry['type'] == 'reference' - self.extract(entry['url']) + results += self.extract(entry['url']) + return results class MTVIE(InfoExtractor): """Information extractor for MTV.com""" @@ -3059,18 +2842,13 @@ class MTVIE(InfoExtractor): self._downloader.trouble('Invalid rendition field.') return - self._downloader.increment_downloads() info = { 'id': video_id, 'url': video_url, 'uploader': performer, 'title': video_title, - 'stitle': simplify_title(video_title), 'ext': ext, 'format': format, } - try: - self._downloader.process_info(info) - except UnavailableVideoError, err: - self._downloader.trouble(u'\nERROR: unable to download ' + video_id) + return [info]