X-Git-Url: http://git.cielonegro.org/gitweb.cgi?a=blobdiff_plain;f=youtube_dl%2FInfoExtractors.py;h=cea30dad81fa4224a848732159aa19684c7d5dbc;hb=33d94a6c999ae784be7529aaaea42adadeab0c27;hp=82459e7a80868467eb15d3a2f840d297666495c0;hpb=2ebc6e6a92ae7dd7cdc281f3cf402a031ba6aa03;p=youtube-dl.git diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index 82459e7a8..cea30dad8 100644 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -13,6 +13,8 @@ import urllib import urllib2 import email.utils import xml.etree.ElementTree +import random +import math from urlparse import parse_qs try: @@ -95,7 +97,26 @@ class InfoExtractor(object): class YoutubeIE(InfoExtractor): """Information extractor for youtube.com.""" - _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$' + _VALID_URL = r"""^ + ( + (?:https?://)? # http(s):// (optional) + (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/| + tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains + (?:.*?\#/)? # handle anchor (#/) redirect urls + (?!view_play_list|my_playlists|artist|playlist) # ignore playlist URLs + (?: # the various things that can precede the ID: + (?:(?:v|embed|e)/) # v/ or embed/ or e/ + |(?: # or the v= param in all its forms + (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx) + (?:\?|\#!?) # the params delimiter ? or # or #! + (?:.+&)? # any other preceding param (like /?s=tuff&v=xxxx) + v= + ) + )? # optional -> youtube.com/xxxx is OK + )? # all until now is optional -> you can pass the naked ID + ([0-9A-Za-z_-]+) # here is it! the YouTube video ID + (?(1).+)? # if we found the ID, everything can follow + $""" _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1' _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en' _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en' @@ -134,6 +155,10 @@ class YoutubeIE(InfoExtractor): } IE_NAME = u'youtube' + def suitable(self, url): + """Receives a URL and returns True if suitable for this IE.""" + return re.match(self._VALID_URL, url, re.VERBOSE) is not None + def report_lang(self): """Report attempt to set language.""" self._downloader.to_screen(u'[youtube] Setting language') @@ -188,9 +213,9 @@ class YoutubeIE(InfoExtractor): return srt def _print_formats(self, formats): - print 'Available formats:' + print('Available formats:') for x in formats: - print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')) + print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))) def _real_initialize(self): if self._downloader is None: @@ -213,7 +238,7 @@ class YoutubeIE(InfoExtractor): else: raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE) except (IOError, netrc.NetrcParseError), err: - self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err)) + self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err)) return # Set language @@ -222,7 +247,7 @@ class YoutubeIE(InfoExtractor): self.report_lang() urllib2.urlopen(request).read() except (urllib2.URLError, httplib.HTTPException, socket.error), err: - self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err)) + self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err)) return # No authentication to be performed @@ -245,7 +270,7 @@ class YoutubeIE(InfoExtractor): self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password') return except (urllib2.URLError, httplib.HTTPException, socket.error), err: - self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err)) + self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err)) return # Confirm age @@ -258,7 +283,7 @@ class YoutubeIE(InfoExtractor): self.report_age_confirmation() age_results = urllib2.urlopen(request).read() except (urllib2.URLError, httplib.HTTPException, socket.error), err: - self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err)) + self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err)) return def _real_extract(self, url): @@ -268,7 +293,7 @@ class YoutubeIE(InfoExtractor): url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/') # Extract video id from URL - mobj = re.match(self._VALID_URL, url) + mobj = re.match(self._VALID_URL, url, re.VERBOSE) if mobj is None: self._downloader.trouble(u'ERROR: invalid URL: %s' % url) return @@ -280,7 +305,7 @@ class YoutubeIE(InfoExtractor): try: video_webpage = urllib2.urlopen(request).read() except (urllib2.URLError, httplib.HTTPException, socket.error), err: - self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err)) + self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err)) return # Attempt to extract SWF player URL @@ -302,7 +327,7 @@ class YoutubeIE(InfoExtractor): if 'token' in video_info: break except (urllib2.URLError, httplib.HTTPException, socket.error), err: - self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err)) + self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err)) return if 'token' not in video_info: if 'reason' in video_info: @@ -365,7 +390,7 @@ class YoutubeIE(InfoExtractor): try: srt_list = urllib2.urlopen(request).read() except (urllib2.URLError, httplib.HTTPException, socket.error), err: - raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err)) + raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err)) srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list) srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list) if not srt_lang_list: @@ -382,13 +407,19 @@ class YoutubeIE(InfoExtractor): try: srt_xml = urllib2.urlopen(request).read() except (urllib2.URLError, httplib.HTTPException, socket.error), err: - raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err)) + raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err)) if not srt_xml: raise Trouble(u'WARNING: unable to download video subtitles') video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8')) except Trouble as trouble: self._downloader.trouble(trouble[0]) + if 'length_seconds' not in video_info: + self._downloader.trouble(u'WARNING: unable to extract video duration') + video_duration = '' + else: + video_duration = urllib.unquote_plus(video_info['length_seconds'][0]) + # token video_token = urllib.unquote_plus(video_info['token'][0]) @@ -402,7 +433,7 @@ class YoutubeIE(InfoExtractor): url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',') url_data = [parse_qs(uds) for uds in url_data_strs] url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data) - url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data) + url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data) format_limit = self._downloader.params.get('format_limit', None) available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats @@ -455,7 +486,8 @@ class YoutubeIE(InfoExtractor): 'thumbnail': video_thumbnail.decode('utf-8'), 'description': video_description, 'player_url': player_url, - 'subtitles': video_subtitles + 'subtitles': video_subtitles, + 'duration': video_duration }) return results @@ -494,7 +526,7 @@ class MetacafeIE(InfoExtractor): self.report_disclaimer() disclaimer = urllib2.urlopen(request).read() except (urllib2.URLError, httplib.HTTPException, socket.error), err: - self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err)) + self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err)) return # Confirm age @@ -507,7 +539,7 @@ class MetacafeIE(InfoExtractor): self.report_age_confirmation() disclaimer = urllib2.urlopen(request).read() except (urllib2.URLError, httplib.HTTPException, socket.error), err: - self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err)) + self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err)) return def _real_extract(self, url): @@ -531,7 +563,7 @@ class MetacafeIE(InfoExtractor): self.report_download_webpage(video_id) webpage = urllib2.urlopen(request).read() except (urllib2.URLError, httplib.HTTPException, socket.error), err: - self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err)) + self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err)) return # Extract URL, uploader and title from webpage @@ -571,7 +603,7 @@ class MetacafeIE(InfoExtractor): return video_title = mobj.group(1).decode('utf-8') - mobj = re.search(r'(?ms)By:\s*(.+?)<', webpage) + mobj = re.search(r'submitter=(.*?);', webpage) if mobj is None: self._downloader.trouble(u'ERROR: unable to extract uploader nickname') return @@ -592,7 +624,7 @@ class MetacafeIE(InfoExtractor): class DailymotionIE(InfoExtractor): """Information Extractor for Dailymotion""" - _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)' + _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)' IE_NAME = u'dailymotion' def __init__(self, downloader=None): @@ -613,9 +645,9 @@ class DailymotionIE(InfoExtractor): self._downloader.trouble(u'ERROR: invalid URL: %s' % url) return - video_id = mobj.group(1) + video_id = mobj.group(1).split('_')[0].split('?')[0] - video_extension = 'flv' + video_extension = 'mp4' # Retrieve video webpage to extract further information request = urllib2.Request(url) @@ -624,25 +656,34 @@ class DailymotionIE(InfoExtractor): self.report_download_webpage(video_id) webpage = urllib2.urlopen(request).read() except (urllib2.URLError, httplib.HTTPException, socket.error), err: - self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err)) + self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err)) return # Extract URL, uploader and title from webpage self.report_extraction(video_id) - mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage) + mobj = re.search(r'\s*var flashvars = (.*)', webpage) if mobj is None: self._downloader.trouble(u'ERROR: unable to extract media URL') return - sequence = urllib.unquote(mobj.group(1)) - mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence) + flashvars = urllib.unquote(mobj.group(1)) + + for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']: + if key in flashvars: + max_quality = key + self._downloader.to_screen(u'[dailymotion] Using %s' % key) + break + else: + self._downloader.trouble(u'ERROR: unable to extract video URL') + return + + mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars) if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract media URL') + self._downloader.trouble(u'ERROR: unable to extract video URL') return - mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '') - # if needed add http://www.dailymotion.com/ if relative URL + video_url = urllib.unquote(mobj.group(1)).replace('\\/', '/') - video_url = mediaURL + # TODO: support choosing qualities mobj = re.search(r'', webpage) if mobj is None: @@ -650,17 +691,28 @@ class DailymotionIE(InfoExtractor): return video_title = unescapeHTML(mobj.group('title').decode('utf-8')) - mobj = re.search(r'(?im)[^<]+?]+?>([^<]+?)', webpage) + video_uploader = u'NA' + mobj = re.search(r'(?im)[^<]+?]+?>([^<]+?)', webpage) if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract uploader nickname') - return - video_uploader = mobj.group(1) + # lookin for official user + mobj_official = re.search(r'', webpage) + if mobj_official is None: + self._downloader.trouble(u'WARNING: unable to extract uploader nickname') + else: + video_uploader = mobj_official.group(1) + else: + video_uploader = mobj.group(1) + + video_upload_date = u'NA' + mobj = re.search(r'
([0-9]{2})-([0-9]{2})-([0-9]{4})
', webpage) + if mobj is not None: + video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1) return [{ 'id': video_id.decode('utf-8'), 'url': video_url.decode('utf-8'), 'uploader': video_uploader.decode('utf-8'), - 'upload_date': u'NA', + 'upload_date': video_upload_date, 'title': video_title, 'ext': video_extension.decode('utf-8'), 'format': u'NA', @@ -702,7 +754,7 @@ class GoogleIE(InfoExtractor): self.report_download_webpage(video_id) webpage = urllib2.urlopen(request).read() except (urllib2.URLError, httplib.HTTPException, socket.error), err: - self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err)) + self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err)) return # Extract URL, uploader, and title from webpage @@ -741,7 +793,7 @@ class GoogleIE(InfoExtractor): try: webpage = urllib2.urlopen(request).read() except (urllib2.URLError, httplib.HTTPException, socket.error), err: - self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err)) + self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err)) return mobj = re.search(r'', webpage) if mobj is None: @@ -797,7 +849,7 @@ class PhotobucketIE(InfoExtractor): self.report_download_webpage(video_id) webpage = urllib2.urlopen(request).read() except (urllib2.URLError, httplib.HTTPException, socket.error), err: - self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err)) + self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err)) return # Extract URL, uploader, and title from webpage @@ -867,7 +919,7 @@ class YahooIE(InfoExtractor): try: webpage = urllib2.urlopen(request).read() except (urllib2.URLError, httplib.HTTPException, socket.error), err: - self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err)) + self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err)) return mobj = re.search(r'\("id", "([0-9]+)"\);', webpage) @@ -891,7 +943,7 @@ class YahooIE(InfoExtractor): self.report_download_webpage(video_id) webpage = urllib2.urlopen(request).read() except (urllib2.URLError, httplib.HTTPException, socket.error), err: - self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err)) + self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err)) return # Extract uploader and title from webpage @@ -949,7 +1001,7 @@ class YahooIE(InfoExtractor): self.report_download_webpage(video_id) webpage = urllib2.urlopen(request).read() except (urllib2.URLError, httplib.HTTPException, socket.error), err: - self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err)) + self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err)) return # Extract media URL from playlist XML @@ -978,7 +1030,7 @@ class VimeoIE(InfoExtractor): """Information extractor for vimeo.com.""" # _VALID_URL matches Vimeo URLs - _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)' + _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)' IE_NAME = u'vimeo' def __init__(self, downloader=None): @@ -1007,7 +1059,7 @@ class VimeoIE(InfoExtractor): self.report_download_webpage(video_id) webpage = urllib2.urlopen(request).read() except (urllib2.URLError, httplib.HTTPException, socket.error), err: - self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err)) + self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err)) return # Now we begin extracting as much information as we can from what we @@ -1048,21 +1100,32 @@ class VimeoIE(InfoExtractor): timestamp = config['request']['timestamp'] # Vimeo specific: extract video codec and quality information + # First consider quality, then codecs, then take everything # TODO bind to format param codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')] - for codec in codecs: - if codec[0] in config["video"]["files"]: - video_codec = codec[0] - video_extension = codec[1] - if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd' - else: quality = 'sd' + files = { 'hd': [], 'sd': [], 'other': []} + for codec_name, codec_extension in codecs: + if codec_name in config["video"]["files"]: + if 'hd' in config["video"]["files"][codec_name]: + files['hd'].append((codec_name, codec_extension, 'hd')) + elif 'sd' in config["video"]["files"][codec_name]: + files['sd'].append((codec_name, codec_extension, 'sd')) + else: + files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0])) + + for quality in ('hd', 'sd', 'other'): + if len(files[quality]) > 0: + video_quality = files[quality][0][2] + video_codec = files[quality][0][0] + video_extension = files[quality][0][1] + self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality)) break else: self._downloader.trouble(u'ERROR: no known codec found') return video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \ - %(video_id, sig, timestamp, quality, video_codec.upper()) + %(video_id, sig, timestamp, video_quality, video_codec.upper()) return [{ 'id': video_id, @@ -1078,158 +1141,140 @@ class VimeoIE(InfoExtractor): class ArteTvIE(InfoExtractor): - """arte.tv information extractor.""" - - _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*' - _LIVE_URL = r'index-[0-9]+\.html$' - - IE_NAME = u'arte.tv' - - def __init__(self, downloader=None): - InfoExtractor.__init__(self, downloader) - - def report_download_webpage(self, video_id): - """Report webpage download.""" - self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id) - - def report_extraction(self, video_id): - """Report information extraction.""" - self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id) - - def fetch_webpage(self, url): - self._downloader.increment_downloads() - request = urllib2.Request(url) - try: - self.report_download_webpage(url) - webpage = urllib2.urlopen(request).read() - except (urllib2.URLError, httplib.HTTPException, socket.error), err: - self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err)) - return - except ValueError, err: - self._downloader.trouble(u'ERROR: Invalid URL: %s' % url) - return - return webpage - - def grep_webpage(self, url, regex, regexFlags, matchTuples): - page = self.fetch_webpage(url) - mobj = re.search(regex, page, regexFlags) - info = {} - - if mobj is None: - self._downloader.trouble(u'ERROR: Invalid URL: %s' % url) - return - - for (i, key, err) in matchTuples: - if mobj.group(i) is None: - self._downloader.trouble(err) - return - else: - info[key] = mobj.group(i) - - return info - - def extractLiveStream(self, url): - - video_lang = url.split('/')[-4] - - info = self.grep_webpage( - url, - r'src="(.*?/videothek_js.*?\.js)', - 0, - [ - (1, 'url', u'ERROR: Invalid URL: %s' % url) - ] - ) - - http_host = url.split('/')[2] - next_url = 'http://%s%s' % (http_host, urllib.unquote(info.get('url'))) - - info = self.grep_webpage( - next_url, - r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' + - '(http://.*?\.swf).*?' + - '(rtmp://.*?)\'', - re.DOTALL, - [ - (1, 'path', u'ERROR: could not extract video path: %s' % url), - (2, 'player', u'ERROR: could not extract video player: %s' % url), - (3, 'url', u'ERROR: could not extract video url: %s' % url) - ] - ) - - video_url = u'%s/%s' % (info.get('url'), info.get('path')) - - print u'rtmpdump --swfVfy \'%s\' --rtmp \'%s\' --live -o arte-live.mp4' % (info.get('player'), video_url) - - def extractPlus7Stream(self, url): - - video_lang = url.split('/')[-3] - - info = self.grep_webpage( - url, - r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)', - 0, - [ - (1, 'url', u'ERROR: Invalid URL: %s' % url) - ] - ) - - next_url = urllib.unquote(info.get('url')) - - info = self.grep_webpage( - next_url, - r'