X-Git-Url: http://git.cielonegro.org/gitweb.cgi?a=blobdiff_plain;f=youtube_dl%2FInfoExtractors.py;h=4aec8c6879e79ccc13cf5e9bcfbf9abc17c4d1d1;hb=01ba4b80a732308e8da66ba89bac9273181db1ad;hp=3c95012b19ea45997b988341b466c09aff1a2776;hpb=af44c9486255f16ab180a9e45aaab06a6b38bdde;p=youtube-dl.git diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index 3c95012b1..4aec8c687 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -420,7 +420,7 @@ class YoutubeIE(InfoExtractor): def _request_automatic_caption(self, video_id, webpage): """We need the webpage for getting the captions url, pass it as an argument to speed up the process.""" - sub_lang = self._downloader.params.get('subtitleslang') + sub_lang = self._downloader.params.get('subtitleslang') or 'en' sub_format = self._downloader.params.get('subtitlesformat') self.to_screen(u'%s: Looking for automatic captions' % video_id) mobj = re.search(r';ytplayer.config = ({.*?});', webpage) @@ -699,14 +699,14 @@ class YoutubeIE(InfoExtractor): pass else: # We report the original error - self._downloader.report_error(sub_error) + self._downloader.report_warning(sub_error) if self._downloader.params.get('allsubtitles', False): video_subtitles = self._extract_all_subtitles(video_id) for video_subtitle in video_subtitles: (sub_error, sub_lang, sub) = video_subtitle if sub_error: - self._downloader.report_error(sub_error) + self._downloader.report_warning(sub_error) if self._downloader.params.get('listsubtitles', False): sub_lang_list = self._list_available_subtitles(video_id) @@ -732,8 +732,11 @@ class YoutubeIE(InfoExtractor): for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','): url_data = compat_parse_qs(url_data_str) if 'itag' in url_data and 'url' in url_data: - url = url_data['url'][0] + '&signature=' + url_data['sig'][0] - if not 'ratebypass' in url: url += '&ratebypass=yes' + url = url_data['url'][0] + if 'sig' in url_data: + url += '&signature=' + url_data['sig'][0] + if 'ratebypass' not in url: + url += '&ratebypass=yes' url_map[url_data['itag'][0]] = url format_limit = self._downloader.params.get('format_limit', None) @@ -940,16 +943,10 @@ class DailymotionIE(InfoExtractor): video_title = unescapeHTML(mobj.group('title')) video_uploader = None - mobj = re.search(r'(?im)[^<]+?]+?>([^<]+?)', webpage) - if mobj is None: - # lookin for official user - mobj_official = re.search(r'', webpage) - if mobj_official is None: - self._downloader.report_warning(u'unable to extract uploader nickname') - else: - video_uploader = mobj_official.group(1) - else: - video_uploader = mobj.group(1) + video_uploader = self._search_regex([r'(?im)[^<]+?]+?>([^<]+?)', + # Looking for official user + r'<(?:span|a) .*?rel="author".*?>([^<]+?)([0-9]{2})-([0-9]{2})-([0-9]{4})', webpage) @@ -1409,6 +1406,16 @@ class GenericIE(InfoExtractor): if mobj is None: # Broaden the search a little bit: JWPlayer JS loader mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage) + if mobj is None: + # Try to find twitter cards info + mobj = re.search(r'\d+)/? (?P\d*)(?P\??) #For urltype == video we sometimes get the videoID """ + _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/' + _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' @classmethod def suitable(cls, url): @@ -3372,11 +3381,19 @@ class SteamIE(InfoExtractor): def _real_extract(self, url): m = re.match(self._VALID_URL, url, re.VERBOSE) gameID = m.group('gameID') - videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % gameID - self.report_age_confirmation() + + videourl = self._VIDEO_PAGE_TEMPLATE % gameID webpage = self._download_webpage(videourl, gameID) - game_title = re.search(r'', webpage).group('game_title') - + + if re.search('

Please enter your birth date to continue:

', webpage) is not None: + videourl = self._AGECHECK_TEMPLATE % gameID + self.report_age_confirmation() + webpage = self._download_webpage(videourl, gameID) + + self.report_extraction(gameID) + game_title = self._html_search_regex(r'', + webpage, 'game title') + urlRE = r"'movie_(?P\d+)': \{\s*FILENAME: \"(?P[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P[\w:/\.\?=\+-]+)\")?\s*\}," mweb = re.finditer(urlRE, webpage) namesRE = r'(?P.+?)' @@ -3482,8 +3499,8 @@ class RBMARadioIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - json_data = self._search_regex(r'', - webpage, u'json data') + json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$', + webpage, u'json data', flags=re.MULTILINE) try: data = json.loads(json_data) @@ -3786,10 +3803,6 @@ class TEDIE(InfoExtractor): self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name)) return [self._playlist_videos_info(url,name,playlist_id)] - def _talk_video_link(self,mediaSlug): - '''Returns the video link for that mediaSlug''' - return 'http://download.ted.com/talks/%s.mp4' % mediaSlug - def _playlist_videos_info(self,url,name,playlist_id=0): '''Returns the videos of the playlist''' video_RE=r''' @@ -3802,9 +3815,8 @@ class TEDIE(InfoExtractor): m_videos=re.finditer(video_RE,webpage,re.VERBOSE) m_names=re.finditer(video_name_RE,webpage) - playlist_RE = r'div class="headline">(\s*?)

(\s*?)(?P.*?)' - m_playlist = re.search(playlist_RE, webpage) - playlist_title = m_playlist.group('playlist_title') + playlist_title = self._html_search_regex(r'div class="headline">\s*?

\s*?(.*?)', + webpage, 'playlist title') playlist_entries = [] for m_video, m_name in zip(m_videos,m_names): @@ -3815,27 +3827,28 @@ class TEDIE(InfoExtractor): def _talk_info(self, url, video_id=0): """Return the video for the talk in the url""" - m=re.match(self._VALID_URL, url,re.VERBOSE) - videoName=m.group('name') - webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName) + m = re.match(self._VALID_URL, url,re.VERBOSE) + video_name = m.group('name') + webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name) + self.report_extraction(video_name) # If the url includes the language we get the title translated - title_RE=r'(?P.*)</span>' - title=re.search(title_RE, webpage).group('title') - info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?) - "id":(?P<videoID>[\d]+).*? - "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"''' - thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"' - thumb_match=re.search(thumb_RE,webpage) - info_match=re.search(info_RE,webpage,re.VERBOSE) - video_id=info_match.group('videoID') - mediaSlug=info_match.group('mediaSlug') - video_url=self._talk_video_link(mediaSlug) + title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>', + webpage, 'title') + json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>', + webpage, 'json data') + info = json.loads(json_data) + desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>', + webpage, 'description', flags = re.DOTALL) + + thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"', + webpage, 'thumbnail') info = { - 'id': video_id, - 'url': video_url, + 'id': info['id'], + 'url': info['htmlStreams'][-1]['file'], 'ext': 'mp4', 'title': title, - 'thumbnail': thumb_match.group('thumbnail') + 'thumbnail': thumbnail, + 'description': desc, } return info @@ -4001,6 +4014,64 @@ class ARDIE(InfoExtractor): info["url"] = stream["video_url"] return [info] +class ZDFIE(InfoExtractor): + _VALID_URL = r'^http://www\.zdf\.de\/ZDFmediathek\/(.*beitrag\/video\/)(?P<video_id>[^/\?]+)(?:\?.*)?' + _TITLE = r'<h1(?: class="beitragHeadline")?>(?P<title>.*)</h1>' + _MEDIA_STREAM = r'<a href="(?P<video_url>.+(?P<media_type>.streaming).+/zdf/(?P<quality>[^\/]+)/[^"]*)".+class="play".+>' + _MMS_STREAM = r'href="(?P<video_url>mms://[^"]*)"' + _RTSP_STREAM = r'(?P<video_url>rtsp://[^"]*.mp4)' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + if mobj is None: + raise ExtractorError(u'Invalid URL: %s' % url) + video_id = mobj.group('video_id') + + html = self._download_webpage(url, video_id) + streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)] + if streams is None: + raise ExtractorError(u'No media url found.') + + # s['media_type'] == 'wstreaming' -> use 'Windows Media Player' and mms url + # s['media_type'] == 'hstreaming' -> use 'Quicktime' and rtsp url + # choose first/default media type and highest quality for now + for s in streams: #find 300 - dsl1000mbit + if s['quality'] == '300' and s['media_type'] == 'wstreaming': + stream_=s + break + for s in streams: #find veryhigh - dsl2000mbit + if s['quality'] == 'veryhigh' and s['media_type'] == 'wstreaming': # 'hstreaming' - rtsp is not working + stream_=s + break + if stream_ is None: + raise ExtractorError(u'No stream found.') + + media_link = self._download_webpage(stream_['video_url'], video_id,'Get stream URL') + + self.report_extraction(video_id) + mobj = re.search(self._TITLE, html) + if mobj is None: + raise ExtractorError(u'Cannot extract title') + title = unescapeHTML(mobj.group('title')) + + mobj = re.search(self._MMS_STREAM, media_link) + if mobj is None: + mobj = re.search(self._RTSP_STREAM, media_link) + if mobj is None: + raise ExtractorError(u'Cannot extract mms:// or rtsp:// URL') + mms_url = mobj.group('video_url') + + mobj = re.search('(.*)[.](?P<ext>[^.]+)', mms_url) + if mobj is None: + raise ExtractorError(u'Cannot extract extention') + ext = mobj.group('ext') + + return [{'id': video_id, + 'url': mms_url, + 'title': title, + 'ext': ext + }] + class TumblrIE(InfoExtractor): _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)' @@ -4394,6 +4465,115 @@ class HypemIE(InfoExtractor): 'artist': artist, }] +class Vbox7IE(InfoExtractor): + """Information Extractor for Vbox7""" + _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)' + + def _real_extract(self,url): + mobj = re.match(self._VALID_URL, url) + if mobj is None: + raise ExtractorError(u'Invalid URL: %s' % url) + video_id = mobj.group(1) + + redirect_page, urlh = self._download_webpage_handle(url, video_id) + new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location') + redirect_url = urlh.geturl() + new_location + webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page') + + title = self._html_search_regex(r'<title>(.*)', + webpage, u'title').split('/')[0].strip() + + ext = "flv" + info_url = "http://vbox7.com/play/magare.do" + data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id}) + info_request = compat_urllib_request.Request(info_url, data) + info_request.add_header('Content-Type', 'application/x-www-form-urlencoded') + info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage') + if info_response is None: + raise ExtractorError(u'Unable to extract the media url') + (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&')) + + return [{ + 'id': video_id, + 'url': final_url, + 'ext': ext, + 'title': title, + 'thumbnail': thumbnail_url, + }] + +class GametrailersIE(InfoExtractor): + _VALID_URL = r'http://www.gametrailers.com/(?Pvideos|reviews|full-episodes)/(?P.*?)/(?P.*)' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + if mobj is None: + raise ExtractorError(u'Invalid URL: %s' % url) + video_id = mobj.group('id') + video_type = mobj.group('type') + webpage = self._download_webpage(url, video_id) + if video_type == 'full-episodes': + mgid_re = r'data-video="(?P<mgid>mgid:.*?)"' + else: + mgid_re = r'data-contentId=\'(?P<mgid>mgid:.*?)\'' + mgid = self._search_regex(mgid_re, webpage, u'mgid') + data = compat_urllib_parse.urlencode({'uri': mgid, 'acceptMethods': 'fms'}) + + info_page = self._download_webpage('http://www.gametrailers.com/feeds/mrss?' + data, + video_id, u'Downloading video info') + links_webpage = self._download_webpage('http://www.gametrailers.com/feeds/mediagen/?' + data, + video_id, u'Downloading video urls info') + + self.report_extraction(video_id) + info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]>.* + .*?)\]\]>.* + .* + (?P.*?).* + ''' + + m_info = re.search(info_re, info_page, re.VERBOSE|re.DOTALL) + if m_info is None: + raise ExtractorError(u'Unable to extract video info') + video_title = m_info.group('title') + video_description = m_info.group('description') + video_thumb = m_info.group('thumb') + + m_urls = list(re.finditer(r'(?P.*)', links_webpage)) + if m_urls is None or len(m_urls) == 0: + raise ExtractError(u'Unable to extrat video url') + # They are sorted from worst to best quality + video_url = m_urls[-1].group('url') + + return {'url': video_url, + 'id': video_id, + 'title': video_title, + # Videos are actually flv not mp4 + 'ext': 'flv', + 'thumbnail': video_thumb, + 'description': video_description, + } + +class StatigrIE(InfoExtractor): + _VALID_URL = r'(?:http://)?(?:www\.)?statigr\.am/p/([^/]+)' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + if mobj is None: + raise ExtractorError(u'Invalid URL: %s' % url) + video_id = mobj.group(1) + webpage = self._download_webpage(url, video_id) + video_url = re.search(r'',webpage).group(1) + thumbnail_url = re.search(r'',webpage).group(1) + title = (re.search(r'(.+?)',webpage).group(1)).strip("| Statigram") + uploader = re.search(r'@(.+) \(Videos\)',title).group(1) + ext = "mp4" + return [{ + 'id': video_id, + 'url': video_url, + 'ext': ext, + 'title': title, + 'thumbnail': thumbnail_url, + 'uploader' : uploader + }] def gen_extractors(): """ Return a list of an instance of every supported extractor. @@ -4448,6 +4628,7 @@ def gen_extractors(): SpiegelIE(), LiveLeakIE(), ARDIE(), + ZDFIE(), TumblrIE(), BandcampIE(), RedTubeIE(), @@ -4458,6 +4639,9 @@ def gen_extractors(): TeamcocoIE(), XHamsterIE(), HypemIE(), + Vbox7IE(), + GametrailersIE(), + StatigrIE(), GenericIE() ]