X-Git-Url: http://git.cielonegro.org/gitweb.cgi?a=blobdiff_plain;f=youtube_dl%2FInfoExtractors.py;h=4aec8c6879e79ccc13cf5e9bcfbf9abc17c4d1d1;hb=01ba4b80a732308e8da66ba89bac9273181db1ad;hp=2f926f24363947d38c129af8828cfab27f3dad7f;hpb=0251f9c9c085234505b8a65b066ff54052d5fcdb;p=youtube-dl.git diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index 2f926f243..4aec8c687 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -420,7 +420,7 @@ class YoutubeIE(InfoExtractor): def _request_automatic_caption(self, video_id, webpage): """We need the webpage for getting the captions url, pass it as an argument to speed up the process.""" - sub_lang = self._downloader.params.get('subtitleslang') + sub_lang = self._downloader.params.get('subtitleslang') or 'en' sub_format = self._downloader.params.get('subtitlesformat') self.to_screen(u'%s: Looking for automatic captions' % video_id) mobj = re.search(r';ytplayer.config = ({.*?});', webpage) @@ -699,14 +699,14 @@ class YoutubeIE(InfoExtractor): pass else: # We report the original error - self._downloader.report_error(sub_error) + self._downloader.report_warning(sub_error) if self._downloader.params.get('allsubtitles', False): video_subtitles = self._extract_all_subtitles(video_id) for video_subtitle in video_subtitles: (sub_error, sub_lang, sub) = video_subtitle if sub_error: - self._downloader.report_error(sub_error) + self._downloader.report_warning(sub_error) if self._downloader.params.get('listsubtitles', False): sub_lang_list = self._list_available_subtitles(video_id) @@ -732,8 +732,11 @@ class YoutubeIE(InfoExtractor): for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','): url_data = compat_parse_qs(url_data_str) if 'itag' in url_data and 'url' in url_data: - url = url_data['url'][0] + '&signature=' + url_data['sig'][0] - if not 'ratebypass' in url: url += '&ratebypass=yes' + url = url_data['url'][0] + if 'sig' in url_data: + url += '&signature=' + url_data['sig'][0] + if 'ratebypass' not in url: + url += '&ratebypass=yes' url_map[url_data['itag'][0]] = url format_limit = self._downloader.params.get('format_limit', None) @@ -940,16 +943,10 @@ class DailymotionIE(InfoExtractor): video_title = unescapeHTML(mobj.group('title')) video_uploader = None - mobj = re.search(r'(?im)[^<]+?]+?>([^<]+?)', webpage) - if mobj is None: - # lookin for official user - mobj_official = re.search(r'', webpage) - if mobj_official is None: - self._downloader.report_warning(u'unable to extract uploader nickname') - else: - video_uploader = mobj_official.group(1) - else: - video_uploader = mobj.group(1) + video_uploader = self._search_regex([r'(?im)[^<]+?]+?>([^<]+?)', + # Looking for official user + r'<(?:span|a) .*?rel="author".*?>([^<]+?)([0-9]{2})-([0-9]{2})-([0-9]{4})', webpage) @@ -1412,6 +1409,13 @@ class GenericIE(InfoExtractor): if mobj is None: # Try to find twitter cards info mobj = re.search(r'\d+)/? (?P\d*)(?P\??) #For urltype == video we sometimes get the videoID """ + _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/' + _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' @classmethod def suitable(cls, url): @@ -3374,11 +3381,19 @@ class SteamIE(InfoExtractor): def _real_extract(self, url): m = re.match(self._VALID_URL, url, re.VERBOSE) gameID = m.group('gameID') - videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % gameID - self.report_age_confirmation() + + videourl = self._VIDEO_PAGE_TEMPLATE % gameID webpage = self._download_webpage(videourl, gameID) - game_title = re.search(r'', webpage).group('game_title') - + + if re.search('

Please enter your birth date to continue:

', webpage) is not None: + videourl = self._AGECHECK_TEMPLATE % gameID + self.report_age_confirmation() + webpage = self._download_webpage(videourl, gameID) + + self.report_extraction(gameID) + game_title = self._html_search_regex(r'', + webpage, 'game title') + urlRE = r"'movie_(?P\d+)': \{\s*FILENAME: \"(?P[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P[\w:/\.\?=\+-]+)\")?\s*\}," mweb = re.finditer(urlRE, webpage) namesRE = r'(?P.+?)' @@ -3484,8 +3499,8 @@ class RBMARadioIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - json_data = self._search_regex(r'', - webpage, u'json data') + json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$', + webpage, u'json data', flags=re.MULTILINE) try: data = json.loads(json_data) @@ -3788,10 +3803,6 @@ class TEDIE(InfoExtractor): self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name)) return [self._playlist_videos_info(url,name,playlist_id)] - def _talk_video_link(self,mediaSlug): - '''Returns the video link for that mediaSlug''' - return 'http://download.ted.com/talks/%s.mp4' % mediaSlug - def _playlist_videos_info(self,url,name,playlist_id=0): '''Returns the videos of the playlist''' video_RE=r''' @@ -3804,9 +3815,8 @@ class TEDIE(InfoExtractor): m_videos=re.finditer(video_RE,webpage,re.VERBOSE) m_names=re.finditer(video_name_RE,webpage) - playlist_RE = r'div class="headline">(\s*?)

(\s*?)(?P.*?)' - m_playlist = re.search(playlist_RE, webpage) - playlist_title = m_playlist.group('playlist_title') + playlist_title = self._html_search_regex(r'div class="headline">\s*?

\s*?(.*?)', + webpage, 'playlist title') playlist_entries = [] for m_video, m_name in zip(m_videos,m_names): @@ -3817,27 +3827,28 @@ class TEDIE(InfoExtractor): def _talk_info(self, url, video_id=0): """Return the video for the talk in the url""" - m=re.match(self._VALID_URL, url,re.VERBOSE) - videoName=m.group('name') - webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName) + m = re.match(self._VALID_URL, url,re.VERBOSE) + video_name = m.group('name') + webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name) + self.report_extraction(video_name) # If the url includes the language we get the title translated - title_RE=r'(?P.*)</span>' - title=re.search(title_RE, webpage).group('title') - info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?) - "id":(?P<videoID>[\d]+).*? - "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"''' - thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"' - thumb_match=re.search(thumb_RE,webpage) - info_match=re.search(info_RE,webpage,re.VERBOSE) - video_id=info_match.group('videoID') - mediaSlug=info_match.group('mediaSlug') - video_url=self._talk_video_link(mediaSlug) + title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>', + webpage, 'title') + json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>', + webpage, 'json data') + info = json.loads(json_data) + desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>', + webpage, 'description', flags = re.DOTALL) + + thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"', + webpage, 'thumbnail') info = { - 'id': video_id, - 'url': video_url, + 'id': info['id'], + 'url': info['htmlStreams'][-1]['file'], 'ext': 'mp4', 'title': title, - 'thumbnail': thumb_match.group('thumbnail') + 'thumbnail': thumbnail, + 'description': desc, } return info @@ -4541,6 +4552,29 @@ class GametrailersIE(InfoExtractor): 'description': video_description, } +class StatigrIE(InfoExtractor): + _VALID_URL = r'(?:http://)?(?:www\.)?statigr\.am/p/([^/]+)' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + if mobj is None: + raise ExtractorError(u'Invalid URL: %s' % url) + video_id = mobj.group(1) + webpage = self._download_webpage(url, video_id) + video_url = re.search(r'<meta property="og:video:secure_url" content="(.+?)">',webpage).group(1) + thumbnail_url = re.search(r'<meta property="og:image" content="(.+?)" />',webpage).group(1) + title = (re.search(r'<title>(.+?)',webpage).group(1)).strip("| Statigram") + uploader = re.search(r'@(.+) \(Videos\)',title).group(1) + ext = "mp4" + return [{ + 'id': video_id, + 'url': video_url, + 'ext': ext, + 'title': title, + 'thumbnail': thumbnail_url, + 'uploader' : uploader + }] + def gen_extractors(): """ Return a list of an instance of every supported extractor. The order does matter; the first extractor matched is the one handling the URL. @@ -4607,6 +4641,7 @@ def gen_extractors(): HypemIE(), Vbox7IE(), GametrailersIE(), + StatigrIE(), GenericIE() ]