X-Git-Url: http://git.cielonegro.org/gitweb.cgi?a=blobdiff_plain;f=youtube_dl%2FInfoExtractors.py;h=a7fdf1607c4c73e4caef623a1598065e84b3a834;hb=d281274bf250065f876bb4f75fb6f711e1a26eba;hp=6328332a7d3f0813c410072e7b10156b40bbeb55;hpb=e5f30ade100b33127f31dd8989585a87e6faa6e4;p=youtube-dl.git diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index 6328332a7..a7fdf1607 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -48,7 +48,7 @@ class InfoExtractor(object): uploader_id: Nickname or id of the video uploader. location: Physical location of the video. player_url: SWF Player URL (used for rtmpdump). - subtitles: The .srt file contents. + subtitles: The subtitle file contents. urlhandle: [internal] The urlHandle to be used to download the file, like returned by urllib.request.urlopen @@ -126,8 +126,36 @@ class InfoExtractor(object): def _download_webpage(self, url_or_request, video_id, note=None, errnote=None): """ Returns the data of the page as a string """ urlh = self._request_webpage(url_or_request, video_id, note, errnote) + content_type = urlh.headers.get('Content-Type', '') + m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type) + if m: + encoding = m.group(1) + else: + encoding = 'utf-8' webpage_bytes = urlh.read() - return webpage_bytes.decode('utf-8', 'replace') + return webpage_bytes.decode(encoding, 'replace') + + #Methods for following #608 + #They set the correct value of the '_type' key + def video_result(self, video_info): + """Returns a video""" + video_info['_type'] = 'video' + return video_info + def url_result(self, url, ie=None): + """Returns a url that points to a page that should be processed""" + #TODO: ie should be the class used for getting the info + video_info = {'_type': 'url', + 'url': url} + return video_info + def playlist_result(self, entries, playlist_id=None, playlist_title=None): + """Returns a playlist""" + video_info = {'_type': 'playlist', + 'entries': entries} + if playlist_id: + video_info['id'] = playlist_id + if playlist_title: + video_info['title'] = playlist_title + return video_info class YoutubeIE(InfoExtractor): @@ -218,7 +246,16 @@ class YoutubeIE(InfoExtractor): def report_video_subtitles_download(self, video_id): """Report attempt to download video info webpage.""" - self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id) + self._downloader.to_screen(u'[youtube] %s: Checking available subtitles' % video_id) + + def report_video_subtitles_request(self, video_id, sub_lang, format): + """Report attempt to download video info webpage.""" + self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format)) + + def report_video_subtitles_available(self, video_id, sub_lang_list): + """Report available subtitles.""" + sub_lang = ",".join(list(sub_lang_list.keys())) + self._downloader.to_screen(u'[youtube] %s: Available subtitles for video: %s' % (video_id, sub_lang)) def report_information_extraction(self, video_id): """Report attempt to extract video information.""" @@ -232,55 +269,75 @@ class YoutubeIE(InfoExtractor): """Indicate the download will use the RTMP protocol.""" self._downloader.to_screen(u'[youtube] RTMP download detected') - def _closed_captions_xml_to_srt(self, xml_string): - srt = '' - texts = re.findall(r'([^<]+)', xml_string, re.MULTILINE) - # TODO parse xml instead of regex - for n, (start, dur_tag, dur, caption) in enumerate(texts): - if not dur: dur = '4' - start = float(start) - end = start + float(dur) - start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000) - end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000) - caption = unescapeHTML(caption) - caption = unescapeHTML(caption) # double cycle, intentional - srt += str(n+1) + '\n' - srt += start + ' --> ' + end + '\n' - srt += caption + '\n\n' - return srt - - def _extract_subtitles(self, video_id): + def _get_available_subtitles(self, video_id): self.report_video_subtitles_download(video_id) request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id) try: - srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8') + sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8') except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None) - srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list) - srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list) - if not srt_lang_list: - return (u'WARNING: video has no closed captions', None) - if self._downloader.params.get('subtitleslang', False): - srt_lang = self._downloader.params.get('subtitleslang') - elif 'en' in srt_lang_list: - srt_lang = 'en' - else: - srt_lang = list(srt_lang_list.keys())[0] - if not srt_lang in srt_lang_list: - return (u'WARNING: no closed captions found in the specified language', None) + return (u'unable to download video subtitles: %s' % compat_str(err), None) + sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list) + sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list) + if not sub_lang_list: + return (u'video doesn\'t have subtitles', None) + return sub_lang_list + + def _list_available_subtitles(self, video_id): + sub_lang_list = self._get_available_subtitles(video_id) + self.report_video_subtitles_available(video_id, sub_lang_list) + + def _request_subtitle(self, sub_lang, sub_name, video_id, format): + """ + Return tuple: + (error_message, sub_lang, sub) + """ + self.report_video_subtitles_request(video_id, sub_lang, format) params = compat_urllib_parse.urlencode({ - 'lang': srt_lang, - 'name': srt_lang_list[srt_lang].encode('utf-8'), + 'lang': sub_lang, + 'name': sub_name, 'v': video_id, + 'fmt': format, }) url = 'http://www.youtube.com/api/timedtext?' + params try: - srt_xml = compat_urllib_request.urlopen(url).read().decode('utf-8') + sub = compat_urllib_request.urlopen(url).read().decode('utf-8') except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None) - if not srt_xml: - return (u'WARNING: Did not fetch video subtitles', None) - return (None, self._closed_captions_xml_to_srt(srt_xml)) + return (u'unable to download video subtitles: %s' % compat_str(err), None, None) + if not sub: + return (u'Did not fetch video subtitles', None, None) + return (None, sub_lang, sub) + + def _extract_subtitle(self, video_id): + """ + Return a list with a tuple: + [(error_message, sub_lang, sub)] + """ + sub_lang_list = self._get_available_subtitles(video_id) + sub_format = self._downloader.params.get('subtitlesformat') + if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles + return [(sub_lang_list[0], None, None)] + if self._downloader.params.get('subtitleslang', False): + sub_lang = self._downloader.params.get('subtitleslang') + elif 'en' in sub_lang_list: + sub_lang = 'en' + else: + sub_lang = list(sub_lang_list.keys())[0] + if not sub_lang in sub_lang_list: + return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)] + + subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format) + return [subtitle] + + def _extract_all_subtitles(self, video_id): + sub_lang_list = self._get_available_subtitles(video_id) + sub_format = self._downloader.params.get('subtitlesformat') + if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles + return [(sub_lang_list[0], None, None)] + subtitles = [] + for sub_lang in sub_lang_list: + subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format) + subtitles.append(subtitle) + return subtitles def _print_formats(self, formats): print('Available formats:') @@ -467,7 +524,7 @@ class YoutubeIE(InfoExtractor): if mobj is not None: video_uploader_id = mobj.group(1) else: - self._downloader.trouble(u'WARNING: unable to extract uploader nickname') + self._downloader.report_warning(u'unable to extract uploader nickname') # title if 'title' not in video_info: @@ -477,7 +534,7 @@ class YoutubeIE(InfoExtractor): # thumbnail image if 'thumbnail_url' not in video_info: - self._downloader.trouble(u'WARNING: unable to extract video thumbnail') + self._downloader.report_warning(u'unable to extract video thumbnail') video_thumbnail = '' else: # don't panic if we can't find it video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0]) @@ -501,15 +558,29 @@ class YoutubeIE(InfoExtractor): else: video_description = '' - # closed captions + # subtitles video_subtitles = None + if self._downloader.params.get('writesubtitles', False): - (srt_error, video_subtitles) = self._extract_subtitles(video_id) - if srt_error: - self._downloader.trouble(srt_error) + video_subtitles = self._extract_subtitle(video_id) + if video_subtitles: + (sub_error, sub_lang, sub) = video_subtitles[0] + if sub_error: + self._downloader.report_error(sub_error) + + if self._downloader.params.get('allsubtitles', False): + video_subtitles = self._extract_all_subtitles(video_id) + for video_subtitle in video_subtitles: + (sub_error, sub_lang, sub) = video_subtitle + if sub_error: + self._downloader.report_error(sub_error) + + if self._downloader.params.get('listsubtitles', False): + sub_lang_list = self._list_available_subtitles(video_id) + return if 'length_seconds' not in video_info: - self._downloader.trouble(u'WARNING: unable to extract video duration') + self._downloader.report_warning(u'unable to extract video duration') video_duration = '' else: video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]) @@ -652,8 +723,7 @@ class MetacafeIE(InfoExtractor): # Check if video comes from YouTube mobj2 = re.match(r'^yt-(.*)$', video_id) if mobj2 is not None: - self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)]) - return + return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1))] # Retrieve video webpage to extract further information request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id) @@ -785,7 +855,7 @@ class DailymotionIE(InfoExtractor): # lookin for official user mobj_official = re.search(r'', webpage) if mobj_official is None: - self._downloader.trouble(u'WARNING: unable to extract uploader nickname') + self._downloader.report_warning(u'unable to extract uploader nickname') else: video_uploader = mobj_official.group(1) else: @@ -1281,7 +1351,8 @@ class GenericIE(InfoExtractor): def report_download_webpage(self, video_id): """Report webpage download.""" - self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.') + if not self._downloader.params.get('test', False): + self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.') self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id) def report_extraction(self, video_id): @@ -1293,7 +1364,7 @@ class GenericIE(InfoExtractor): self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url) def _test_redirect(self, url): - """Check if it is a redirect, like url shorteners, in case restart chain.""" + """Check if it is a redirect, like url shorteners, in case return the new url.""" class HeadRequest(compat_urllib_request.Request): def get_method(self): return "HEAD" @@ -1344,20 +1415,15 @@ class GenericIE(InfoExtractor): return False self.report_following_redirect(new_url) - self._downloader.download([new_url]) - return True + return new_url def _real_extract(self, url): - if self._test_redirect(url): return + new_url = self._test_redirect(url) + if new_url: return [self.url_result(new_url)] video_id = url.split('/')[-1] - request = compat_urllib_request.Request(url) try: - self.report_download_webpage(video_id) - webpage = compat_urllib_request.urlopen(request).read() - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err)) - return + webpage = self._download_webpage(url, video_id) except ValueError as err: # since this is the last-resort InfoExtractor, if # this error is thrown, it'll be thrown here @@ -1677,9 +1743,7 @@ class YoutubePlaylistIE(InfoExtractor): (?: (?:course|view_play_list|my_playlists|artist|playlist|watch) \? (?:.*?&)*? (?:p|a|list)= - | user/.*?/user/ | p/ - | user/.*?#[pg]/c/ ) ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,}) .* @@ -1742,23 +1806,9 @@ class YoutubePlaylistIE(InfoExtractor): page_num += 1 videos = [v[1] for v in sorted(videos)] - total = len(videos) - - playliststart = self._downloader.params.get('playliststart', 1) - 1 - playlistend = self._downloader.params.get('playlistend', -1) - if playlistend == -1: - videos = videos[playliststart:] - else: - videos = videos[playliststart:playlistend] - if len(videos) == total: - self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total)) - else: - self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(videos))) - - for video in videos: - self._downloader.download([video]) - return + url_results = [self.url_result(url) for url in videos] + return [self.playlist_result(url_results, playlist_id)] class YoutubeChannelIE(InfoExtractor): @@ -1808,9 +1858,9 @@ class YoutubeChannelIE(InfoExtractor): self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids))) - for id in video_ids: - self._downloader.download(['http://www.youtube.com/watch?v=%s' % id]) - return + urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids] + url_entries = [self.url_result(url) for url in urls] + return [self.playlist_result(url_entries, channel_id)] class YoutubeUserIE(InfoExtractor): @@ -1880,20 +1930,9 @@ class YoutubeUserIE(InfoExtractor): pagenum += 1 - all_ids_count = len(video_ids) - playliststart = self._downloader.params.get('playliststart', 1) - 1 - playlistend = self._downloader.params.get('playlistend', -1) - - if playlistend == -1: - video_ids = video_ids[playliststart:] - else: - video_ids = video_ids[playliststart:playlistend] - - self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" % - (username, all_ids_count, len(video_ids))) - - for video_id in video_ids: - self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id]) + urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids] + url_results = [self.url_result(url) for url in urls] + return [self.playlist_result(url_results, playlist_title = username)] class BlipTVUserIE(InfoExtractor): @@ -1971,20 +2010,12 @@ class BlipTVUserIE(InfoExtractor): pagenum += 1 - all_ids_count = len(video_ids) - playliststart = self._downloader.params.get('playliststart', 1) - 1 - playlistend = self._downloader.params.get('playlistend', -1) - - if playlistend == -1: - video_ids = video_ids[playliststart:] - else: - video_ids = video_ids[playliststart:playlistend] - self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" % (self.IE_NAME, username, all_ids_count, len(video_ids))) - for video_id in video_ids: - self._downloader.download([u'http://blip.tv/'+video_id]) + urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids] + url_entries = [self.url_result(url) for url in urls] + return [self.playlist_result(url_entries, playlist_title = username)] class DepositFilesIE(InfoExtractor): @@ -2272,7 +2303,7 @@ class MyVideoIE(InfoExtractor): webpage = self._download_webpage(webpage_url, video_id) self.report_extraction(video_id) - mobj = re.search(r'', + mobj = re.search(r'\s+(?P.*?)</a>", webpage) + m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL) if not m: self._downloader.trouble(u'Cannot find video title') - title = unescapeHTML(m.group('title')) + title = clean_html(m.group('title')) m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage) if m: @@ -3654,6 +3766,62 @@ class UstreamIE(InfoExtractor): } return [info] +class WorldStarHipHopIE(InfoExtractor): + _VALID_URL = r'http://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)' + IE_NAME = u'WorldStarHipHop' + + def _real_extract(self, url): + _src_url = r"""(http://hw-videos.*(?:mp4|flv))""" + + webpage_src = compat_urllib_request.urlopen(url).read() + webpage_src = webpage_src.decode('utf-8') + + mobj = re.search(_src_url, webpage_src) + + m = re.match(self._VALID_URL, url) + video_id = m.group('id') + + if mobj is not None: + video_url = mobj.group() + if 'mp4' in video_url: + ext = 'mp4' + else: + ext = 'flv' + else: + self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id) + return + + _title = r"""<title>(.*)""" + + mobj = re.search(_title, webpage_src) + + if mobj is not None: + title = mobj.group(1) + else: + title = 'World Start Hip Hop - %s' % time.ctime() + + _thumbnail = r"""rel="image_src" href="(.*)" />""" + mobj = re.search(_thumbnail, webpage_src) + + # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video. + if mobj is not None: + thumbnail = mobj.group(1) + else: + _title = r"""candytitles.*>(.*)""" + mobj = re.search(_title, webpage_src) + if mobj is not None: + title = mobj.group(1) + thumbnail = None + + results = [{ + 'id': video_id, + 'url' : video_url, + 'title' : title, + 'thumbnail' : thumbnail, + 'ext' : ext, + }] + return results + class RBMARadioIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P[^/]+)$' @@ -3953,11 +4121,11 @@ class KeekIE(InfoExtractor): webpage = self._download_webpage(url, video_id) m = re.search(r'[\s\n]+

(?P\w+)

', webpage) - uploader = unescapeHTML(m.group('uploader')) + m = re.search(r'
[\S\s]+?

(?P.+?)

', webpage) + uploader = clean_html(m.group('uploader')) info = { - 'id':video_id, - 'url':video_url, + 'id': video_id, + 'url': video_url, 'ext': 'mp4', 'title': title, 'thumbnail': thumbnail, @@ -4018,7 +4186,7 @@ class TEDIE(InfoExtractor): videoName=m.group('name') webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName) # If the url includes the language we get the title translated - title_RE=r'

(?P.*)</span></h1>' + title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>' title=re.search(title_RE, webpage).group('title') info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?) "id":(?P<videoID>[\d]+).*? @@ -4094,6 +4262,89 @@ class MySpassIE(InfoExtractor): } return [info] +class SpiegelIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$' + + def _real_extract(self, url): + m = re.match(self._VALID_URL, url) + video_id = m.group('videoID') + + webpage = self._download_webpage(url, video_id) + m = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage) + if not m: + raise ExtractorError(u'Cannot find title') + video_title = unescapeHTML(m.group(1)) + + xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml' + xml_code = self._download_webpage(xml_url, video_id, + note=u'Downloading XML', errnote=u'Failed to download XML') + + idoc = xml.etree.ElementTree.fromstring(xml_code) + last_type = idoc[-1] + filename = last_type.findall('./filename')[0].text + duration = float(last_type.findall('./duration')[0].text) + + video_url = 'http://video2.spiegel.de/flash/' + filename + video_ext = filename.rpartition('.')[2] + info = { + 'id': video_id, + 'url': video_url, + 'ext': video_ext, + 'title': video_title, + 'duration': duration, + } + return [info] + +class LiveLeakIE(InfoExtractor): + + _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)' + IE_NAME = u'liveleak' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + if mobj is None: + self._downloader.trouble(u'ERROR: invalid URL: %s' % url) + return + + video_id = mobj.group('video_id') + + webpage = self._download_webpage(url, video_id) + + m = re.search(r'file: "(.*?)",', webpage) + if not m: + self._downloader.report_error(u'unable to find video url') + return + video_url = m.group(1) + + m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage) + if not m: + self._downloader.trouble(u'Cannot find video title') + title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip() + + m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage) + if m: + desc = unescapeHTML(m.group('desc')) + else: + desc = None + + m = re.search(r'By:.*?(\w+)</a>', webpage) + if m: + uploader = clean_html(m.group(1)) + else: + uploader = None + + info = { + 'id': video_id, + 'url': video_url, + 'ext': 'mp4', + 'title': title, + 'description': desc, + 'uploader': uploader + } + + return [info] + + def gen_extractors(): """ Return a list of an instance of every supported extractor. The order does matter; the first extractor matched is the one handling the URL. @@ -4120,6 +4371,7 @@ def gen_extractors(): EscapistIE(), CollegeHumorIE(), XVideosIE(), + SoundcloudSetIE(), SoundcloudIE(), InfoQIE(), MixcloudIE(), @@ -4133,6 +4385,7 @@ def gen_extractors(): GooglePlusIE(), ArteTvIE(), NBAIE(), + WorldStarHipHopIE(), JustinTVIE(), FunnyOrDieIE(), SteamIE(), @@ -4142,7 +4395,7 @@ def gen_extractors(): KeekIE(), TEDIE(), MySpassIE(), + SpiegelIE(), + LiveLeakIE(), GenericIE() ] - -