From: Jaime Marquínez Ferrándiz Date: Mon, 23 Feb 2015 16:13:03 +0000 (+0100) Subject: Merge branch 'subtitles-rework' X-Git-Url: http://git.cielonegro.org/gitweb.cgi?a=commitdiff_plain;h=bfc993cc9183d5f001e30267551bcdf9f0a98be9;hp=-c;p=youtube-dl.git Merge branch 'subtitles-rework' (Closes PR #4964) --- bfc993cc9183d5f001e30267551bcdf9f0a98be9 diff --combined Makefile index 573c82685,07c90c225..708732956 --- a/Makefile +++ b/Makefile @@@ -1,7 -1,7 +1,7 @@@ all: youtube-dl README.md CONTRIBUTING.md README.txt youtube-dl.1 youtube-dl.bash-completion youtube-dl.zsh youtube-dl.fish supportedsites clean: - rm -rf youtube-dl.1.temp.md youtube-dl.1 youtube-dl.bash-completion README.txt MANIFEST build/ dist/ .coverage cover/ youtube-dl.tar.gz youtube-dl.zsh youtube-dl.fish *.dump *.part *.info.json *.mp4 *.flv *.mp3 CONTRIBUTING.md.tmp youtube-dl youtube-dl.exe + rm -rf youtube-dl.1.temp.md youtube-dl.1 youtube-dl.bash-completion README.txt MANIFEST build/ dist/ .coverage cover/ youtube-dl.tar.gz youtube-dl.zsh youtube-dl.fish *.dump *.part *.info.json *.mp4 *.flv *.mp3 *.avi CONTRIBUTING.md.tmp youtube-dl youtube-dl.exe PREFIX ?= /usr/local BINDIR ?= $(PREFIX)/bin @@@ -43,7 -43,7 +43,7 @@@ test ot: offlinetest offlinetest: codetest - nosetests --verbose test --exclude test_download --exclude test_age_restriction --exclude test_subtitles --exclude test_write_annotations --exclude test_youtube_lists + nosetests --verbose test --exclude test_download.py --exclude test_age_restriction.py --exclude test_subtitles.py --exclude test_write_annotations.py --exclude test_youtube_lists.py tar: youtube-dl.tar.gz diff --combined youtube_dl/YoutubeDL.py index ca7c3f5c6,70b364c9b..76fc394bc --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@@ -154,7 -154,7 +154,7 @@@ class YoutubeDL(object) allsubtitles: Downloads all the subtitles of the video (requires writesubtitles or writeautomaticsub) listsubtitles: Lists all available subtitles for the video - subtitlesformat: Subtitle format [srt/sbv/vtt] (default=srt) + subtitlesformat: The format code for subtitles subtitleslangs: List of languages of the subtitles to download keepvideo: Keep the video file after post-processing daterange: A DateRange object, download only if the upload_date is in the range. @@@ -199,25 -199,18 +199,25 @@@ postprocessor. progress_hooks: A list of functions that get called on download progress, with a dictionary with the entries - * status: One of "downloading" and "finished". + * status: One of "downloading", "error", or "finished". Check this first and ignore unknown values. - If status is one of "downloading" or "finished", the + If status is one of "downloading", or "finished", the following properties may also be present: * filename: The final filename (always present) + * tmpfilename: The filename we're currently writing to * downloaded_bytes: Bytes on disk * total_bytes: Size of the whole file, None if unknown - * tmpfilename: The filename we're currently writing to + * total_bytes_estimate: Guess of the eventual file size, + None if unavailable. + * elapsed: The number of seconds since download started. * eta: The estimated time in seconds, None if unknown * speed: The download speed in bytes/second, None if unknown + * fragment_index: The counter of the currently + downloaded video fragment. + * fragment_count: The number of fragments (= individual + files that will be merged) Progress hooks are guaranteed to be called at least once (with status "finished") if the download is successful. 
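
For illustration, a minimal sketch of how an embedding script could consume the progress_hooks contract documented above (youtube_dl is assumed to be importable; the video URL is only a placeholder):

    from __future__ import unicode_literals

    import youtube_dl


    def progress_hook(d):
        # Per the docstring above: check 'status' first, ignore unknown values.
        status = d.get('status')
        if status == 'downloading':
            # total_bytes may be None; fall back to the estimate if present
            total = d.get('total_bytes') or d.get('total_bytes_estimate')
            print('downloading %s: %s of %s bytes (eta %s s, %s B/s)' % (
                d.get('filename'), d.get('downloaded_bytes'), total,
                d.get('eta'), d.get('speed')))
        elif status == 'finished':
            print('finished %s' % d.get('filename'))
        elif status == 'error':
            print('error while downloading %s' % d.get('filename'))


    with youtube_dl.YoutubeDL({'progress_hooks': [progress_hook]}) as ydl:
        ydl.download(['https://www.youtube.com/watch?v=BaW_jenozKc'])  # placeholder URL
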
@@@ -232,6 -225,7 +232,6 @@@ call_home: Boolean, true iff we are allowed to contact the youtube-dl servers for debugging. sleep_interval: Number of seconds to sleep before each download. - external_downloader: Executable of the external downloader to call. listformats: Print an overview of available video formats and exit. list_thumbnails: Print a table of all thumbnails and exit. match_filter: A function that gets called with the info_dict of @@@ -241,10 -235,6 +241,10 @@@ match_filter_func in utils.py is one example for this. no_color: Do not emit color codes in output. + The following options determine which downloader is picked: + external_downloader: Executable of the external downloader to call. + None or unset for standard (built-in) downloader. + hls_prefer_native: Use the native HLS downloader instead of ffmpeg/avconv. The following parameters are not used by YoutubeDL itself, they are used by the FileDownloader: @@@ -308,8 -298,8 +308,8 @@@ raise if (sys.version_info >= (3,) and sys.platform != 'win32' and - sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968'] - and not params.get('restrictfilenames', False)): + sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968'] and + not params.get('restrictfilenames', False)): # On Python 3, the Unicode filesystem API will throw errors (#1474) self.report_warning( 'Assuming --restrict-filenames since file system encoding ' @@@ -961,9 -951,30 +961,9 @@@ return res def _calc_cookies(self, info_dict): - class _PseudoRequest(object): - def __init__(self, url): - self.url = url - self.headers = {} - self.unverifiable = False - - def add_unredirected_header(self, k, v): - self.headers[k] = v - - def get_full_url(self): - return self.url - - def is_unverifiable(self): - return self.unverifiable - - def has_header(self, h): - return h in self.headers - - def get_header(self, h, default=None): - return self.headers.get(h, default) - - pr = _PseudoRequest(info_dict['url']) + pr = compat_urllib_request.Request(info_dict['url']) self.cookiejar.add_cookie_header(pr) - return pr.headers.get('Cookie') + return pr.get_header('Cookie') def process_video_result(self, info_dict, download=True): assert info_dict.get('_type', 'video') == 'video' @@@ -1008,6 -1019,15 +1008,15 @@@ info_dict['timestamp']) info_dict['upload_date'] = upload_date.strftime('%Y%m%d') + if self.params.get('listsubtitles', False): + if 'automatic_captions' in info_dict: + self.list_subtitles(info_dict['id'], info_dict.get('automatic_captions'), 'automatic captions') + self.list_subtitles(info_dict['id'], info_dict.get('subtitles'), 'subtitles') + return + info_dict['requested_subtitles'] = self.process_subtitles( + info_dict['id'], info_dict.get('subtitles'), + info_dict.get('automatic_captions')) + # This extractors handle format selection themselves if info_dict['extractor'] in ['Youku']: if download: @@@ -1136,6 -1156,55 +1145,55 @@@ info_dict.update(formats_to_download[-1]) return info_dict + def process_subtitles(self, video_id, normal_subtitles, automatic_captions): + """Select the requested subtitles and their format""" + available_subs = {} + if normal_subtitles and self.params.get('writesubtitles'): + available_subs.update(normal_subtitles) + if automatic_captions and self.params.get('writeautomaticsub'): + for lang, cap_info in automatic_captions.items(): + if lang not in available_subs: + available_subs[lang] = cap_info + + if (not self.params.get('writesubtitles') and not + self.params.get('writeautomaticsub') or not + available_subs): + return None + + if 
self.params.get('allsubtitles', False): + requested_langs = available_subs.keys() + else: + if self.params.get('subtitleslangs', False): + requested_langs = self.params.get('subtitleslangs') + elif 'en' in available_subs: + requested_langs = ['en'] + else: + requested_langs = [list(available_subs.keys())[0]] + + formats_query = self.params.get('subtitlesformat', 'best') + formats_preference = formats_query.split('/') if formats_query else [] + subs = {} + for lang in requested_langs: + formats = available_subs.get(lang) + if formats is None: + self.report_warning('%s subtitles not available for %s' % (lang, video_id)) + continue + for ext in formats_preference: + if ext == 'best': + f = formats[-1] + break + matches = list(filter(lambda f: f['ext'] == ext, formats)) + if matches: + f = matches[-1] + break + else: + f = formats[-1] + self.report_warning( + 'No subtitle format found matching "%s" for language %s, ' + 'using %s' % (formats_query, lang, f['ext'])) + subs[lang] = f + return subs + def process_info(self, info_dict): """Process a single resolved IE result.""" @@@ -1238,15 -1307,22 +1296,22 @@@ subtitles_are_requested = any([self.params.get('writesubtitles', False), self.params.get('writeautomaticsub')]) - if subtitles_are_requested and 'subtitles' in info_dict and info_dict['subtitles']: + if subtitles_are_requested and info_dict.get('requested_subtitles'): # subtitles download errors are already managed as troubles in relevant IE # that way it will silently go on when used with unsupporting IE - subtitles = info_dict['subtitles'] - sub_format = self.params.get('subtitlesformat', 'srt') - for sub_lang in subtitles.keys(): - sub = subtitles[sub_lang] - if sub is None: - continue + subtitles = info_dict['requested_subtitles'] + for sub_lang, sub_info in subtitles.items(): + sub_format = sub_info['ext'] + if sub_info.get('data') is not None: + sub_data = sub_info['data'] + else: + try: + uf = self.urlopen(sub_info['url']) + sub_data = uf.read().decode('utf-8') + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + self.report_warning('Unable to download subtitle for "%s": %s' % + (sub_lang, compat_str(err))) + continue try: sub_filename = subtitles_filename(filename, sub_lang, sub_format) if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)): @@@ -1254,7 -1330,7 +1319,7 @@@ else: self.to_screen('[info] Writing video subtitles to: ' + sub_filename) with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8') as subfile: - subfile.write(sub) + subfile.write(sub_data) except (OSError, IOError): self.report_error('Cannot write subtitles file ' + sub_filename) return @@@ -1287,7 -1363,7 +1352,7 @@@ downloaded = [] success = True merger = FFmpegMergerPP(self, not self.params.get('keepvideo')) - if not merger._executable: + if not merger.available: postprocessors = [] self.report_warning('You have requested multiple ' 'formats but ffmpeg or avconv are not installed.' 
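
The format-preference handling introduced in process_subtitles() above can be shown with a small self-contained sketch; the subtitle entries below are invented, only the selection logic mirrors the hunk:

    # Each language maps to a list of candidate formats, sorted from lower to
    # higher preference; every entry carries 'ext' plus either 'url' or 'data'.
    available_subs = {
        'en': [{'ext': 'ted', 'url': 'http://example.com/en.ted'},
               {'ext': 'srt', 'url': 'http://example.com/en.srt'}],
        'fr': [{'ext': 'vtt', 'url': 'http://example.com/fr.vtt'}],
    }


    def pick_subtitle(formats, formats_query='srt/best'):
        # Mirrors YoutubeDL.process_subtitles(): try each requested ext in turn;
        # 'best' (or no match at all) falls back to the most preferred entry.
        for ext in formats_query.split('/'):
            if ext == 'best':
                return formats[-1]
            matches = [f for f in formats if f['ext'] == ext]
            if matches:
                return matches[-1]
        return formats[-1]


    for lang, formats in sorted(available_subs.items()):
        print('%s -> %s' % (lang, pick_subtitle(formats)['ext']))
    # en -> srt, fr -> vtt
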
@@@ -1366,8 -1442,8 +1431,8 @@@ """Download a given list of URLs.""" outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL) if (len(url_list) > 1 and - '%' not in outtmpl - and self.params.get('max_downloads') != 1): + '%' not in outtmpl and + self.params.get('max_downloads') != 1): raise SameFileError(outtmpl) for url in url_list: @@@ -1534,18 -1610,29 +1599,18 @@@ return res def list_formats(self, info_dict): - def line(format, idlen=20): - return (('%-' + compat_str(idlen + 1) + 's%-10s%-12s%s') % ( - format['format_id'], - format['ext'], - self.format_resolution(format), - self._format_note(format), - )) - formats = info_dict.get('formats', [info_dict]) - idlen = max(len('format code'), - max(len(f['format_id']) for f in formats)) - formats_s = [ - line(f, idlen) for f in formats + table = [ + [f['format_id'], f['ext'], self.format_resolution(f), self._format_note(f)] + for f in formats if f.get('preference') is None or f['preference'] >= -1000] if len(formats) > 1: - formats_s[-1] += (' ' if self._format_note(formats[-1]) else '') + '(best)' + table[-1][-1] += (' ' if table[-1][-1] else '') + '(best)' - header_line = line({ - 'format_id': 'format code', 'ext': 'extension', - 'resolution': 'resolution', 'format_note': 'note'}, idlen=idlen) + header_line = ['format code', 'extension', 'resolution', 'note'] self.to_screen( - '[info] Available formats for %s:\n%s\n%s' % - (info_dict['id'], header_line, '\n'.join(formats_s))) + '[info] Available formats for %s:\n%s' % + (info_dict['id'], render_table(header_line, table))) def list_thumbnails(self, info_dict): thumbnails = info_dict.get('thumbnails') @@@ -1564,6 -1651,17 +1629,17 @@@ ['ID', 'width', 'height', 'URL'], [[t['id'], t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails])) + def list_subtitles(self, video_id, subtitles, name='subtitles'): + if not subtitles: + self.to_screen('%s has no %s' % (video_id, name)) + return + self.to_screen( + 'Available %s for %s:' % (name, video_id)) + self.to_screen(render_table( + ['Language', 'formats'], + [[lang, ', '.join(f['ext'] for f in reversed(formats))] + for lang, formats in subtitles.items()])) + def urlopen(self, req): """ Start an HTTP download """ @@@ -1625,7 -1723,7 +1701,7 @@@ self._write_string('[debug] Python version %s - %s\n' % ( platform.python_version(), platform_name())) - exe_versions = FFmpegPostProcessor.get_versions() + exe_versions = FFmpegPostProcessor.get_versions(self) exe_versions['rtmpdump'] = rtmpdump_version() exe_str = ', '.join( '%s %s' % (exe, v) diff --combined youtube_dl/__init__.py index 25ab3fdfe,5f2585003..5ce201800 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@@ -189,14 -189,14 +189,14 @@@ def _real_main(argv=None) # In Python 2, sys.argv is a bytestring (also note http://bugs.python.org/issue2128 for Windows systems) if opts.outtmpl is not None: opts.outtmpl = opts.outtmpl.decode(preferredencoding()) - outtmpl = ((opts.outtmpl is not None and opts.outtmpl) - or (opts.format == '-1' and opts.usetitle and '%(title)s-%(id)s-%(format)s.%(ext)s') - or (opts.format == '-1' and '%(id)s-%(format)s.%(ext)s') - or (opts.usetitle and opts.autonumber and '%(autonumber)s-%(title)s-%(id)s.%(ext)s') - or (opts.usetitle and '%(title)s-%(id)s.%(ext)s') - or (opts.useid and '%(id)s.%(ext)s') - or (opts.autonumber and '%(autonumber)s-%(id)s.%(ext)s') - or DEFAULT_OUTTMPL) + outtmpl = ((opts.outtmpl is not None and opts.outtmpl) or + (opts.format == '-1' and opts.usetitle and '%(title)s-%(id)s-%(format)s.%(ext)s') or + 
(opts.format == '-1' and '%(id)s-%(format)s.%(ext)s') or + (opts.usetitle and opts.autonumber and '%(autonumber)s-%(title)s-%(id)s.%(ext)s') or + (opts.usetitle and '%(title)s-%(id)s.%(ext)s') or + (opts.useid and '%(id)s.%(ext)s') or + (opts.autonumber and '%(autonumber)s-%(id)s.%(ext)s') or + DEFAULT_OUTTMPL) if not os.path.splitext(outtmpl)[1] and opts.extractaudio: parser.error('Cannot download a video and extract audio into the same' ' file! Use "{0}.%(ext)s" instead of "{0}" as the output' @@@ -226,7 -226,6 +226,6 @@@ if opts.embedsubtitles: postprocessors.append({ 'key': 'FFmpegEmbedSubtitle', - 'subtitlesformat': opts.subtitlesformat, }) if opts.xattrs: postprocessors.append({'key': 'XAttrMetadata'}) @@@ -350,8 -349,6 +349,8 @@@ 'xattr_set_filesize': opts.xattr_set_filesize, 'match_filter': match_filter, 'no_color': opts.no_color, + 'ffmpeg_location': opts.ffmpeg_location, + 'hls_prefer_native': opts.hls_prefer_native, } with YoutubeDL(ydl_opts) as ydl: diff --combined youtube_dl/extractor/common.py index 79f6d199b,7d8ce1808..87fce9cd8 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@@ -27,6 -27,7 +27,6 @@@ from ..utils import compiled_regex_type, ExtractorError, float_or_none, - HEADRequest, int_or_none, RegexNotFoundError, sanitize_filename, @@@ -150,8 -151,14 +150,14 @@@ class InfoExtractor(object) If not explicitly set, calculated from timestamp. uploader_id: Nickname or id of the video uploader. location: Physical location where the video was filmed. - subtitles: The subtitle file contents as a dictionary in the format - {language: subtitles}. + subtitles: The available subtitles as a dictionary in the format + {language: subformats}. "subformats" is a list sorted from + lower to higher preference, each element is a dictionary + with the "ext" entry and one of: + * "data": The subtitles file contents + * "url": A url pointing to the subtitles file + automatic_captions: Like 'subtitles', used by the YoutubeIE for + automatically generated captions duration: Length of the video in seconds, as an integer. view_count: How many users have watched the video on the platform. like_count: Number of positive ratings of the video @@@ -391,16 -398,6 +397,16 @@@ if blocked_iframe: msg += ' Visit %s for more details' % blocked_iframe raise ExtractorError(msg, expected=True) + if 'The URL you requested has been blocked' in content[:512]: + msg = ( + 'Access to this webpage has been blocked by Indian censorship. ' + 'Use a VPN or proxy server (with --proxy) to route around it.') + block_msg = self._html_search_regex( + r'

</h1><p>(.*?)</p>

', + content, 'block message', default=None) + if block_msg: + msg += ' (Message: "%s")' % block_msg.replace('\n', ' ') + raise ExtractorError(msg, expected=True) return content @@@ -738,7 -735,6 +744,7 @@@ f.get('language_preference') if f.get('language_preference') is not None else -1, f.get('quality') if f.get('quality') is not None else -1, f.get('tbr') if f.get('tbr') is not None else -1, + f.get('filesize') if f.get('filesize') is not None else -1, f.get('vbr') if f.get('vbr') is not None else -1, f.get('height') if f.get('height') is not None else -1, f.get('width') if f.get('width') is not None else -1, @@@ -746,6 -742,7 +752,6 @@@ f.get('abr') if f.get('abr') is not None else -1, audio_ext_preference, f.get('fps') if f.get('fps') is not None else -1, - f.get('filesize') if f.get('filesize') is not None else -1, f.get('filesize_approx') if f.get('filesize_approx') is not None else -1, f.get('source_preference') if f.get('source_preference') is not None else -1, f.get('format_id'), @@@ -762,7 -759,9 +768,7 @@@ def _is_valid_url(self, url, video_id, item='video'): try: - self._request_webpage( - HEADRequest(url), video_id, - 'Checking %s URL' % item) + self._request_webpage(url, video_id, 'Checking %s URL' % item) return True except ExtractorError as e: if isinstance(e.cause, compat_HTTPError): @@@ -808,8 -807,8 +814,8 @@@ media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media') for i, media_el in enumerate(media_nodes): if manifest_version == '2.0': - manifest_url = ('/'.join(manifest_url.split('/')[:-1]) + '/' - + (media_el.attrib.get('href') or media_el.attrib.get('url'))) + manifest_url = ('/'.join(manifest_url.split('/')[:-1]) + '/' + + (media_el.attrib.get('href') or media_el.attrib.get('url'))) tbr = int_or_none(media_el.attrib.get('bitrate')) formats.append({ 'format_id': '-'.join(filter(None, [f4m_id, 'f4m-%d' % (i if tbr is None else tbr)])), @@@ -833,7 -832,7 +839,7 @@@ 'url': m3u8_url, 'ext': ext, 'protocol': 'm3u8', - 'preference': -1, + 'preference': preference - 1 if preference else -1, 'resolution': 'multiple', 'format_note': 'Quality selection URL', }] @@@ -848,7 -847,6 +854,7 @@@ note='Downloading m3u8 information', errnote='Failed to download m3u8 information') last_info = None + last_media = None kv_rex = re.compile( r'(?P[a-zA-Z_-]+)=(?P"[^"]+"|[^",]+)(?:,|$)') for line in m3u8_doc.splitlines(): @@@ -859,13 -857,6 +865,13 @@@ if v.startswith('"'): v = v[1:-1] last_info[m.group('key')] = v + elif line.startswith('#EXT-X-MEDIA:'): + last_media = {} + for m in kv_rex.finditer(line): + v = m.group('val') + if v.startswith('"'): + v = v[1:-1] + last_media[m.group('key')] = v elif line.startswith('#') or not line.strip(): continue else: @@@ -894,9 -885,6 +900,9 @@@ width_str, height_str = resolution.split('x') f['width'] = int(width_str) f['height'] = int(height_str) + if last_media is not None: + f['m3u8_media'] = last_media + last_media = None formats.append(f) last_info = {} self._sort_formats(formats) @@@ -1011,6 -999,24 +1017,24 @@@ any_restricted = any_restricted or is_restricted return not any_restricted + def extract_subtitles(self, *args, **kwargs): + if (self._downloader.params.get('writesubtitles', False) or + self._downloader.params.get('listsubtitles')): + return self._get_subtitles(*args, **kwargs) + return {} + + def _get_subtitles(self, *args, **kwargs): + raise NotImplementedError("This method must be implemented by subclasses") + + def extract_automatic_captions(self, *args, **kwargs): + if (self._downloader.params.get('writeautomaticsub', 
False) or + self._downloader.params.get('listsubtitles')): + return self._get_automatic_captions(*args, **kwargs) + return {} + + def _get_automatic_captions(self, *args, **kwargs): + raise NotImplementedError("This method must be implemented by subclasses") + class SearchInfoExtractor(InfoExtractor): """ diff --combined youtube_dl/extractor/dailymotion.py index b2dbf4a92,4ca892926..42b20a46d --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@@ -6,7 -6,6 +6,6 @@@ import jso import itertools from .common import InfoExtractor - from .subtitles import SubtitlesInfoExtractor from ..compat import ( compat_str, @@@ -31,7 -30,7 +30,7 @@@ class DailymotionBaseInfoExtractor(Info return request - class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor): + class DailymotionIE(DailymotionBaseInfoExtractor): """Information Extractor for Dailymotion""" _VALID_URL = r'(?i)(?:https?://)?(?:(www|touch)\.)?dailymotion\.[a-z]{2,3}/(?:(embed|#)/)?video/(?P[^/?_]+)' @@@ -143,9 -142,6 +142,6 @@@ # subtitles video_subtitles = self.extract_subtitles(video_id, webpage) - if self._downloader.params.get('listsubtitles', False): - self._list_available_subtitles(video_id, webpage) - return view_count = str_to_int(self._search_regex( r'video_views_count[^>]+>\s+([\d\.,]+)', @@@ -169,7 -165,7 +165,7 @@@ 'view_count': view_count, } - def _get_available_subtitles(self, video_id, webpage): + def _get_subtitles(self, video_id, webpage): try: sub_list = self._download_webpage( 'https://api.dailymotion.com/video/%s/subtitles?fields=id,language,url' % video_id, @@@ -179,7 -175,7 +175,7 @@@ return {} info = json.loads(sub_list) if (info['total'] > 0): - sub_lang_list = dict((l['language'], l['url']) for l in info['list']) + sub_lang_list = dict((l['language'], [{'url': l['url'], 'ext': 'srt'}]) for l in info['list']) return sub_lang_list self._downloader.report_warning('video doesn\'t have subtitles') return {} @@@ -194,7 -190,6 +190,7 @@@ class DailymotionPlaylistIE(Dailymotion 'url': 'http://www.dailymotion.com/playlist/xv4bw_nqtv_sport/1#video=xl8v3q', 'info_dict': { 'title': 'SPORT', + 'id': 'xv4bw_nqtv_sport', }, 'playlist_mincount': 20, }] diff --combined youtube_dl/extractor/ted.py index 59678399d,0c38c8f89..4cec06f8b --- a/youtube_dl/extractor/ted.py +++ b/youtube_dl/extractor/ted.py @@@ -3,14 -3,14 +3,14 @@@ from __future__ import unicode_literal import json import re - from .subtitles import SubtitlesInfoExtractor + from .common import InfoExtractor from ..compat import ( compat_str, ) - class TEDIE(SubtitlesInfoExtractor): + class TEDIE(InfoExtractor): _VALID_URL = r'''(?x) (?Phttps?://) (?Pwww|embed(?:-ssl)?)(?P\.ted\.com/ @@@ -83,22 -83,6 +83,22 @@@ 'params': { 'skip_download': True, }, + }, { + # YouTube video + 'url': 'http://www.ted.com/talks/jeffrey_kluger_the_sibling_bond', + 'add_ie': ['Youtube'], + 'info_dict': { + 'id': 'aFBIPO-P7LM', + 'ext': 'mp4', + 'title': 'The hidden power of siblings: Jeff Kluger at TEDxAsheville', + 'description': 'md5:3d7a4f50d95ca5dd67104e2a20f43fe1', + 'uploader': 'TEDx Talks', + 'uploader_id': 'TEDxTalks', + 'upload_date': '20111216', + }, + 'params': { + 'skip_download': True, + }, }] _NATIVE_FORMATS = { @@@ -148,16 -132,11 +148,16 @@@ talk_info = self._extract_info(webpage)['talks'][0] - if talk_info.get('external') is not None: - self.to_screen('Found video from %s' % talk_info['external']['service']) + external = talk_info.get('external') + if external: + service = external['service'] + self.to_screen('Found video from %s' 
% service) + ext_url = None + if service.lower() == 'youtube': + ext_url = external.get('code') return { '_type': 'url', - 'url': talk_info['external']['uri'], + 'url': ext_url or external['uri'], } formats = [{ @@@ -184,11 -163,6 +184,6 @@@ self._sort_formats(formats) video_id = compat_str(talk_info['id']) - # subtitles - video_subtitles = self.extract_subtitles(video_id, talk_info) - if self._downloader.params.get('listsubtitles', False): - self._list_available_subtitles(video_id, talk_info) - return thumbnail = talk_info['thumb'] if not thumbnail.startswith('http'): @@@ -199,21 -173,25 +194,25 @@@ 'uploader': talk_info['speaker'], 'thumbnail': thumbnail, 'description': self._og_search_description(webpage), - 'subtitles': video_subtitles, + 'subtitles': self._get_subtitles(video_id, talk_info), 'formats': formats, 'duration': talk_info.get('duration'), } - def _get_available_subtitles(self, video_id, talk_info): + def _get_subtitles(self, video_id, talk_info): languages = [lang['languageCode'] for lang in talk_info.get('languages', [])] if languages: sub_lang_list = {} for l in languages: - url = 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/srt' % (video_id, l) - sub_lang_list[l] = url + sub_lang_list[l] = [ + { + 'url': 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/%s' % (video_id, l, ext), + 'ext': ext, + } + for ext in ['ted', 'srt'] + ] return sub_lang_list else: - self._downloader.report_warning('video doesn\'t have subtitles') return {} def _watch_info(self, url, name): diff --combined youtube_dl/extractor/theplatform.py index f7b34bd26,5f24189cc..feac666f7 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@@ -8,7 -8,7 +8,7 @@@ import binasci import hashlib - from .subtitles import SubtitlesInfoExtractor + from .common import InfoExtractor from ..compat import ( compat_str, ) @@@ -22,7 -22,7 +22,7 @@@ from ..utils import _x = lambda p: xpath_with_ns(p, {'smil': 'http://www.w3.org/2005/SMIL21/Language'}) - class ThePlatformIE(SubtitlesInfoExtractor): + class ThePlatformIE(InfoExtractor): _VALID_URL = r'''(?x) (?:https?://(?:link|player)\.theplatform\.com/[sp]/(?P[^/]+)/ (?P(?:[^/\?]+/(?:swf|config)|onsite)/select/)? 
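
On the extractor side, the SubtitlesInfoExtractor mixin is gone: an extractor now implements _get_subtitles() (and, where it applies, _get_automatic_captions()) and calls extract_subtitles(), which only does work when subtitles were actually requested. A hypothetical minimal extractor, with example.com and every value invented purely for illustration:

    from __future__ import unicode_literals

    from youtube_dl.extractor.common import InfoExtractor


    class ExampleIE(InfoExtractor):
        _VALID_URL = r'https?://example\.com/video/(?P<id>[0-9]+)'

        def _get_subtitles(self, video_id):
            # Returns {language: subformats}, lower to higher preference;
            # each entry has 'ext' and either 'url' or 'data'.
            return {
                'en': [{
                    'ext': 'srt',
                    'url': 'http://example.com/subs/%s.en.srt' % video_id,
                }],
            }

        def _real_extract(self, url):
            video_id = self._match_id(url)
            return {
                'id': video_id,
                'title': 'Example video %s' % video_id,
                'url': 'http://example.com/video/%s.mp4' % video_id,
                'ext': 'mp4',
                # Only calls _get_subtitles() if --write-sub/--list-subs was given.
                'subtitles': self.extract_subtitles(video_id),
            }
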
@@@ -71,9 -71,7 +71,9 @@@ if not provider_id: provider_id = 'dJ5BDC' - if mobj.group('config'): + if smuggled_data.get('force_smil_url', False): + smil_url = url + elif mobj.group('config'): config_url = url + '&form=json' config_url = config_url.replace('swf/', 'config/') config_url = config_url.replace('onsite/', 'onsite/config/') @@@ -106,15 -104,11 +106,11 @@@ captions = info.get('captions') if isinstance(captions, list): for caption in captions: - lang, src = caption.get('lang'), caption.get('src') - if lang and src: - subtitles[lang] = src - - if self._downloader.params.get('listsubtitles', False): - self._list_available_subtitles(video_id, subtitles) - return - - subtitles = self.extract_subtitles(video_id, subtitles) + lang, src, mime = caption.get('lang', 'en'), caption.get('src'), caption.get('type') + subtitles[lang] = [{ + 'ext': 'srt' if mime == 'text/srt' else 'ttml', + 'url': src, + }] head = meta.find(_x('smil:head')) body = meta.find(_x('smil:body')) diff --combined youtube_dl/extractor/vimeo.py index 4cd2f73d9,5930d5984..8f540f578 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@@ -4,10 -4,8 +4,9 @@@ from __future__ import unicode_literal import json import re import itertools +import hashlib from .common import InfoExtractor - from .subtitles import SubtitlesInfoExtractor from ..compat import ( compat_HTTPError, compat_urllib_parse, @@@ -19,7 -17,6 +18,7 @@@ from ..utils import InAdvancePagedList, int_or_none, RegexNotFoundError, + smuggle_url, std_headers, unsmuggle_url, urlencode_postdata, @@@ -53,7 -50,7 +52,7 @@@ class VimeoBaseInfoExtractor(InfoExtrac self._download_webpage(login_request, None, False, 'Wrong login info') - class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor): + class VimeoIE(VimeoBaseInfoExtractor): """Information extractor for vimeo.com.""" # _VALID_URL matches Vimeo URLs @@@ -176,7 -173,7 +175,7 @@@ def _verify_video_password(self, url, video_id, webpage): password = self._downloader.params.get('videopassword', None) if password is None: - raise ExtractorError('This video is protected by a password, use the --video-password option') + raise ExtractorError('This video is protected by a password, use the --video-password option', expected=True) token = self._search_regex(r'xsrft: \'(.*?)\'', webpage, 'login token') data = compat_urllib_parse.urlencode({ 'password': password, @@@ -226,11 -223,6 +225,11 @@@ if mobj.group('pro') or mobj.group('player'): url = 'http://player.vimeo.com/video/' + video_id + password = self._downloader.params.get('videopassword', None) + if password: + headers['Cookie'] = '%s_password=%s' % ( + video_id, hashlib.md5(password.encode('utf-8')).hexdigest()) + # Retrieve video webpage to extract further information request = compat_urllib_request.Request(url, None, headers) try: @@@ -274,11 -266,8 +273,11 @@@ raise ExtractorError('The author has restricted the access to this video, try with the "--referer" option') if re.search(r']+?id="pw_form"', webpage) is not None: + if data and '_video_password_verified' in data: + raise ExtractorError('video password verification failed!') self._verify_video_password(url, video_id, webpage) - return self._real_extract(url) + return self._real_extract( + smuggle_url(url, {'_video_password_verified': 'verified'})) else: raise ExtractorError('Unable to extract info section', cause=e) @@@ -378,12 -367,10 +377,10 @@@ text_tracks = config['request'].get('text_tracks') if text_tracks: for tt in text_tracks: - subtitles[tt['lang']] = 'http://vimeo.com' + 
tt['url'] - - video_subtitles = self.extract_subtitles(video_id, subtitles) - if self._downloader.params.get('listsubtitles', False): - self._list_available_subtitles(video_id, subtitles) - return + subtitles[tt['lang']] = [{ + 'ext': 'vtt', + 'url': 'http://vimeo.com' + tt['url'], + }] return { 'id': video_id, @@@ -399,7 -386,7 +396,7 @@@ 'view_count': view_count, 'like_count': like_count, 'comment_count': comment_count, - 'subtitles': video_subtitles, + 'subtitles': subtitles, } @@@ -411,7 -398,6 +408,7 @@@ class VimeoChannelIE(InfoExtractor) _TESTS = [{ 'url': 'http://vimeo.com/channels/tributes', 'info_dict': { + 'id': 'tributes', 'title': 'Vimeo Tributes', }, 'playlist_mincount': 25, @@@ -490,7 -476,6 +487,7 @@@ class VimeoUserIE(VimeoChannelIE) 'url': 'http://vimeo.com/nkistudio/videos', 'info_dict': { 'title': 'Nki', + 'id': 'nkistudio', }, 'playlist_mincount': 66, }] @@@ -508,7 -493,6 +505,7 @@@ class VimeoAlbumIE(VimeoChannelIE) _TESTS = [{ 'url': 'http://vimeo.com/album/2632481', 'info_dict': { + 'id': '2632481', 'title': 'Staff Favorites: November 2013', }, 'playlist_mincount': 13, @@@ -539,7 -523,6 +536,7 @@@ class VimeoGroupsIE(VimeoAlbumIE) _TESTS = [{ 'url': 'http://vimeo.com/groups/rolexawards', 'info_dict': { + 'id': 'rolexawards', 'title': 'Rolex Awards for Enterprise', }, 'playlist_mincount': 73, @@@ -622,7 -605,6 +619,7 @@@ class VimeoLikesIE(InfoExtractor) 'url': 'https://vimeo.com/user755559/likes/', 'playlist_mincount': 293, "info_dict": { + 'id': 'user755559_likes', "description": "See all the videos urza likes", "title": 'Videos urza likes', }, diff --combined youtube_dl/extractor/youtube.py index 3d3d43491,1b2dbf276..22db896b1 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@@ -11,7 -11,6 +11,6 @@@ import tim import traceback from .common import InfoExtractor, SearchInfoExtractor - from .subtitles import SubtitlesInfoExtractor from ..jsinterp import JSInterpreter from ..swfinterp import SWFInterpreter from ..compat import ( @@@ -185,7 -184,7 +184,7 @@@ class YoutubeBaseInfoExtractor(InfoExtr return - class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): + class YoutubeIE(YoutubeBaseInfoExtractor): IE_DESC = 'YouTube.com' _VALID_URL = r"""(?x)^ ( @@@ -541,30 -540,26 +540,30 @@@ if cache_spec is not None: return lambda s: ''.join(s[i] for i in cache_spec) + download_note = ( + 'Downloading player %s' % player_url + if self._downloader.params.get('verbose') else + 'Downloading %s player %s' % (player_type, player_id) + ) if player_type == 'js': code = self._download_webpage( player_url, video_id, - note='Downloading %s player %s' % (player_type, player_id), + note=download_note, errnote='Download of %s failed' % player_url) res = self._parse_sig_js(code) elif player_type == 'swf': urlh = self._request_webpage( player_url, video_id, - note='Downloading %s player %s' % (player_type, player_id), + note=download_note, errnote='Download of %s failed' % player_url) code = urlh.read() res = self._parse_sig_swf(code) else: assert False, 'Invalid player type %r' % player_type - if cache_spec is None: - test_string = ''.join(map(compat_chr, range(len(example_sig)))) - cache_res = res(test_string) - cache_spec = [ord(c) for c in cache_res] + test_string = ''.join(map(compat_chr, range(len(example_sig)))) + cache_res = res(test_string) + cache_spec = [ord(c) for c in cache_res] self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec) return res @@@ -648,7 -643,7 +647,7 @@@ raise ExtractorError( 'Signature extraction failed: 
' + tb, cause=e) - def _get_available_subtitles(self, video_id, webpage): + def _get_subtitles(self, video_id, webpage): try: subs_doc = self._download_xml( 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id, @@@ -662,23 -657,27 +661,27 @@@ lang = track.attrib['lang_code'] if lang in sub_lang_list: continue - params = compat_urllib_parse.urlencode({ - 'lang': lang, - 'v': video_id, - 'fmt': self._downloader.params.get('subtitlesformat', 'srt'), - 'name': track.attrib['name'].encode('utf-8'), - }) - url = 'https://www.youtube.com/api/timedtext?' + params - sub_lang_list[lang] = url + sub_formats = [] + for ext in ['sbv', 'vtt', 'srt']: + params = compat_urllib_parse.urlencode({ + 'lang': lang, + 'v': video_id, + 'fmt': ext, + 'name': track.attrib['name'].encode('utf-8'), + }) + sub_formats.append({ + 'url': 'https://www.youtube.com/api/timedtext?' + params, + 'ext': ext, + }) + sub_lang_list[lang] = sub_formats if not sub_lang_list: self._downloader.report_warning('video doesn\'t have subtitles') return {} return sub_lang_list - def _get_available_automatic_caption(self, video_id, webpage): + def _get_automatic_captions(self, video_id, webpage): """We need the webpage for getting the captions url, pass it as an argument to speed up the process.""" - sub_format = self._downloader.params.get('subtitlesformat', 'srt') self.to_screen('%s: Looking for automatic captions' % video_id) mobj = re.search(r';ytplayer.config = ({.*?});', webpage) err_msg = 'Couldn\'t find automatic captions for %s' % video_id @@@ -708,14 -707,20 +711,20 @@@ sub_lang_list = {} for lang_node in caption_list.findall('target'): sub_lang = lang_node.attrib['lang_code'] - params = compat_urllib_parse.urlencode({ - 'lang': original_lang, - 'tlang': sub_lang, - 'fmt': sub_format, - 'ts': timestamp, - 'kind': caption_kind, - }) - sub_lang_list[sub_lang] = caption_url + '&' + params + sub_formats = [] + for ext in ['sbv', 'vtt', 'srt']: + params = compat_urllib_parse.urlencode({ + 'lang': original_lang, + 'tlang': sub_lang, + 'fmt': ext, + 'ts': timestamp, + 'kind': caption_kind, + }) + sub_formats.append({ + 'url': caption_url + '&' + params, + 'ext': ext, + }) + sub_lang_list[sub_lang] = sub_formats return sub_lang_list # An extractor error can be raise by the download process if there are # no automatic captions but there are subtitles @@@ -970,10 -975,7 +979,7 @@@ # subtitles video_subtitles = self.extract_subtitles(video_id, video_webpage) - - if self._downloader.params.get('listsubtitles', False): - self._list_available_subtitles(video_id, video_webpage) - return + automatic_captions = self.extract_automatic_captions(video_id, video_webpage) if 'length_seconds' not in video_info: self._downloader.report_warning('unable to extract video duration') @@@ -1122,6 -1124,7 +1128,7 @@@ 'description': video_description, 'categories': video_categories, 'subtitles': video_subtitles, + 'automatic_captions': automatic_captions, 'duration': video_duration, 'age_limit': 18 if age_gate else 0, 'annotations': video_annotations, diff --combined youtube_dl/options.py index 5f678f76b,4fcf8c83d..5c2d153b1 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@@ -387,8 -387,8 +387,8 @@@ def parseOpts(overrideArguments=None) help='lists all available subtitles for the video') subtitles.add_option( '--sub-format', - action='store', dest='subtitlesformat', metavar='FORMAT', default='srt', - help='subtitle format (default=srt) ([sbv/vtt] youtube only)') + action='store', dest='subtitlesformat', metavar='FORMAT', default='best', 
+ help='subtitle format, accepts formats preference, for example: "ass/srt/best"') subtitles.add_option( '--sub-lang', '--sub-langs', '--srt-lang', action='callback', dest='subtitleslangs', metavar='LANGS', type='str', @@@ -424,10 -424,6 +424,10 @@@ '--xattr-set-filesize', dest='xattr_set_filesize', action='store_true', help='(experimental) set file xattribute ytdl.filesize with expected filesize') + downloader.add_option( + '--hls-prefer-native', + dest='hls_prefer_native', action='store_true', + help='(experimental) Use the native HLS downloader instead of ffmpeg.') downloader.add_option( '--external-downloader', dest='external_downloader', metavar='COMMAND', @@@ -739,10 -735,6 +739,10 @@@ '--prefer-ffmpeg', action='store_true', dest='prefer_ffmpeg', help='Prefer ffmpeg over avconv for running the postprocessors') + postproc.add_option( + '--ffmpeg-location', '--avconv-location', metavar='PATH', + dest='ffmpeg_location', + help='Location of the ffmpeg/avconv binary; either the path to the binary or its containing directory.') postproc.add_option( '--exec', metavar='CMD', dest='exec_cmd', diff --combined youtube_dl/postprocessor/ffmpeg.py index 3f2e6cf1d,e42298f0e..398fe050e --- a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@@ -30,95 -30,54 +30,95 @@@ class FFmpegPostProcessorError(PostProc class FFmpegPostProcessor(PostProcessor): def __init__(self, downloader=None, deletetempfiles=False): PostProcessor.__init__(self, downloader) - self._versions = self.get_versions() self._deletetempfiles = deletetempfiles + self._determine_executables() def check_version(self): - if not self._executable: + if not self.available: raise FFmpegPostProcessorError('ffmpeg or avconv not found. Please install one.') - required_version = '10-0' if self._uses_avconv() else '1.0' + required_version = '10-0' if self.basename == 'avconv' else '1.0' if is_outdated_version( - self._versions[self._executable], required_version): + self._versions[self.basename], required_version): warning = 'Your copy of %s is outdated, update %s to version %s or newer if you encounter any errors.' % ( - self._executable, self._executable, required_version) + self.basename, self.basename, required_version) if self._downloader: self._downloader.report_warning(warning) @staticmethod - def get_versions(): - programs = ['avprobe', 'avconv', 'ffmpeg', 'ffprobe'] - return dict((p, get_exe_version(p, args=['-version'])) for p in programs) - - @property - def available(self): - return self._executable is not None + def get_versions(downloader=None): + return FFmpegPostProcessor(downloader)._versions - @property - def _executable(self): - if self._downloader.params.get('prefer_ffmpeg', False): + def _determine_executables(self): + programs = ['avprobe', 'avconv', 'ffmpeg', 'ffprobe'] + prefer_ffmpeg = self._downloader.params.get('prefer_ffmpeg', False) + + self.basename = None + self.probe_basename = None + + self._paths = None + self._versions = None + if self._downloader: + location = self._downloader.params.get('ffmpeg_location') + if location is not None: + if not os.path.exists(location): + self._downloader.report_warning( + 'ffmpeg-location %s does not exist! ' + 'Continuing without avconv/ffmpeg.' % (location)) + self._versions = {} + return + elif not os.path.isdir(location): + basename = os.path.splitext(os.path.basename(location))[0] + if basename not in programs: + self._downloader.report_warning( + 'Cannot identify executable %s, its basename should be one of %s. 
' + 'Continuing without avconv/ffmpeg.' % + (location, ', '.join(programs))) + self._versions = {} + return None + location = os.path.dirname(os.path.abspath(location)) + if basename in ('ffmpeg', 'ffprobe'): + prefer_ffmpeg = True + + self._paths = dict( + (p, os.path.join(location, p)) for p in programs) + self._versions = dict( + (p, get_exe_version(self._paths[p], args=['-version'])) + for p in programs) + if self._versions is None: + self._versions = dict( + (p, get_exe_version(p, args=['-version'])) for p in programs) + self._paths = dict((p, p) for p in programs) + + if prefer_ffmpeg: prefs = ('ffmpeg', 'avconv') else: prefs = ('avconv', 'ffmpeg') for p in prefs: if self._versions[p]: - return p - return None + self.basename = p + break - @property - def _probe_executable(self): - if self._downloader.params.get('prefer_ffmpeg', False): + if prefer_ffmpeg: prefs = ('ffprobe', 'avprobe') else: prefs = ('avprobe', 'ffprobe') for p in prefs: if self._versions[p]: - return p - return None + self.probe_basename = p + break + + @property + def available(self): + return self.basename is not None - def _uses_avconv(self): - return self._executable == 'avconv' + @property + def executable(self): + return self._paths[self.basename] + + @property + def probe_executable(self): + return self._paths[self.probe_basename] def run_ffmpeg_multiple_files(self, input_paths, out_path, opts): self.check_version() @@@ -129,7 -88,7 +129,7 @@@ files_cmd = [] for path in input_paths: files_cmd.extend([encodeArgument('-i'), encodeFilename(path, True)]) - cmd = ([encodeFilename(self._executable, True), encodeArgument('-y')] + + cmd = ([encodeFilename(self.executable, True), encodeArgument('-y')] + files_cmd + [encodeArgument(o) for o in opts] + [encodeFilename(self._ffmpeg_filename_argument(out_path), True)]) @@@ -168,15 -127,13 +168,15 @@@ class FFmpegExtractAudioPP(FFmpegPostPr def get_audio_codec(self, path): - if not self._probe_executable: + if not self.probe_executable: raise PostProcessingError('ffprobe or avprobe not found. Please install one.') try: cmd = [ - encodeFilename(self._probe_executable, True), + encodeFilename(self.probe_executable, True), encodeArgument('-show_streams'), encodeFilename(self._ffmpeg_filename_argument(path), True)] + if self._downloader.params.get('verbose', False): + self._downloader.to_screen('[debug] %s command line: %s' % (self.basename, shell_quote(cmd))) handle = subprocess.Popen(cmd, stderr=compat_subprocess_get_DEVNULL(), stdout=subprocess.PIPE, stdin=subprocess.PIPE) output = handle.communicate()[0] if handle.wait() != 0: @@@ -266,14 -223,14 +266,14 @@@ if self._nopostoverwrites and os.path.exists(encodeFilename(new_path)): self._downloader.to_screen('[youtube] Post-process file %s exists, skipping' % new_path) else: - self._downloader.to_screen('[' + self._executable + '] Destination: ' + new_path) + self._downloader.to_screen('[' + self.basename + '] Destination: ' + new_path) self.run_ffmpeg(path, new_path, acodec, more_opts) except: etype, e, tb = sys.exc_info() if isinstance(e, AudioConversionError): msg = 'audio conversion failed: ' + e.msg else: - msg = 'error running ' + self._executable + msg = 'error running ' + self.basename raise PostProcessingError(msg) # Try to update the date time for extracted audio file. 
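
The ffmpeg_location handling added above can be probed from embedding code. A sketch in which /opt/ffmpeg/bin is a placeholder path (either the binary itself or its containing directory is accepted; a non-existent path merely triggers a warning and leaves the post-processors unavailable):

    from __future__ import unicode_literals

    from youtube_dl import YoutubeDL
    from youtube_dl.postprocessor.ffmpeg import FFmpegPostProcessor

    with YoutubeDL({'ffmpeg_location': '/opt/ffmpeg/bin'}) as ydl:  # placeholder path
        pp = FFmpegPostProcessor(ydl)
        print(pp.available)  # False if no usable ffmpeg/avconv was found
        print(FFmpegPostProcessor.get_versions(ydl))  # per-program versions, or {} 
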
@@@ -496,10 -453,6 +496,6 @@@ class FFmpegEmbedSubtitlePP(FFmpegPostP 'zu': 'zul', } - def __init__(self, downloader=None, subtitlesformat='srt'): - super(FFmpegEmbedSubtitlePP, self).__init__(downloader) - self._subformat = subtitlesformat - @classmethod def _conver_lang_code(cls, code): """Convert language code from ISO 639-1 to ISO 639-2/T""" @@@ -509,13 -462,14 +505,14 @@@ if information['ext'] != 'mp4': self._downloader.to_screen('[ffmpeg] Subtitles can only be embedded in mp4 files') return True, information - if not information.get('subtitles'): + subtitles = information.get('requested_subtitles') + if not subtitles: self._downloader.to_screen('[ffmpeg] There aren\'t any subtitles to embed') return True, information - sub_langs = [key for key in information['subtitles']] + sub_langs = list(subtitles.keys()) filename = information['filepath'] - input_files = [filename] + [subtitles_filename(filename, lang, self._subformat) for lang in sub_langs] + input_files = [filename] + [subtitles_filename(filename, lang, sub_info['ext']) for lang, sub_info in subtitles.items()] opts = [ '-map', '0',
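
End to end, the reworked subtitle options can be exercised either from the command line (for example: youtube-dl --write-sub --sub-lang en --sub-format "srt/best" --embed-subs URL) or from an embedding script. A sketch with a placeholder URL, forcing an mp4 download because FFmpegEmbedSubtitlePP only embeds into mp4 files:

    from __future__ import unicode_literals

    import youtube_dl

    ydl_opts = {
        'writesubtitles': True,
        'subtitleslangs': ['en'],
        'subtitlesformat': 'srt/best',
        'format': 'mp4',  # FFmpegEmbedSubtitlePP skips non-mp4 files
        'postprocessors': [{'key': 'FFmpegEmbedSubtitle'}],
    }
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        ydl.download(['https://www.youtube.com/watch?v=BaW_jenozKc'])  # placeholder URL
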