From: Jaime Marquínez Ferrándiz Date: Sat, 12 Apr 2014 13:52:42 +0000 (+0200) Subject: Merge branch 'atomicparsley' (closes #2436) X-Git-Url: http://git.cielonegro.org/gitweb.cgi?a=commitdiff_plain;h=77477fa4c916599e7eaa236a3f3eb5703923cf91;hp=a169e18ce16567ef123007a22758bcda48d07f92;p=youtube-dl.git Merge branch 'atomicparsley' (closes #2436) --- diff --git a/MANIFEST.in b/MANIFEST.in index 08be9af71..d43cc1f3b 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -3,5 +3,4 @@ include test/*.py include test/*.json include youtube-dl.bash-completion include youtube-dl.1 -recursive-include docs * -prune docs/_build +recursive-include docs Makefile conf.py *.rst diff --git a/README.md b/README.md index 95795c315..1ba1486d2 100644 --- a/README.md +++ b/README.md @@ -65,6 +65,7 @@ which means you can modify it, redistribute it or use it however you like. configuration in ~/.config/youtube-dl.conf (%APPDATA%/youtube-dl/config.txt on Windows) + --encoding ENCODING Force the specified encoding (experimental) ## Video Selection: --playlist-start NUMBER playlist video to start at (default is 1) @@ -169,6 +170,7 @@ which means you can modify it, redistribute it or use it however you like. ## Verbosity / Simulation Options: -q, --quiet activates quiet mode + --no-warnings Ignore warnings -s, --simulate do not download the video and do not write anything to disk --skip-download do not download the video @@ -180,7 +182,9 @@ which means you can modify it, redistribute it or use it however you like. --get-duration simulate, quiet but print video length --get-filename simulate, quiet but print output filename --get-format simulate, quiet but print output format - -j, --dump-json simulate, quiet but print JSON information + -j, --dump-json simulate, quiet but print JSON information. + See --output for a description of available + keys. --newline output progress bar as new lines --no-progress do not print progress bar --console-title display progress in console titlebar @@ -367,7 +371,67 @@ If you want to create a build of youtube-dl yourself, you'll need ### Adding support for a new site -If you want to add support for a new site, copy *any* [recently modified](https://github.com/rg3/youtube-dl/commits/master/youtube_dl/extractor) file in `youtube_dl/extractor`, add an import in [`youtube_dl/extractor/__init__.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/__init__.py). Have a look at [`youtube_dl/common/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should return](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L38). Don't forget to run the tests with `python test/test_download.py TestDownload.test_YourExtractor`! For a detailed tutorial, refer to [this blog post](http://filippo.io/add-support-for-a-new-video-site-to-youtube-dl/). +If you want to add support for a new site, you can follow this quick list (assuming your service is called `yourextractor`): + +1. [Fork this repository](https://github.com/rg3/youtube-dl/fork) +2. Check out the source code with `git clone git@github.com:YOUR_GITHUB_USERNAME/youtube-dl.git` +3. Start a new git branch with `cd youtube-dl; git checkout -b yourextractor` +4. Start with this simple template and save it to `youtube_dl/extractor/yourextractor.py`: + + # coding: utf-8 + from __future__ import unicode_literals + + import re + + from .common import InfoExtractor + + + class YourExtractorIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?yourextractor\.com/watch/(?P[0-9]+)' + _TEST = { + 'url': 'http://yourextractor.com/watch/42', + 'md5': 'TODO: md5 sum of the first 10KiB of the video file', + 'info_dict': { + 'id': '42', + 'ext': 'mp4', + 'title': 'Video title goes here', + # TODO more properties, either as: + # * A value + # * MD5 checksum; start the string with md5: + # * A regular expression; start the string with re: + # * Any Python type (for example int or float) + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + # TODO more code goes here, for example ... + webpage = self._download_webpage(url, video_id) + title = self._html_search_regex(r'

(.*?)

', webpage, 'title') + + return { + 'id': video_id, + 'title': title, + # TODO more properties (see youtube_dl/extractor/common.py) + } + + +5. Add an import in [`youtube_dl/extractor/__init__.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/__init__.py). +6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. +7. Have a look at [`youtube_dl/common/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should return](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L38). Add tests and code for as many as you want. +8. If you can, check the code with [pyflakes](https://pypi.python.org/pypi/pyflakes) (a good idea) and [pep8](https://pypi.python.org/pypi/pep8) (optional, ignore E501). +9. When the tests pass, [add](https://www.kernel.org/pub/software/scm/git/docs/git-add.html) the new files and [commit](https://www.kernel.org/pub/software/scm/git/docs/git-commit.html) them and [push](https://www.kernel.org/pub/software/scm/git/docs/git-push.html) the result, like this: + + $ git add youtube_dl/extractor/__init__.py + $ git add youtube_dl/extractor/yourextractor.py + $ git commit -m '[yourextractor] Add new extractor' + $ git push origin yourextractor + +10. Finally, [create a pull request](https://help.github.com/articles/creating-a-pull-request). We'll then review and merge it. + +In any case, thank you very much for your contributions! # BUGS diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index 1f3ccaea0..2902dbec7 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -26,16 +26,27 @@ class YDL(FakeYDL): self.msgs.append(msg) +def _make_result(formats, **kwargs): + res = { + 'formats': formats, + 'id': 'testid', + 'title': 'testttitle', + 'extractor': 'testex', + } + res.update(**kwargs) + return res + + class TestFormatSelection(unittest.TestCase): def test_prefer_free_formats(self): # Same resolution => download webm ydl = YDL() ydl.params['prefer_free_formats'] = True formats = [ - {'ext': 'webm', 'height': 460}, - {'ext': 'mp4', 'height': 460}, + {'ext': 'webm', 'height': 460, 'url': 'x'}, + {'ext': 'mp4', 'height': 460, 'url': 'y'}, ] - info_dict = {'formats': formats, 'extractor': 'test'} + info_dict = _make_result(formats) yie = YoutubeIE(ydl) yie._sort_formats(info_dict['formats']) ydl.process_ie_result(info_dict) @@ -46,8 +57,8 @@ class TestFormatSelection(unittest.TestCase): ydl = YDL() ydl.params['prefer_free_formats'] = True formats = [ - {'ext': 'webm', 'height': 720}, - {'ext': 'mp4', 'height': 1080}, + {'ext': 'webm', 'height': 720, 'url': 'a'}, + {'ext': 'mp4', 'height': 1080, 'url': 'b'}, ] info_dict['formats'] = formats yie = YoutubeIE(ydl) @@ -60,9 +71,9 @@ class TestFormatSelection(unittest.TestCase): ydl = YDL() ydl.params['prefer_free_formats'] = False formats = [ - {'ext': 'webm', 'height': 720}, - {'ext': 'mp4', 'height': 720}, - {'ext': 'flv', 'height': 720}, + {'ext': 'webm', 'height': 720, 'url': '_'}, + {'ext': 'mp4', 'height': 720, 'url': '_'}, + {'ext': 'flv', 'height': 720, 'url': '_'}, ] info_dict['formats'] = formats yie = YoutubeIE(ydl) @@ -74,8 +85,8 @@ class TestFormatSelection(unittest.TestCase): ydl = YDL() ydl.params['prefer_free_formats'] = False formats = [ - {'ext': 'flv', 'height': 720}, - {'ext': 'webm', 'height': 720}, + {'ext': 'flv', 'height': 720, 'url': '_'}, + {'ext': 'webm', 'height': 720, 'url': '_'}, ] info_dict['formats'] = formats yie = YoutubeIE(ydl) @@ -91,8 +102,7 @@ class TestFormatSelection(unittest.TestCase): {'format_id': 'great', 'url': 'http://example.com/great', 'preference': 3}, {'format_id': 'excellent', 'url': 'http://example.com/exc', 'preference': 4}, ] - info_dict = { - 'formats': formats, 'extractor': 'test', 'id': 'testvid'} + info_dict = _make_result(formats) ydl = YDL() ydl.process_ie_result(info_dict) @@ -120,12 +130,12 @@ class TestFormatSelection(unittest.TestCase): def test_format_selection(self): formats = [ - {'format_id': '35', 'ext': 'mp4', 'preference': 1}, - {'format_id': '45', 'ext': 'webm', 'preference': 2}, - {'format_id': '47', 'ext': 'webm', 'preference': 3}, - {'format_id': '2', 'ext': 'flv', 'preference': 4}, + {'format_id': '35', 'ext': 'mp4', 'preference': 1, 'url': '_'}, + {'format_id': '45', 'ext': 'webm', 'preference': 2, 'url': '_'}, + {'format_id': '47', 'ext': 'webm', 'preference': 3, 'url': '_'}, + {'format_id': '2', 'ext': 'flv', 'preference': 4, 'url': '_'}, ] - info_dict = {'formats': formats, 'extractor': 'test'} + info_dict = _make_result(formats) ydl = YDL({'format': '20/47'}) ydl.process_ie_result(info_dict.copy()) @@ -154,12 +164,12 @@ class TestFormatSelection(unittest.TestCase): def test_format_selection_audio(self): formats = [ - {'format_id': 'audio-low', 'ext': 'webm', 'preference': 1, 'vcodec': 'none'}, - {'format_id': 'audio-mid', 'ext': 'webm', 'preference': 2, 'vcodec': 'none'}, - {'format_id': 'audio-high', 'ext': 'flv', 'preference': 3, 'vcodec': 'none'}, - {'format_id': 'vid', 'ext': 'mp4', 'preference': 4}, + {'format_id': 'audio-low', 'ext': 'webm', 'preference': 1, 'vcodec': 'none', 'url': '_'}, + {'format_id': 'audio-mid', 'ext': 'webm', 'preference': 2, 'vcodec': 'none', 'url': '_'}, + {'format_id': 'audio-high', 'ext': 'flv', 'preference': 3, 'vcodec': 'none', 'url': '_'}, + {'format_id': 'vid', 'ext': 'mp4', 'preference': 4, 'url': '_'}, ] - info_dict = {'formats': formats, 'extractor': 'test'} + info_dict = _make_result(formats) ydl = YDL({'format': 'bestaudio'}) ydl.process_ie_result(info_dict.copy()) @@ -172,10 +182,10 @@ class TestFormatSelection(unittest.TestCase): self.assertEqual(downloaded['format_id'], 'audio-low') formats = [ - {'format_id': 'vid-low', 'ext': 'mp4', 'preference': 1}, - {'format_id': 'vid-high', 'ext': 'mp4', 'preference': 2}, + {'format_id': 'vid-low', 'ext': 'mp4', 'preference': 1, 'url': '_'}, + {'format_id': 'vid-high', 'ext': 'mp4', 'preference': 2, 'url': '_'}, ] - info_dict = {'formats': formats, 'extractor': 'test'} + info_dict = _make_result(formats) ydl = YDL({'format': 'bestaudio/worstaudio/best'}) ydl.process_ie_result(info_dict.copy()) @@ -184,11 +194,11 @@ class TestFormatSelection(unittest.TestCase): def test_format_selection_video(self): formats = [ - {'format_id': 'dash-video-low', 'ext': 'mp4', 'preference': 1, 'acodec': 'none'}, - {'format_id': 'dash-video-high', 'ext': 'mp4', 'preference': 2, 'acodec': 'none'}, - {'format_id': 'vid', 'ext': 'mp4', 'preference': 3}, + {'format_id': 'dash-video-low', 'ext': 'mp4', 'preference': 1, 'acodec': 'none', 'url': '_'}, + {'format_id': 'dash-video-high', 'ext': 'mp4', 'preference': 2, 'acodec': 'none', 'url': '_'}, + {'format_id': 'vid', 'ext': 'mp4', 'preference': 3, 'url': '_'}, ] - info_dict = {'formats': formats, 'extractor': 'test'} + info_dict = _make_result(formats) ydl = YDL({'format': 'bestvideo'}) ydl.process_ie_result(info_dict.copy()) @@ -217,10 +227,12 @@ class TestFormatSelection(unittest.TestCase): for f1id, f2id in zip(order, order[1:]): f1 = YoutubeIE._formats[f1id].copy() f1['format_id'] = f1id + f1['url'] = 'url:' + f1id f2 = YoutubeIE._formats[f2id].copy() f2['format_id'] = f2id + f2['url'] = 'url:' + f2id - info_dict = {'formats': [f1, f2], 'extractor': 'youtube'} + info_dict = _make_result([f1, f2], extractor='youtube') ydl = YDL() yie = YoutubeIE(ydl) yie._sort_formats(info_dict['formats']) @@ -228,7 +240,7 @@ class TestFormatSelection(unittest.TestCase): downloaded = ydl.downloaded_info_dicts[0] self.assertEqual(downloaded['format_id'], f1id) - info_dict = {'formats': [f2, f1], 'extractor': 'youtube'} + info_dict = _make_result([f2, f1], extractor='youtube') ydl = YDL() yie = YoutubeIE(ydl) yie._sort_formats(info_dict['formats']) diff --git a/test/test_all_urls.py b/test/test_all_urls.py index 39ac8b8a1..fb39c5082 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -143,5 +143,38 @@ class TestAllURLsMatching(unittest.TestCase): self.assertMatch('http://video.pbs.org/viralplayer/2365173446/', ['PBS']) self.assertMatch('http://video.pbs.org/widget/partnerplayer/980042464/', ['PBS']) + def test_ComedyCentralShows(self): + self.assertMatch( + 'http://thedailyshow.cc.com/extended-interviews/xm3fnq/andrew-napolitano-extended-interview', + ['ComedyCentralShows']) + self.assertMatch( + 'http://thecolbertreport.cc.com/videos/29w6fx/-realhumanpraise-for-fox-news', + ['ComedyCentralShows']) + self.assertMatch( + 'http://thecolbertreport.cc.com/videos/gh6urb/neil-degrasse-tyson-pt--1?xrs=eml_col_031114', + ['ComedyCentralShows']) + self.assertMatch( + 'http://thedailyshow.cc.com/guests/michael-lewis/3efna8/exclusive---michael-lewis-extended-interview-pt--3', + ['ComedyCentralShows']) + self.assertMatch( + 'http://thedailyshow.cc.com/episodes/sy7yv0/april-8--2014---denis-leary', + ['ComedyCentralShows']) + self.assertMatch( + 'http://thecolbertreport.cc.com/episodes/8ase07/april-8--2014---jane-goodall', + ['ComedyCentralShows']) + self.assertMatch( + 'http://thedailyshow.cc.com/video-playlists/npde3s/the-daily-show-19088-highlights', + ['ComedyCentralShows']) + self.assertMatch( + 'http://thedailyshow.cc.com/special-editions/2l8fdb/special-edition---a-look-back-at-food', + ['ComedyCentralShows']) + + def test_yahoo_https(self): + # https://github.com/rg3/youtube-dl/issues/2701 + self.assertMatch( + 'https://screen.yahoo.com/smartwatches-latest-wearable-gadgets-163745379-cbs.html', + ['Yahoo']) + + if __name__ == '__main__': unittest.main() diff --git a/test/test_playlists.py b/test/test_playlists.py index 4af38632e..75c6a6bbb 100644 --- a/test/test_playlists.py +++ b/test/test_playlists.py @@ -42,6 +42,7 @@ from youtube_dl.extractor import ( ToypicsUserIE, XTubeUserIE, InstagramUserIE, + CSpanIE, ) @@ -314,6 +315,18 @@ class TestPlaylists(unittest.TestCase): } expect_info_dict(self, EXPECTED, test_video) + def test_CSpan_playlist(self): + dl = FakeYDL() + ie = CSpanIE(dl) + result = ie.extract( + 'http://www.c-span.org/video/?318608-1/gm-ignition-switch-recall') + self.assertIsPlaylist(result) + self.assertEqual(result['id'], '342759') + self.assertEqual( + result['title'], 'General Motors Ignition Switch Recall') + whole_duration = sum(e['duration'] for e in result['entries']) + self.assertEqual(whole_duration, 14855) + if __name__ == '__main__': unittest.main() diff --git a/test/test_utils.py b/test/test_utils.py index 2348c0415..51eb0b6b9 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -38,6 +38,7 @@ from youtube_dl.utils import ( xpath_with_ns, parse_iso8601, strip_jsonp, + uppercase_escape, ) if sys.version_info < (3, 0): @@ -279,6 +280,9 @@ class TestUtil(unittest.TestCase): d = json.loads(stripped) self.assertEqual(d, [{"id": "532cb", "x": 3}]) + def test_uppercase_escpae(self): + self.assertEqual(uppercase_escape(u'aä'), u'aä') + self.assertEqual(uppercase_escape(u'\\U0001d550'), u'𝕐') if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py old mode 100644 new mode 100755 index ae0ec49f8..d4dd05d8c --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -8,6 +8,7 @@ import datetime import errno import io import json +import locale import os import platform import re @@ -159,6 +160,7 @@ class YoutubeDL(object): include_ads: Download ads as well default_search: Prepend this string if an input url is not valid. 'auto' for elaborate guessing + encoding: Use this encoding instead of the system-specified. The following parameters are not used by YoutubeDL itself, they are used by the FileDownloader: @@ -284,6 +286,9 @@ class YoutubeDL(object): """Print message to stdout if not in quiet mode.""" return self.to_stdout(message, skip_eol, check_quiet=True) + def _write_string(self, s, out=None): + write_string(s, out=out, encoding=self.params.get('encoding')) + def to_stdout(self, message, skip_eol=False, check_quiet=False): """Print message to stdout if not in quiet mode.""" if self.params.get('logger'): @@ -293,7 +298,7 @@ class YoutubeDL(object): terminator = ['\n', ''][skip_eol] output = message + terminator - write_string(output, self._screen_file) + self._write_string(output, self._screen_file) def to_stderr(self, message): """Print message to stderr.""" @@ -303,7 +308,7 @@ class YoutubeDL(object): else: message = self._bidi_workaround(message) output = message + '\n' - write_string(output, self._err_file) + self._write_string(output, self._err_file) def to_console_title(self, message): if not self.params.get('consoletitle', False): @@ -313,21 +318,21 @@ class YoutubeDL(object): # already of type unicode() ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message)) elif 'TERM' in os.environ: - write_string('\033]0;%s\007' % message, self._screen_file) + self._write_string('\033]0;%s\007' % message, self._screen_file) def save_console_title(self): if not self.params.get('consoletitle', False): return if 'TERM' in os.environ: # Save the title on stack - write_string('\033[22;0t', self._screen_file) + self._write_string('\033[22;0t', self._screen_file) def restore_console_title(self): if not self.params.get('consoletitle', False): return if 'TERM' in os.environ: # Restore the title from stack - write_string('\033[23;0t', self._screen_file) + self._write_string('\033[23;0t', self._screen_file) def __enter__(self): self.save_console_title() @@ -700,6 +705,11 @@ class YoutubeDL(object): def process_video_result(self, info_dict, download=True): assert info_dict.get('_type', 'video') == 'video' + if 'id' not in info_dict: + raise ExtractorError('Missing "id" field in extractor result') + if 'title' not in info_dict: + raise ExtractorError('Missing "title" field in extractor result') + if 'playlist' not in info_dict: # It isn't part of a playlist info_dict['playlist'] = None @@ -731,6 +741,9 @@ class YoutubeDL(object): # We check that all the formats have the format and format_id fields for i, format in enumerate(formats): + if 'url' not in format: + raise ExtractorError('Missing "url" key in result (index %d)' % i) + if format.get('format_id') is None: format['format_id'] = compat_str(i) if format.get('format') is None: @@ -741,7 +754,7 @@ class YoutubeDL(object): ) # Automatically determine file extension if missing if 'ext' not in format: - format['ext'] = determine_ext(format['url']) + format['ext'] = determine_ext(format['url']).lower() format_limit = self.params.get('format_limit', None) if format_limit: @@ -866,7 +879,7 @@ class YoutubeDL(object): try: dn = os.path.dirname(encodeFilename(filename)) - if dn != '' and not os.path.exists(dn): + if dn and not os.path.exists(dn): os.makedirs(dn) except (OSError, IOError) as err: self.report_error('unable to create directory ' + compat_str(err)) @@ -923,7 +936,7 @@ class YoutubeDL(object): with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8') as subfile: subfile.write(sub) except (OSError, IOError): - self.report_error('Cannot write subtitles file ' + descfn) + self.report_error('Cannot write subtitles file ' + sub_filename) return if self.params.get('writeinfojson', False): @@ -1200,7 +1213,17 @@ class YoutubeDL(object): def print_debug_header(self): if not self.params.get('verbose'): return - write_string('[debug] youtube-dl version ' + __version__ + '\n') + + write_string( + '[debug] Encodings: locale %s, fs %s, out %s, pref %s\n' % ( + locale.getpreferredencoding(), + sys.getfilesystemencoding(), + sys.stdout.encoding, + self.get_encoding()), + encoding=None + ) + + self._write_string('[debug] youtube-dl version ' + __version__ + '\n') try: sp = subprocess.Popen( ['git', 'rev-parse', '--short', 'HEAD'], @@ -1209,20 +1232,20 @@ class YoutubeDL(object): out, err = sp.communicate() out = out.decode().strip() if re.match('[0-9a-f]+', out): - write_string('[debug] Git HEAD: ' + out + '\n') + self._write_string('[debug] Git HEAD: ' + out + '\n') except: try: sys.exc_clear() except: pass - write_string('[debug] Python version %s - %s' % + self._write_string('[debug] Python version %s - %s' % (platform.python_version(), platform_name()) + '\n') proxy_map = {} for handler in self._opener.handlers: if hasattr(handler, 'proxies'): proxy_map.update(handler.proxies) - write_string('[debug] Proxy map: ' + compat_str(proxy_map) + '\n') + self._write_string('[debug] Proxy map: ' + compat_str(proxy_map) + '\n') def _setup_opener(self): timeout_val = self.params.get('socket_timeout') @@ -1264,3 +1287,19 @@ class YoutubeDL(object): # (See https://github.com/rg3/youtube-dl/issues/1309 for details) opener.addheaders = [] self._opener = opener + + def encode(self, s): + if isinstance(s, bytes): + return s # Already encoded + + try: + return s.encode(self.get_encoding()) + except UnicodeEncodeError as err: + err.reason = err.reason + '. Check your system encoding configuration or use the --encoding option.' + raise + + def get_encoding(self): + encoding = self.params.get('encoding') + if encoding is None: + encoding = preferredencoding() + return encoding diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 4f4ec3871..42ef13786 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -51,6 +51,8 @@ __authors__ = ( 'David Wagner', 'Juan C. Olivares', 'Mattias Harrysson', + 'phaer', + 'Sainyam Kapoor', ) __license__ = 'Public Domain' @@ -243,7 +245,7 @@ def parseOpts(overrideArguments=None): help='Use the specified HTTP/HTTPS proxy. Pass in an empty string (--proxy "") for direct connection') general.add_option('--no-check-certificate', action='store_true', dest='no_check_certificate', default=False, help='Suppress HTTPS certificate validation.') general.add_option( - '--prefer-insecure', action='store_true', dest='prefer_insecure', + '--prefer-insecure', '--prefer-unsecure', action='store_true', dest='prefer_insecure', help='Use an unencrypted connection to retrieve information about the video. (Currently supported only for YouTube)') general.add_option( '--cache-dir', dest='cachedir', default=get_cachedir(), metavar='DIR', @@ -257,13 +259,17 @@ def parseOpts(overrideArguments=None): general.add_option( '--bidi-workaround', dest='bidi_workaround', action='store_true', help=u'Work around terminals that lack bidirectional text support. Requires bidiv or fribidi executable in PATH') - general.add_option('--default-search', - dest='default_search', metavar='PREFIX', - help='Use this prefix for unqualified URLs. For example "gvsearch2:" downloads two videos from google videos for youtube-dl "large apple". By default (with value "auto") youtube-dl guesses.') + general.add_option( + '--default-search', + dest='default_search', metavar='PREFIX', + help='Use this prefix for unqualified URLs. For example "gvsearch2:" downloads two videos from google videos for youtube-dl "large apple". By default (with value "auto") youtube-dl guesses.') general.add_option( '--ignore-config', action='store_true', help='Do not read configuration files. When given in the global configuration file /etc/youtube-dl.conf: do not read the user configuration in ~/.config/youtube-dl.conf (%APPDATA%/youtube-dl/config.txt on Windows)') + general.add_option( + '--encoding', dest='encoding', metavar='ENCODING', + help='Force the specified encoding (experimental)') selection.add_option( '--playlist-start', @@ -397,7 +403,7 @@ def parseOpts(overrideArguments=None): help='simulate, quiet but print output format', default=False) verbosity.add_option('-j', '--dump-json', action='store_true', dest='dumpjson', - help='simulate, quiet but print JSON information', default=False) + help='simulate, quiet but print JSON information. See --output for a description of available keys.', default=False) verbosity.add_option('--newline', action='store_true', dest='progress_with_newline', help='output progress bar as new lines', default=False) verbosity.add_option('--no-progress', @@ -543,8 +549,6 @@ def parseOpts(overrideArguments=None): write_string(u'[debug] System config: ' + repr(_hide_login_info(systemConf)) + '\n') write_string(u'[debug] User config: ' + repr(_hide_login_info(userConf)) + '\n') write_string(u'[debug] Command-line args: ' + repr(_hide_login_info(commandLineConf)) + '\n') - write_string(u'[debug] Encodings: locale %r, fs %r, out %r, pref: %r\n' % - (locale.getpreferredencoding(), sys.getfilesystemencoding(), sys.stdout.encoding, preferredencoding())) return parser, opts, args @@ -678,7 +682,7 @@ def _real_main(argv=None): date = DateRange.day(opts.date) else: date = DateRange(opts.dateafter, opts.datebefore) - if opts.default_search not in ('auto', None) and ':' not in opts.default_search: + if opts.default_search not in ('auto', 'auto_warning', None) and ':' not in opts.default_search: parser.error(u'--default-search invalid; did you forget a colon (:) at the end?') # Do not download videos when there are audio-only formats @@ -789,6 +793,7 @@ def _real_main(argv=None): 'include_ads': opts.include_ads, 'default_search': opts.default_search, 'youtube_include_dash_manifest': opts.youtube_include_dash_manifest, + 'encoding': opts.encoding, } with YoutubeDL(ydl_opts) as ydl: diff --git a/youtube_dl/downloader/common.py b/youtube_dl/downloader/common.py index 5a068aa8b..917f3450e 100644 --- a/youtube_dl/downloader/common.py +++ b/youtube_dl/downloader/common.py @@ -4,9 +4,10 @@ import sys import time from ..utils import ( + compat_str, encodeFilename, - timeconvert, format_bytes, + timeconvert, ) @@ -173,7 +174,7 @@ class FileDownloader(object): return os.rename(encodeFilename(old_filename), encodeFilename(new_filename)) except (IOError, OSError) as err: - self.report_error(u'unable to rename file: %s' % str(err)) + self.report_error(u'unable to rename file: %s' % compat_str(err)) def try_utime(self, filename, last_modified_hdr): """Try to set the last-modified time of the given file.""" diff --git a/youtube_dl/downloader/f4m.py b/youtube_dl/downloader/f4m.py index 4e6abfe10..e6be6ae6c 100644 --- a/youtube_dl/downloader/f4m.py +++ b/youtube_dl/downloader/f4m.py @@ -297,6 +297,7 @@ class F4mFD(FileDownloader): break frags_filenames.append(frag_filename) + dest_stream.close() self.report_finish(format_bytes(state['downloaded_bytes']), time.time() - start) self.try_rename(tmpfilename, filename) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 685fc749d..3a91e1a46 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -32,6 +32,7 @@ from .canal13cl import Canal13clIE from .canalplus import CanalplusIE from .canalc2 import Canalc2IE from .cbs import CBSIE +from .cbsnews import CBSNewsIE from .ceskatelevize import CeskaTelevizeIE from .channel9 import Channel9IE from .chilloutzone import ChilloutzoneIE @@ -40,6 +41,7 @@ from .clipfish import ClipfishIE from .cliphunter import CliphunterIE from .clipsyndicate import ClipsyndicateIE from .cmt import CMTIE +from .cnet import CNETIE from .cnn import ( CNNIE, CNNBlogsIE, @@ -61,6 +63,7 @@ from .dotsub import DotsubIE from .dreisat import DreiSatIE from .defense import DefenseGouvFrIE from .discovery import DiscoveryIE +from .divxstage import DivxStageIE from .dropbox import DropboxIE from .ebaumsworld import EbaumsWorldIE from .ehow import EHowIE @@ -83,6 +86,7 @@ from .fktv import ( ) from .flickr import FlickrIE from .fourtube import FourTubeIE +from .franceculture import FranceCultureIE from .franceinter import FranceInterIE from .francetv import ( PluzzIE, @@ -152,10 +156,14 @@ from .mixcloud import MixcloudIE from .mpora import MporaIE from .mofosex import MofosexIE from .mooshare import MooshareIE +from .morningstar import MorningstarIE +from .motorsport import MotorsportIE +from .movshare import MovShareIE from .mtv import ( MTVIE, MTVIggyIE, ) +from .musicplayon import MusicPlayOnIE from .muzu import MuzuTVIE from .myspace import MySpaceIE from .myspass import MySpassIE @@ -177,6 +185,8 @@ from .normalboots import NormalbootsIE from .novamov import NovaMovIE from .nowness import NownessIE from .nowvideo import NowVideoIE +from .ntv import NTVIE +from .oe1 import OE1IE from .ooyala import OoyalaIE from .orf import ORFIE from .parliamentliveuk import ParliamentLiveUKIE @@ -198,6 +208,7 @@ from .rottentomatoes import RottenTomatoesIE from .roxwel import RoxwelIE from .rtlnow import RTLnowIE from .rts import RTSIE +from .rtve import RTVEALaCartaIE from .rutube import ( RutubeIE, RutubeChannelIE, @@ -256,6 +267,7 @@ from .udemy import ( UdemyCourseIE ) from .unistra import UnistraIE +from .urort import UrortIE from .ustream import UstreamIE, UstreamChannelIE from .vbox7 import Vbox7IE from .veehd import VeeHDIE @@ -268,6 +280,7 @@ from .videodetective import VideoDetectiveIE from .videolecturesnet import VideoLecturesNetIE from .videofyme import VideofyMeIE from .videopremium import VideoPremiumIE +from .videoweed import VideoWeedIE from .vimeo import ( VimeoIE, VimeoChannelIE, @@ -282,7 +295,10 @@ from .vk import VKIE from .vube import VubeIE from .washingtonpost import WashingtonPostIE from .wat import WatIE -from .wdr import WDRIE +from .wdr import ( + WDRIE, + WDRMausIE, +) from .weibo import WeiboIE from .wimp import WimpIE from .wistia import WistiaIE diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py index 922cede05..dc8657b67 100644 --- a/youtube_dl/extractor/appletrailers.py +++ b/youtube_dl/extractor/appletrailers.py @@ -6,7 +6,6 @@ import json from .common import InfoExtractor from ..utils import ( compat_urlparse, - determine_ext, ) @@ -16,9 +15,10 @@ class AppleTrailersIE(InfoExtractor): "url": "http://trailers.apple.com/trailers/wb/manofsteel/", "playlist": [ { - "file": "manofsteel-trailer4.mov", "md5": "d97a8e575432dbcb81b7c3acb741f8a8", "info_dict": { + "id": "manofsteel-trailer4", + "ext": "mov", "duration": 111, "title": "Trailer 4", "upload_date": "20130523", @@ -26,9 +26,10 @@ class AppleTrailersIE(InfoExtractor): }, }, { - "file": "manofsteel-trailer3.mov", "md5": "b8017b7131b721fb4e8d6f49e1df908c", "info_dict": { + "id": "manofsteel-trailer3", + "ext": "mov", "duration": 182, "title": "Trailer 3", "upload_date": "20130417", @@ -36,9 +37,10 @@ class AppleTrailersIE(InfoExtractor): }, }, { - "file": "manofsteel-trailer.mov", "md5": "d0f1e1150989b9924679b441f3404d48", "info_dict": { + "id": "manofsteel-trailer", + "ext": "mov", "duration": 148, "title": "Trailer", "upload_date": "20121212", @@ -46,15 +48,16 @@ class AppleTrailersIE(InfoExtractor): }, }, { - "file": "manofsteel-teaser.mov", "md5": "5fe08795b943eb2e757fa95cb6def1cb", "info_dict": { + "id": "manofsteel-teaser", + "ext": "mov", "duration": 93, "title": "Teaser", "upload_date": "20120721", "uploader_id": "wb", }, - } + }, ] } @@ -65,16 +68,16 @@ class AppleTrailersIE(InfoExtractor): movie = mobj.group('movie') uploader_id = mobj.group('company') - playlist_url = compat_urlparse.urljoin(url, u'includes/playlists/itunes.inc') + playlist_url = compat_urlparse.urljoin(url, 'includes/playlists/itunes.inc') def fix_html(s): - s = re.sub(r'(?s).*?', u'', s) + s = re.sub(r'(?s).*?', '', s) s = re.sub(r'', r'', s) # The ' in the onClick attributes are not escaped, it couldn't be parsed # like: http://trailers.apple.com/trailers/wb/gravity/ def _clean_json(m): - return u'iTunes.playURL(%s);' % m.group(1).replace('\'', ''') + return 'iTunes.playURL(%s);' % m.group(1).replace('\'', ''') s = re.sub(self._JSON_RE, _clean_json, s) - s = u'' + s + u'' + s = '' + s + u'' return s doc = self._download_xml(playlist_url, movie, transform_source=fix_html) @@ -82,7 +85,7 @@ class AppleTrailersIE(InfoExtractor): for li in doc.findall('./div/ul/li'): on_click = li.find('.//a').attrib['onClick'] trailer_info_json = self._search_regex(self._JSON_RE, - on_click, u'trailer info') + on_click, 'trailer info') trailer_info = json.loads(trailer_info_json) title = trailer_info['title'] video_id = movie + '-' + re.sub(r'[^a-zA-Z0-9]', '', title).lower() @@ -98,8 +101,7 @@ class AppleTrailersIE(InfoExtractor): first_url = trailer_info['url'] trailer_id = first_url.split('/')[-1].rpartition('_')[0].lower() settings_json_url = compat_urlparse.urljoin(url, 'includes/settings/%s.json' % trailer_id) - settings_json = self._download_webpage(settings_json_url, trailer_id, u'Downloading settings json') - settings = json.loads(settings_json) + settings = self._download_json(settings_json_url, trailer_id, 'Downloading settings json') formats = [] for format in settings['metadata']['sizes']: @@ -107,7 +109,6 @@ class AppleTrailersIE(InfoExtractor): format_url = re.sub(r'_(\d*p.mov)', r'_h\1', format['src']) formats.append({ 'url': format_url, - 'ext': determine_ext(format_url), 'format': format['type'], 'width': format['width'], 'height': int(format['height']), diff --git a/youtube_dl/extractor/bloomberg.py b/youtube_dl/extractor/bloomberg.py index 2415ce403..25fb79e14 100644 --- a/youtube_dl/extractor/bloomberg.py +++ b/youtube_dl/extractor/bloomberg.py @@ -1,22 +1,21 @@ +from __future__ import unicode_literals + import re from .common import InfoExtractor -from .ooyala import OoyalaIE class BloombergIE(InfoExtractor): _VALID_URL = r'https?://www\.bloomberg\.com/video/(?P.+?)\.html' _TEST = { - u'url': u'http://www.bloomberg.com/video/shah-s-presentation-on-foreign-exchange-strategies-qurhIVlJSB6hzkVi229d8g.html', - u'file': u'12bzhqZTqQHmmlA8I-i0NpzJgcG5NNYX.mp4', - u'info_dict': { - u'title': u'Shah\'s Presentation on Foreign-Exchange Strategies', - u'description': u'md5:abc86e5236f9f0e4866c59ad36736686', - }, - u'params': { - # Requires ffmpeg (m3u8 manifest) - u'skip_download': True, + 'url': 'http://www.bloomberg.com/video/shah-s-presentation-on-foreign-exchange-strategies-qurhIVlJSB6hzkVi229d8g.html', + 'md5': '7bf08858ff7c203c870e8a6190e221e5', + 'info_dict': { + 'id': 'qurhIVlJSB6hzkVi229d8g', + 'ext': 'flv', + 'title': 'Shah\'s Presentation on Foreign-Exchange Strategies', + 'description': 'md5:0681e0d30dcdfc6abf34594961d8ea88', }, } @@ -24,7 +23,16 @@ class BloombergIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) name = mobj.group('name') webpage = self._download_webpage(url, name) - embed_code = self._search_regex( - r'[a-z0-9\-]+)\.html$" - _BASE_URL = "http://www.br.de" + IE_DESC = 'Bayerischer Rundfunk Mediathek' + _VALID_URL = r'https?://(?:www\.)?br\.de/(?:[a-z0-9\-]+/)+(?P[a-z0-9\-]+)\.html' + _BASE_URL = 'http://www.br.de' _TESTS = [ { - "url": "http://www.br.de/mediathek/video/anselm-gruen-114.html", - "md5": "c4f83cf0f023ba5875aba0bf46860df2", - "info_dict": { - "id": "2c8d81c5-6fb7-4a74-88d4-e768e5856532", - "ext": "mp4", - "title": "Feiern und Verzichten", - "description": "Anselm Grün: Feiern und Verzichten", - "uploader": "BR/Birgit Baier", - "upload_date": "20140301" + 'url': 'http://www.br.de/mediathek/video/anselm-gruen-114.html', + 'md5': 'c4f83cf0f023ba5875aba0bf46860df2', + 'info_dict': { + 'id': '2c8d81c5-6fb7-4a74-88d4-e768e5856532', + 'ext': 'mp4', + 'title': 'Feiern und Verzichten', + 'description': 'Anselm Grün: Feiern und Verzichten', + 'uploader': 'BR/Birgit Baier', + 'upload_date': '20140301', } }, { - "url": "http://www.br.de/mediathek/video/sendungen/unter-unserem-himmel/unter-unserem-himmel-alpen-ueber-den-pass-100.html", - "md5": "ab451b09d861dbed7d7cc9ab0be19ebe", - "info_dict": { - "id": "2c060e69-3a27-4e13-b0f0-668fac17d812", - "ext": "mp4", - "title": "Über den Pass", - "description": "Die Eroberung der Alpen: Über den Pass", - "uploader": None, - "upload_date": None + 'url': 'http://www.br.de/mediathek/video/sendungen/unter-unserem-himmel/unter-unserem-himmel-alpen-ueber-den-pass-100.html', + 'md5': 'ab451b09d861dbed7d7cc9ab0be19ebe', + 'info_dict': { + 'id': '2c060e69-3a27-4e13-b0f0-668fac17d812', + 'ext': 'mp4', + 'title': 'Über den Pass', + 'description': 'Die Eroberung der Alpen: Über den Pass', } - } + }, + { + 'url': 'http://www.br.de/nachrichten/schaeuble-haushaltsentwurf-bundestag-100.html', + 'md5': '3db0df1a9a9cd9fa0c70e6ea8aa8e820', + 'info_dict': { + 'id': 'c6aae3de-2cf9-43f2-957f-f17fef9afaab', + 'ext': 'aac', + 'title': '"Keine neuen Schulden im nächsten Jahr"', + 'description': 'Haushaltsentwurf: "Keine neuen Schulden im nächsten Jahr"', + } + }, + { + 'url': 'http://www.br.de/radio/bayern1/service/team/videos/team-video-erdelt100.html', + 'md5': 'dbab0aef2e047060ea7a21fc1ce1078a', + 'info_dict': { + 'id': '6ba73750-d405-45d3-861d-1ce8c524e059', + 'ext': 'mp4', + 'title': 'Umweltbewusster Häuslebauer', + 'description': 'Uwe Erdelt: Umweltbewusster Häuslebauer', + } + }, + { + 'url': 'http://www.br.de/fernsehen/br-alpha/sendungen/kant-fuer-anfaenger/kritik-der-reinen-vernunft/kant-kritik-01-metaphysik100.html', + 'md5': '23bca295f1650d698f94fc570977dae3', + 'info_dict': { + 'id': 'd982c9ce-8648-4753-b358-98abb8aec43d', + 'ext': 'mp4', + 'title': 'Folge 1 - Metaphysik', + 'description': 'Kant für Anfänger: Folge 1 - Metaphysik', + 'uploader': 'Eva Maria Steimle', + 'upload_date': '20140117', + } + }, ] def _real_extract(self, url): @@ -44,56 +77,63 @@ class BRIE(InfoExtractor): display_id = mobj.group('id') page = self._download_webpage(url, display_id) xml_url = self._search_regex( - r"return BRavFramework\.register\(BRavFramework\('avPlayer_(?:[a-f0-9-]{36})'\)\.setup\({dataURL:'(/mediathek/video/[a-z0-9/~_.-]+)'}\)\);", page, "XMLURL") + r"return BRavFramework\.register\(BRavFramework\('avPlayer_(?:[a-f0-9-]{36})'\)\.setup\({dataURL:'(/(?:[a-z0-9\-]+/)+[a-z0-9/~_.-]+)'}\)\);", page, 'XMLURL') xml = self._download_xml(self._BASE_URL + xml_url, None) - videos = [] - for xml_video in xml.findall("video"): - video = { - "id": xml_video.get("externalId"), - "title": xml_video.find("title").text, - "formats": self._extract_formats(xml_video.find("assets")), - "thumbnails": self._extract_thumbnails(xml_video.find("teaserImage/variants")), - "description": " ".join(xml_video.find("shareTitle").text.splitlines()), - "webpage_url": xml_video.find("permalink").text + medias = [] + + for xml_media in xml.findall('video') + xml.findall('audio'): + media = { + 'id': xml_media.get('externalId'), + 'title': xml_media.find('title').text, + 'formats': self._extract_formats(xml_media.find('assets')), + 'thumbnails': self._extract_thumbnails(xml_media.find('teaserImage/variants')), + 'description': ' '.join(xml_media.find('shareTitle').text.splitlines()), + 'webpage_url': xml_media.find('permalink').text } - if xml_video.find("author").text: - video["uploader"] = xml_video.find("author").text - if xml_video.find("broadcastDate").text: - video["upload_date"] = "".join(reversed(xml_video.find("broadcastDate").text.split("."))) - videos.append(video) + if xml_media.find('author').text: + media['uploader'] = xml_media.find('author').text + if xml_media.find('broadcastDate').text: + media['upload_date'] = ''.join(reversed(xml_media.find('broadcastDate').text.split('.'))) + medias.append(media) - if len(videos) > 1: + if len(medias) > 1: self._downloader.report_warning( - 'found multiple videos; please ' + 'found multiple medias; please ' 'report this with the video URL to http://yt-dl.org/bug') - if not videos: - raise ExtractorError('No video entries found') - return videos[0] + if not medias: + raise ExtractorError('No media entries found') + return medias[0] def _extract_formats(self, assets): + + def text_or_none(asset, tag): + elem = asset.find(tag) + return None if elem is None else elem.text + formats = [{ - "url": asset.find("downloadUrl").text, - "ext": asset.find("mediaType").text, - "format_id": asset.get("type"), - "width": int(asset.find("frameWidth").text), - "height": int(asset.find("frameHeight").text), - "tbr": int(asset.find("bitrateVideo").text), - "abr": int(asset.find("bitrateAudio").text), - "vcodec": asset.find("codecVideo").text, - "container": asset.find("mediaType").text, - "filesize": int(asset.find("size").text), - } for asset in assets.findall("asset") - if asset.find("downloadUrl") is not None] + 'url': text_or_none(asset, 'downloadUrl'), + 'ext': text_or_none(asset, 'mediaType'), + 'format_id': asset.get('type'), + 'width': int_or_none(text_or_none(asset, 'frameWidth')), + 'height': int_or_none(text_or_none(asset, 'frameHeight')), + 'tbr': int_or_none(text_or_none(asset, 'bitrateVideo')), + 'abr': int_or_none(text_or_none(asset, 'bitrateAudio')), + 'vcodec': text_or_none(asset, 'codecVideo'), + 'acodec': text_or_none(asset, 'codecAudio'), + 'container': text_or_none(asset, 'mediaType'), + 'filesize': int_or_none(text_or_none(asset, 'size')), + } for asset in assets.findall('asset') + if asset.find('downloadUrl') is not None] self._sort_formats(formats) return formats def _extract_thumbnails(self, variants): thumbnails = [{ - "url": self._BASE_URL + variant.find("url").text, - "width": int(variant.find("width").text), - "height": int(variant.find("height").text), - } for variant in variants.findall("variant")] - thumbnails.sort(key=lambda x: x["width"] * x["height"], reverse=True) + 'url': self._BASE_URL + variant.find('url').text, + 'width': int_or_none(variant.find('width').text), + 'height': int_or_none(variant.find('height').text), + } for variant in variants.findall('variant')] + thumbnails.sort(key=lambda x: x['width'] * x['height'], reverse=True) return thumbnails diff --git a/youtube_dl/extractor/breakcom.py b/youtube_dl/extractor/breakcom.py index 85635d1cc..1bfc9f35b 100644 --- a/youtube_dl/extractor/breakcom.py +++ b/youtube_dl/extractor/breakcom.py @@ -27,9 +27,10 @@ class BreakIE(InfoExtractor): webpage, 'info json', flags=re.DOTALL) info = json.loads(info_json) video_url = info['videoUri'] - m_youtube = re.search(r'(https?://www\.youtube\.com/watch\?v=.*)', video_url) - if m_youtube is not None: - return self.url_result(m_youtube.group(1), 'Youtube') + youtube_id = info.get('youtubeId') + if youtube_id: + return self.url_result(youtube_id, 'Youtube') + final_url = video_url + '?' + info['AuthToken'] return { 'id': video_id, diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 83eec84d3..3c02c297a 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -87,7 +87,7 @@ class BrightcoveIE(InfoExtractor): object_str = object_str.replace('<--', '