From: Sergey M․ Date: Thu, 26 Mar 2015 17:57:13 +0000 (+0600) Subject: Merge branch 'master' of https://github.com/zx8/youtube-dl into zx8-master X-Git-Url: http://git.cielonegro.org/gitweb.cgi?a=commitdiff_plain;h=ac0df2350a5ff4fcedd7502df127300361595d7e;hp=32d687f55e103963a2cb8d8f3f88bb31b9cb8fb6;p=youtube-dl.git Merge branch 'master' of https://github.com/zx8/youtube-dl into zx8-master --- diff --git a/AUTHORS b/AUTHORS index 512469f4c..59f1b5f21 100644 --- a/AUTHORS +++ b/AUTHORS @@ -117,3 +117,4 @@ Alexander Mamay Devin J. Pohly Eduardo Ferro Aldama Jeff Buchbinder +Amish Bhadeshia diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 72b365305..baf7b3880 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -231,6 +231,7 @@ - **Letv** - **LetvPlaylist** - **LetvTv** + - **Libsyn** - **lifenews**: LIFE | NEWS - **LiveLeak** - **livestream** @@ -310,6 +311,7 @@ - **npo.nl:radio** - **npo.nl:radio:fragment** - **NRK** + - **NRKPlaylist** - **NRKTV** - **ntv.ru** - **Nuvid** diff --git a/test/test_execution.py b/test/test_execution.py index 60df187de..f31e51558 100644 --- a/test/test_execution.py +++ b/test/test_execution.py @@ -1,4 +1,6 @@ #!/usr/bin/env python +# coding: utf-8 + from __future__ import unicode_literals import unittest @@ -27,5 +29,12 @@ class TestExecution(unittest.TestCase): def test_main_exec(self): subprocess.check_call([sys.executable, 'youtube_dl/__main__.py', '--version'], cwd=rootDir, stdout=_DEV_NULL) + def test_cmdline_umlauts(self): + p = subprocess.Popen( + [sys.executable, 'youtube_dl/__main__.py', 'ä', '--version'], + cwd=rootDir, stdout=_DEV_NULL, stderr=subprocess.PIPE) + _, stderr = p.communicate() + self.assertFalse(stderr) + if __name__ == '__main__': unittest.main() diff --git a/test/test_utils.py b/test/test_utils.py index a8ab87685..abaf1ab73 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -200,6 +200,8 @@ class TestUtil(unittest.TestCase): def test_unescape_html(self): 
self.assertEqual(unescapeHTML('%20;'), '%20;') + self.assertEqual(unescapeHTML('/'), '/') + self.assertEqual(unescapeHTML('/'), '/') self.assertEqual( unescapeHTML('é'), 'é') diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 3a0c42ded..d56eb6448 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -530,6 +530,10 @@ from .tvp import TvpIE, TvpSeriesIE from .tvplay import TVPlayIE from .tweakers import TweakersIE from .twentyfourvideo import TwentyFourVideoIE +from .twentytwotracks import ( + TwentyTwoTracksIE, + TwentyTwoTracksGenreIE +) from .twitch import ( TwitchVideoIE, TwitchChapterIE, @@ -551,6 +555,7 @@ from .ustream import UstreamIE, UstreamChannelIE from .vbox7 import Vbox7IE from .veehd import VeeHDIE from .veoh import VeohIE +from .vessel import VesselIE from .vesti import VestiIE from .vevo import VevoIE from .vgtv import VGTVIE diff --git a/youtube_dl/extractor/mlb.py b/youtube_dl/extractor/mlb.py index 1a241aca7..e369551c2 100644 --- a/youtube_dl/extractor/mlb.py +++ b/youtube_dl/extractor/mlb.py @@ -10,7 +10,7 @@ from ..utils import ( class MLBIE(InfoExtractor): - _VALID_URL = r'https?://m(?:lb)?\.mlb\.com/(?:(?:.*?/)?video/(?:topic/[\da-z_-]+/)?v|(?:shared/video/embed/embed\.html|[^/]+/video/play\.jsp)\?.*?\bcontent_id=)(?Pn?\d+)' + _VALID_URL = r'https?://m(?:lb)?\.(?:[\da-z_-]+\.)?mlb\.com/(?:(?:.*?/)?video/(?:topic/[\da-z_-]+/)?v|(?:shared/video/embed/embed\.html|[^/]+/video/play\.jsp)\?.*?\bcontent_id=)(?Pn?\d+)' _TESTS = [ { 'url': 'http://m.mlb.com/sea/video/topic/51231442/v34698933/nymsea-ackley-robs-a-home-run-with-an-amazing-catch/?c_id=sea', @@ -80,6 +80,10 @@ class MLBIE(InfoExtractor): 'url': 'http://mlb.mlb.com/es/video/play.jsp?content_id=36599553', 'only_matching': True, }, + { + 'url': 
'http://m.cardinals.mlb.com/stl/video/v51175783/atlstl-piscotty-makes-great-sliding-catch-on-line/?partnerId=as_mlb_20150321_42500876&adbid=579409712979910656&adbpl=tw&adbpr=52847728', + 'only_matching': True, + } ] def _real_extract(self, url): diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index 557dffa46..5d8448571 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -231,7 +231,10 @@ class NPOLiveIE(NPOBaseIE): stream_url = self._download_json( stream_info['stream'], display_id, 'Downloading %s URL' % stream_type, - transform_source=strip_jsonp) + 'Unable to download %s URL' % stream_type, + transform_source=strip_jsonp, fatal=False) + if not stream_url: + continue if stream_type == 'hds': f4m_formats = self._extract_f4m_formats(stream_url, display_id) # f4m downloader downloads only piece of live stream diff --git a/youtube_dl/extractor/redtube.py b/youtube_dl/extractor/redtube.py index 846b76c81..d6054d717 100644 --- a/youtube_dl/extractor/redtube.py +++ b/youtube_dl/extractor/redtube.py @@ -1,17 +1,19 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..utils import ExtractorError class RedTubeIE(InfoExtractor): _VALID_URL = r'http://(?:www\.)?redtube\.com/(?P[0-9]+)' _TEST = { 'url': 'http://www.redtube.com/66418', + 'md5': '7b8c22b5e7098a3e1c09709df1126d2d', 'info_dict': { 'id': '66418', 'ext': 'mp4', - "title": "Sucked on a toilet", - "age_limit": 18, + 'title': 'Sucked on a toilet', + 'age_limit': 18, } } @@ -19,6 +21,9 @@ class RedTubeIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) + if any(s in webpage for s in ['video-deleted-info', '>This video has been removed']): + raise ExtractorError('Video %s has been removed' % video_id, expected=True) + video_url = self._html_search_regex( r'', webpage, 'video URL') video_title = self._html_search_regex( diff --git a/youtube_dl/extractor/slideshare.py 
b/youtube_dl/extractor/slideshare.py index 9f79ff5c1..0b717a1e4 100644 --- a/youtube_dl/extractor/slideshare.py +++ b/youtube_dl/extractor/slideshare.py @@ -30,7 +30,7 @@ class SlideshareIE(InfoExtractor): page_title = mobj.group('title') webpage = self._download_webpage(url, page_title) slideshare_obj = self._search_regex( - r'var\s+slideshare_object\s*=\s*({.*?});\s*var\s+user_info\s*=', + r'\$\.extend\(slideshare_object,\s*(\{.*?\})\);', webpage, 'slideshare object') info = json.loads(slideshare_obj) if info['slideshow']['type'] != 'video': diff --git a/youtube_dl/extractor/teamcoco.py b/youtube_dl/extractor/teamcoco.py index 7cb06f351..a46a7ecba 100644 --- a/youtube_dl/extractor/teamcoco.py +++ b/youtube_dl/extractor/teamcoco.py @@ -54,7 +54,7 @@ class TeamcocoIE(InfoExtractor): embed_url, video_id, 'Downloading embed page') player_data = self._parse_json(self._search_regex( - r'Y\.Ginger\.Module\.Player\((\{.*?\})\);', embed, 'player data'), video_id) + r'Y\.Ginger\.Module\.Player(?:;var\s*player\s*=\s*new\s*m)?\((\{.*?\})\);', embed, 'player data'), video_id) data = self._parse_json( base64.b64decode(player_data['preload'].encode('ascii')).decode('utf-8'), video_id) diff --git a/youtube_dl/extractor/twentytwotracks.py b/youtube_dl/extractor/twentytwotracks.py new file mode 100644 index 000000000..d6c0ab184 --- /dev/null +++ b/youtube_dl/extractor/twentytwotracks.py @@ -0,0 +1,86 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import int_or_none + +# 22Tracks regularly replace the audio tracks that can be streamed on their +# site. The tracks usually expire after 1 months, so we can't add tests. 
+ + +class TwentyTwoTracksIE(InfoExtractor): + _VALID_URL = r'https?://22tracks\.com/(?P<city>[a-z]+)/(?P<genre>[\da-z]+)/(?P<id>\d+)' + IE_NAME = '22tracks:track' + + _API_BASE = 'http://22tracks.com/api' + + def _extract_info(self, city, genre_name, track_id=None): + item_id = track_id if track_id else genre_name + + cities = self._download_json( + '%s/cities' % self._API_BASE, item_id, + 'Downloading cities info', + 'Unable to download cities info') + city_id = [x['id'] for x in cities if x['slug'] == city][0] + + genres = self._download_json( + '%s/genres/%s' % (self._API_BASE, city_id), item_id, + 'Downloading %s genres info' % city, + 'Unable to download %s genres info' % city) + genre = [x for x in genres if x['slug'] == genre_name][0] + genre_id = genre['id'] + + tracks = self._download_json( + '%s/tracks/%s' % (self._API_BASE, genre_id), item_id, + 'Downloading %s genre tracks info' % genre_name, + 'Unable to download track info') + + return [x for x in tracks if x['id'] == item_id][0] if track_id else [genre['title'], tracks] + + def _get_track_url(self, filename, track_id): + token = self._download_json( + 'http://22tracks.com/token.php?desktop=true&u=/128/%s' % filename, + track_id, 'Downloading token', 'Unable to download token') + return 'http://audio.22tracks.com%s?st=%s&e=%d' % (token['filename'], token['st'], token['e']) + + def _extract_track_info(self, track_info, track_id): + download_url = self._get_track_url(track_info['filename'], track_id) + title = '%s - %s' % (track_info['artist'].strip(), track_info['title'].strip()) + return { + 'id': track_id, + 'url': download_url, + 'ext': 'mp3', + 'title': title, + 'duration': int_or_none(track_info.get('duration')), + 'timestamp': int_or_none(track_info.get('published_at') or track_info.get('created')) + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + + city = mobj.group('city') + genre = mobj.group('genre') + track_id = mobj.group('id') + + track_info = self._extract_info(city, genre, 
track_id) + return self._extract_track_info(track_info, track_id) + + +class TwentyTwoTracksGenreIE(TwentyTwoTracksIE): + _VALID_URL = r'https?://22tracks\.com/(?P<city>[a-z]+)/(?P<genre>[\da-z]+)/?$' + IE_NAME = '22tracks:genre' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + + city = mobj.group('city') + genre = mobj.group('genre') + + genre_title, tracks = self._extract_info(city, genre) + + entries = [ + self._extract_track_info(track_info, track_info['id']) + for track_info in tracks] + + return self.playlist_result(entries, genre, genre_title) diff --git a/youtube_dl/extractor/vessel.py b/youtube_dl/extractor/vessel.py new file mode 100644 index 000000000..123d9470e --- /dev/null +++ b/youtube_dl/extractor/vessel.py @@ -0,0 +1,127 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json + +from .common import InfoExtractor +from ..compat import compat_urllib_request +from ..utils import ( + ExtractorError, + parse_iso8601, +) + + +class VesselIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?vessel\.com/videos/(?P<id>[0-9a-zA-Z]+)' + _API_URL_TEMPLATE = 'https://www.vessel.com/api/view/items/%s' + _LOGIN_URL = 'https://www.vessel.com/api/account/login' + _NETRC_MACHINE = 'vessel' + _TEST = { + 'url': 'https://www.vessel.com/videos/HDN7G5UMs', + 'md5': '455cdf8beb71c6dd797fd2f3818d05c4', + 'info_dict': { + 'id': 'HDN7G5UMs', + 'ext': 'mp4', + 'title': 'Nvidia GeForce GTX Titan X - The Best Video Card on the Market?', + 'thumbnail': 're:^https?://.*\.jpg$', + 'upload_date': '20150317', + 'description': 'Did Nvidia pull out all the stops on the Titan X, or does its performance leave something to be desired?', + 'timestamp': int, + }, + } + + @staticmethod + def make_json_request(url, data): + payload = json.dumps(data).encode('utf-8') + req = compat_urllib_request.Request(url, payload) + req.add_header('Content-Type', 'application/json; charset=utf-8') + return req + + @staticmethod + def find_assets(data, asset_type): + for 
asset in data.get('assets', []): + if asset.get('type') == asset_type: + yield asset + + def _check_access_rights(self, data): + access_info = data.get('__view', {}) + if access_info.get('allow_access') == False: + err_code = access_info.get('error_code') or '' + if err_code == 'ITEM_PAID_ONLY': + raise ExtractorError( + 'This video requires subscription.', expected=True) + else: + raise ExtractorError( + 'Access to this content is restricted. (%s said: %s)' % (self.IE_NAME, err_code), expected=True) + + def _login(self): + (username, password) = self._get_login_info() + if username is None: + return + self.report_login() + data = { + 'client_id': 'web', + 'type': 'password', + 'user_key': username, + 'password': password, + } + login_request = VesselIE.make_json_request(self._LOGIN_URL, data) + self._download_webpage(login_request, None, False, 'Wrong login info') + + def _real_initialize(self): + self._login() + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + data = self._parse_json(self._search_regex( + r'App\.bootstrapData\((.*?)\);', webpage, 'data'), video_id) + asset_id = data['model']['data']['id'] + + req = VesselIE.make_json_request( + self._API_URL_TEMPLATE % asset_id, {'client': 'web'}) + data = self._download_json(req, video_id) + + self._check_access_rights(data) + + try: + video_asset = next(VesselIE.find_assets(data, 'video')) + except StopIteration: + raise ExtractorError('No video assets found') + + formats = [] + for f in video_asset.get('sources', []): + if f['name'] == 'hls-index': + formats.extend(self._extract_m3u8_formats( + f['location'], video_id, ext='mp4', m3u8_id='m3u8')) + else: + formats.append({ + 'format_id': f['name'], + 'tbr': f.get('bitrate'), + 'height': f.get('height'), + 'width': f.get('width'), + 'url': f['location'], + }) + self._sort_formats(formats) + + thumbnails = [] + for im_asset in VesselIE.find_assets(data, 'image'): + thumbnails.append({ + 'url': 
im_asset['location'], + 'width': im_asset.get('width', 0), + 'height': im_asset.get('height', 0), + }) + + return { + 'id': video_id, + 'title': data['title'], + 'formats': formats, + 'thumbnails': thumbnails, + 'description': data.get('short_description'), + 'duration': data.get('duration'), + 'comment_count': data.get('comment_count'), + 'like_count': data.get('like_count'), + 'view_count': data.get('view_count'), + 'timestamp': parse_iso8601(data.get('released_at')), + } diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 27c8c4453..eba699c3a 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1263,27 +1263,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): return self.playlist_result(url_results, playlist_id, title) - def _real_extract(self, url): - # Extract playlist id - mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError('Invalid URL: %s' % url) - playlist_id = mobj.group(1) or mobj.group(2) - - # Check if it's a video-specific URL - query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) - if 'v' in query_dict: - video_id = query_dict['v'][0] - if self._downloader.params.get('noplaylist'): - self.to_screen('Downloading just video %s because of --no-playlist' % video_id) - return self.url_result(video_id, 'Youtube', video_id=video_id) - else: - self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id)) - - if playlist_id.startswith('RD') or playlist_id.startswith('UL'): - # Mixes require a custom extraction process - return self._extract_mix(playlist_id) - + def _extract_playlist(self, playlist_id): url = self._TEMPLATE_URL % playlist_id page = self._download_webpage(url, playlist_id) more_widget_html = content_html = page @@ -1327,6 +1307,29 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): url_results = self._ids_to_results(ids) return self.playlist_result(url_results, 
playlist_id, playlist_title) + def _real_extract(self, url): + # Extract playlist id + mobj = re.match(self._VALID_URL, url) + if mobj is None: + raise ExtractorError('Invalid URL: %s' % url) + playlist_id = mobj.group(1) or mobj.group(2) + + # Check if it's a video-specific URL + query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) + if 'v' in query_dict: + video_id = query_dict['v'][0] + if self._downloader.params.get('noplaylist'): + self.to_screen('Downloading just video %s because of --no-playlist' % video_id) + return self.url_result(video_id, 'Youtube', video_id=video_id) + else: + self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id)) + + if playlist_id.startswith('RD') or playlist_id.startswith('UL'): + # Mixes require a custom extraction process + return self._extract_mix(playlist_id) + + return self._extract_playlist(playlist_id) + class YoutubeChannelIE(InfoExtractor): IE_DESC = 'YouTube.com channels' @@ -1643,21 +1646,27 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor): + IE_NAME = 'youtube:recommended' IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)' _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?' 
_FEED_NAME = 'recommended' _PLAYLIST_TITLE = 'Youtube Recommended videos' -class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor): +class YoutubeWatchLaterIE(YoutubePlaylistIE): + IE_NAME = 'youtube:watchlater' IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)' - _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater' + _VALID_URL = r'https?://www\.youtube\.com/(?:feed/watch_later|playlist\?list=WL)|:ytwatchlater' _FEED_NAME = 'watch_later' _PLAYLIST_TITLE = 'Youtube Watch Later' _PERSONAL_FEED = True + def _real_extract(self, url): + return self._extract_playlist('WL') + class YoutubeHistoryIE(YoutubeFeedsInfoExtractor): + IE_NAME = 'youtube:history' IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)' _VALID_URL = 'https?://www\.youtube\.com/feed/history|:ythistory' _FEED_NAME = 'history' diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 4e6e47d6f..35c7e5fb3 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -794,6 +794,11 @@ def parseOpts(overrideArguments=None): write_string('[debug] Override config: ' + repr(overrideArguments) + '\n') else: command_line_conf = sys.argv[1:] + # Workaround for Python 2.x, where argv is a byte list + if sys.version_info < (3,): + command_line_conf = [ + a.decode('utf-8', 'replace') for a in command_line_conf] + if '--ignore-config' in command_line_conf: system_conf = [] user_conf = [] diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 472d4df41..245d623d8 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -348,7 +348,7 @@ def _htmlentity_transform(entity): if entity in compat_html_entities.name2codepoint: return compat_chr(compat_html_entities.name2codepoint[entity]) - mobj = re.match(r'#(x?[0-9]+)', entity) + mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity) if mobj is not None: numstr = mobj.group(1) if numstr.startswith('x'): diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 
51b4260aa..039ceadf2 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2015.03.18' +__version__ = '2015.03.24'