From: remitamine Date: Thu, 3 Dec 2015 17:41:38 +0000 (+0100) Subject: Merge pull request #7681 from remitamine/skynewarabia X-Git-Url: http://git.cielonegro.org/gitweb.cgi?a=commitdiff_plain;h=112ab398dbf9e2b98d13364c233051182e3ed684;hp=f4c7ef98624bd407a8f8f91215cb95a2c5db9472;p=youtube-dl.git Merge pull request #7681 from remitamine/skynewarabia [skynewsarabia] Add new extractor --- diff --git a/AUTHORS b/AUTHORS index f465d20ed..cdb56de3b 100644 --- a/AUTHORS +++ b/AUTHORS @@ -146,3 +146,4 @@ Lukáš Lalinský Qijiang Fan Rémy Léone Marco Ferragina +reiv diff --git a/Makefile b/Makefile index fdb1abb60..f826c1685 100644 --- a/Makefile +++ b/Makefile @@ -61,34 +61,34 @@ youtube-dl: youtube_dl/*.py youtube_dl/*/*.py chmod a+x youtube-dl README.md: youtube_dl/*.py youtube_dl/*/*.py - COLUMNS=80 python youtube_dl/__main__.py --help | python devscripts/make_readme.py + COLUMNS=80 $(PYTHON) youtube_dl/__main__.py --help | $(PYTHON) devscripts/make_readme.py CONTRIBUTING.md: README.md - python devscripts/make_contributing.py README.md CONTRIBUTING.md + $(PYTHON) devscripts/make_contributing.py README.md CONTRIBUTING.md supportedsites: - python devscripts/make_supportedsites.py docs/supportedsites.md + $(PYTHON) devscripts/make_supportedsites.py docs/supportedsites.md README.txt: README.md pandoc -f markdown -t plain README.md -o README.txt youtube-dl.1: README.md - python devscripts/prepare_manpage.py >youtube-dl.1.temp.md + $(PYTHON) devscripts/prepare_manpage.py >youtube-dl.1.temp.md pandoc -s -f markdown -t man youtube-dl.1.temp.md -o youtube-dl.1 rm -f youtube-dl.1.temp.md youtube-dl.bash-completion: youtube_dl/*.py youtube_dl/*/*.py devscripts/bash-completion.in - python devscripts/bash-completion.py + $(PYTHON) devscripts/bash-completion.py bash-completion: youtube-dl.bash-completion youtube-dl.zsh: youtube_dl/*.py youtube_dl/*/*.py devscripts/zsh-completion.in - python devscripts/zsh-completion.py + $(PYTHON) devscripts/zsh-completion.py zsh-completion: youtube-dl.zsh youtube-dl.fish: youtube_dl/*.py youtube_dl/*/*.py devscripts/fish-completion.in - python devscripts/fish-completion.py + $(PYTHON) devscripts/fish-completion.py fish-completion: youtube-dl.fish diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py index 92765a3f9..b5a3e1167 100644 --- a/youtube_dl/downloader/hls.py +++ b/youtube_dl/downloader/hls.py @@ -13,6 +13,7 @@ from ..utils import ( encodeArgument, encodeFilename, sanitize_open, + handle_youtubedl_headers, ) @@ -33,9 +34,10 @@ class HlsFD(FileDownloader): if info_dict['http_headers'] and re.match(r'^https?://', url): # Trailing \r\n after each HTTP header is important to prevent warning from ffmpeg/avconv: # [http @ 00000000003d2fa0] No trailing CRLF found in HTTP header. + headers = handle_youtubedl_headers(info_dict['http_headers']) args += [ '-headers', - ''.join('%s: %s\r\n' % (key, val) for key, val in info_dict['http_headers'].items() if key.lower() != 'accept-encoding')] + ''.join('%s: %s\r\n' % (key, val) for key, val in headers.items())] args += ['-i', url, '-f', 'mp4', '-c', 'copy', '-bsf:a', 'aac_adtstoasc'] diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 33b296eaf..7fb80aa38 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -22,7 +22,8 @@ from ..compat import ( class BBCCoUkIE(InfoExtractor): IE_NAME = 'bbc.co.uk' IE_DESC = 'BBC iPlayer' - _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:(?:programmes/(?!articles/)|iplayer(?:/[^/]+)?/(?:episode/|playlist/))|music/clips[/#])(?P[\da-z]{8})' + _ID_REGEX = r'[pb][\da-z]{7}' + _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:(?:programmes/(?!articles/)|iplayer(?:/[^/]+)?/(?:episode/|playlist/))|music/clips[/#])(?P%s)' % _ID_REGEX _MEDIASELECTOR_URLS = [ # Provides HQ HLS streams with even better quality that pc mediaset but fails @@ -465,7 +466,7 @@ class BBCCoUkIE(InfoExtractor): if not programme_id: programme_id = self._search_regex( - r'"vpid"\s*:\s*"([\da-z]{8})"', webpage, 'vpid', fatal=False, default=None) + r'"vpid"\s*:\s*"(%s)"' % self._ID_REGEX, webpage, 'vpid', fatal=False, default=None) if programme_id: formats, subtitles = self._download_media_selector(programme_id) @@ -780,8 +781,9 @@ class BBCIE(BBCCoUkIE): # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret) programme_id = self._search_regex( - [r'data-video-player-vpid="([\da-z]{8})"', - r']+name="externalIdentifier"[^>]+value="([\da-z]{8})"'], + [r'data-video-player-vpid="(%s)"' % self._ID_REGEX, + r']+name="externalIdentifier"[^>]+value="(%s)"' % self._ID_REGEX, + r'videoId\s*:\s*["\'](%s)["\']' % self._ID_REGEX], webpage, 'vpid', default=None) if programme_id: @@ -816,7 +818,7 @@ class BBCIE(BBCCoUkIE): # Multiple video article (e.g. # http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460) - EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+[\da-z]{8}(?:\b[^"]+)?' + EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+%s(?:\b[^"]+)?' % self._ID_REGEX entries = [] for match in extract_all(r'new\s+SMP\(({.+?})\)'): embed_url = match.get('playerSettings', {}).get('externalEmbedUrl') diff --git a/youtube_dl/extractor/beeg.py b/youtube_dl/extractor/beeg.py index 61bc2f744..1ee4a8b05 100644 --- a/youtube_dl/extractor/beeg.py +++ b/youtube_dl/extractor/beeg.py @@ -29,7 +29,7 @@ class BeegIE(InfoExtractor): video_id = self._match_id(url) video = self._download_json( - 'http://beeg.com/api/v1/video/%s' % video_id, video_id) + 'http://beeg.com/api/v3/video/%s' % video_id, video_id) formats = [] for format_id, video_url in video.items(): diff --git a/youtube_dl/extractor/bloomberg.py b/youtube_dl/extractor/bloomberg.py index 11ace91dd..ebeef8f2a 100644 --- a/youtube_dl/extractor/bloomberg.py +++ b/youtube_dl/extractor/bloomberg.py @@ -6,7 +6,7 @@ from .common import InfoExtractor class BloombergIE(InfoExtractor): - _VALID_URL = r'https?://www\.bloomberg\.com/news/[^/]+/[^/]+/(?P[^/?#]+)' + _VALID_URL = r'https?://(?:www\.)?bloomberg\.com/(?:[^/]+/)*(?P[^/?#]+)' _TESTS = [{ 'url': 'http://www.bloomberg.com/news/videos/b/aaeae121-5949-481e-a1ce-4562db6f5df2', @@ -20,22 +20,36 @@ class BloombergIE(InfoExtractor): }, { 'url': 'http://www.bloomberg.com/news/articles/2015-11-12/five-strange-things-that-have-been-happening-in-financial-markets', 'only_matching': True, + }, { + 'url': 'http://www.bloomberg.com/politics/videos/2015-11-25/karl-rove-on-jeb-bush-s-struggles-stopping-trump', + 'only_matching': True, }] def _real_extract(self, url): name = self._match_id(url) webpage = self._download_webpage(url, name) - video_id = self._search_regex(r'"bmmrId":"(.+?)"', webpage, 'id') + video_id = self._search_regex( + r'["\']bmmrId["\']\s*:\s*(["\'])(?P.+?)\1', + webpage, 'id', group='url') title = re.sub(': Video$', '', self._og_search_title(webpage)) embed_info = self._download_json( 'http://www.bloomberg.com/api/embed?id=%s' % video_id, video_id) formats = [] for stream in embed_info['streams']: - if stream["muxing_format"] == "TS": - formats.extend(self._extract_m3u8_formats(stream['url'], video_id)) + stream_url = stream.get('url') + if not stream_url: + continue + if stream['muxing_format'] == 'TS': + m3u8_formats = self._extract_m3u8_formats( + stream_url, video_id, 'mp4', m3u8_id='hls', fatal=False) + if m3u8_formats: + formats.extend(m3u8_formats) else: - formats.extend(self._extract_f4m_formats(stream['url'], video_id)) + f4m_formats = self._extract_f4m_formats( + stream_url, video_id, f4m_id='hds', fatal=False) + if f4m_formats: + formats.extend(f4m_formats) self._sort_formats(formats) return { diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index eb9bfa3d1..6ab2d68d6 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -167,7 +167,7 @@ class InfoExtractor(object): "ext" will be calculated from URL if missing automatic_captions: Like 'subtitles', used by the YoutubeIE for automatically generated captions - duration: Length of the video in seconds, as an integer. + duration: Length of the video in seconds, as an integer or float. view_count: How many users have watched the video on the platform. like_count: Number of positive ratings of the video dislike_count: Number of negative ratings of the video diff --git a/youtube_dl/extractor/cspan.py b/youtube_dl/extractor/cspan.py index fbefd37d0..7b685d157 100644 --- a/youtube_dl/extractor/cspan.py +++ b/youtube_dl/extractor/cspan.py @@ -9,6 +9,7 @@ from ..utils import ( find_xpath_attr, smuggle_url, determine_ext, + ExtractorError, ) from .senateisvp import SenateISVPIE @@ -18,33 +19,32 @@ class CSpanIE(InfoExtractor): IE_DESC = 'C-SPAN' _TESTS = [{ 'url': 'http://www.c-span.org/video/?313572-1/HolderonV', - 'md5': '8e44ce11f0f725527daccc453f553eb0', + 'md5': '94b29a4f131ff03d23471dd6f60b6a1d', 'info_dict': { 'id': '315139', 'ext': 'mp4', 'title': 'Attorney General Eric Holder on Voting Rights Act Decision', - 'description': 'Attorney General Eric Holder spoke to reporters following the Supreme Court decision in Shelby County v. Holder in which the court ruled that the preclearance provisions of the Voting Rights Act could not be enforced until Congress established new guidelines for review.', + 'description': 'Attorney General Eric Holder speaks to reporters following the Supreme Court decision in [Shelby County v. Holder], in which the court ruled that the preclearance provisions of the Voting Rights Act could not be enforced.', }, 'skip': 'Regularly fails on travis, for unknown reasons', }, { 'url': 'http://www.c-span.org/video/?c4486943/cspan-international-health-care-models', - # For whatever reason, the served video alternates between - # two different ones + 'md5': '8e5fbfabe6ad0f89f3012a7943c1287b', 'info_dict': { - 'id': '340723', + 'id': 'c4486943', 'ext': 'mp4', - 'title': 'International Health Care Models', + 'title': 'CSPAN - International Health Care Models', 'description': 'md5:7a985a2d595dba00af3d9c9f0783c967', } }, { 'url': 'http://www.c-span.org/video/?318608-1/gm-ignition-switch-recall', - 'md5': '446562a736c6bf97118e389433ed88d4', + 'md5': '2ae5051559169baadba13fc35345ae74', 'info_dict': { 'id': '342759', 'ext': 'mp4', 'title': 'General Motors Ignition Switch Recall', 'duration': 14848, - 'description': 'md5:70c7c3b8fa63fa60d42772440596034c' + 'description': 'md5:118081aedd24bf1d3b68b3803344e7f3' }, }, { # Video from senate.gov @@ -57,67 +57,77 @@ class CSpanIE(InfoExtractor): }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - page_id = mobj.group('id') - webpage = self._download_webpage(url, page_id) - video_id = self._search_regex(r'progid=\'?([0-9]+)\'?>', webpage, 'video id') + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + matches = re.search(r'data-(prog|clip)id=\'([0-9]+)\'', webpage) + if matches: + video_type, video_id = matches.groups() + if video_type == 'prog': + video_type = 'program' + else: + senate_isvp_url = SenateISVPIE._search_iframe_url(webpage) + if senate_isvp_url: + title = self._og_search_title(webpage) + surl = smuggle_url(senate_isvp_url, {'force_title': title}) + return self.url_result(surl, 'SenateISVP', video_id, title) - description = self._html_search_regex( - [ - # The full description - r'
(.*?)(.*?)

' - ], - webpage, 'description', flags=re.DOTALL, default=None) + def get_text_attr(d, attr): + return d.get(attr, {}).get('#text') - info_url = 'http://c-spanvideo.org/videoLibrary/assets/player/ajax-player.php?os=android&html5=program&id=' + video_id - data = self._download_json(info_url, video_id) + data = self._download_json( + 'http://www.c-span.org/assets/player/ajax-player.php?os=android&html5=%s&id=%s' % (video_type, video_id), + video_id)['video'] + if data['@status'] != 'Success': + raise ExtractorError('%s said: %s' % (self.IE_NAME, get_text_attr(data, 'error')), expected=True) doc = self._download_xml( - 'http://www.c-span.org/common/services/flashXml.php?programid=' + video_id, + 'http://www.c-span.org/common/services/flashXml.php?%sid=%s' % (video_type, video_id), video_id) + description = self._html_search_meta('description', webpage) + title = find_xpath_attr(doc, './/string', 'name', 'title').text thumbnail = find_xpath_attr(doc, './/string', 'name', 'poster').text - senate_isvp_url = SenateISVPIE._search_iframe_url(webpage) - if senate_isvp_url: - surl = smuggle_url(senate_isvp_url, {'force_title': title}) - return self.url_result(surl, 'SenateISVP', video_id, title) - - files = data['video']['files'] - try: - capfile = data['video']['capfile']['#text'] - except KeyError: - capfile = None + files = data['files'] + capfile = get_text_attr(data, 'capfile') - entries = [{ - 'id': '%s_%d' % (video_id, partnum + 1), - 'title': ( - title if len(files) == 1 else - '%s part %d' % (title, partnum + 1)), - 'url': unescapeHTML(f['path']['#text']), - 'description': description, - 'thumbnail': thumbnail, - 'duration': int_or_none(f.get('length', {}).get('#text')), - 'subtitles': { - 'en': [{ - 'url': capfile, - 'ext': determine_ext(capfile, 'dfxp') - }], - } if capfile else None, - } for partnum, f in enumerate(files)] + entries = [] + for partnum, f in enumerate(files): + formats = [] + for quality in f['qualities']: + formats.append({ + 'format_id': '%s-%sp' % (get_text_attr(quality, 'bitrate'), get_text_attr(quality, 'height')), + 'url': unescapeHTML(get_text_attr(quality, 'file')), + 'height': int_or_none(get_text_attr(quality, 'height')), + 'tbr': int_or_none(get_text_attr(quality, 'bitrate')), + }) + self._sort_formats(formats) + entries.append({ + 'id': '%s_%d' % (video_id, partnum + 1), + 'title': ( + title if len(files) == 1 else + '%s part %d' % (title, partnum + 1)), + 'formats': formats, + 'description': description, + 'thumbnail': thumbnail, + 'duration': int_or_none(get_text_attr(f, 'length')), + 'subtitles': { + 'en': [{ + 'url': capfile, + 'ext': determine_ext(capfile, 'dfxp') + }], + } if capfile else None, + }) if len(entries) == 1: entry = dict(entries[0]) - entry['id'] = video_id + entry['id'] = 'c' + video_id if video_type == 'clip' else video_id return entry else: return { '_type': 'playlist', 'entries': entries, 'title': title, - 'id': video_id, + 'id': 'c' + video_id if video_type == 'clip' else video_id, } diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index fd854411b..321eec59e 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -164,7 +164,7 @@ class FacebookIE(InfoExtractor): if not video_title: video_title = self._html_search_regex( r'(?s)(.*?)', - webpage, 'alternative title', fatal=False) + webpage, 'alternative title', default=None) video_title = limit_length(video_title, 80) if not video_title: video_title = 'Facebook video #%s' % video_id diff --git a/youtube_dl/extractor/gametrailers.py b/youtube_dl/extractor/gametrailers.py index a6ab795ae..c3f031d9c 100644 --- a/youtube_dl/extractor/gametrailers.py +++ b/youtube_dl/extractor/gametrailers.py @@ -1,19 +1,62 @@ from __future__ import unicode_literals -from .mtv import MTVServicesInfoExtractor +from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_age_limit, + url_basename, +) -class GametrailersIE(MTVServicesInfoExtractor): - _VALID_URL = r'http://www\.gametrailers\.com/(?Pvideos|reviews|full-episodes)/(?P.*?)/(?P.*)' +class GametrailersIE(InfoExtractor): + _VALID_URL = r'http://www\.gametrailers\.com/videos/view/[^/]+/(?P<id>.+)' + _TEST = { - 'url': 'http://www.gametrailers.com/videos/zbvr8i/mirror-s-edge-2-e3-2013--debut-trailer', - 'md5': '4c8e67681a0ea7ec241e8c09b3ea8cf7', + 'url': 'http://www.gametrailers.com/videos/view/gametrailers-com/116437-Just-Cause-3-Review', + 'md5': 'f28c4efa0bdfaf9b760f6507955b6a6a', 'info_dict': { - 'id': '70e9a5d7-cf25-4a10-9104-6f3e7342ae0d', + 'id': '2983958', 'ext': 'mp4', - 'title': 'E3 2013: Debut Trailer', - 'description': 'Faith is back! Check out the World Premiere trailer for Mirror\'s Edge 2 straight from the EA Press Conference at E3 2013!', + 'display_id': '116437-Just-Cause-3-Review', + 'title': 'Just Cause 3 - Review', + 'description': 'It\'s a lot of fun to shoot at things and then watch them explode in Just Cause 3, but should there be more to the experience than that?', }, } - _FEED_URL = 'http://www.gametrailers.com/feeds/mrss' + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + title = self._html_search_regex( + r'<title>(.+?)\|', webpage, 'title').strip() + embed_url = self._proto_relative_url( + self._search_regex( + r'src=\'(//embed.gametrailers.com/embed/[^\']+)\'', webpage, + 'embed url'), + scheme='http:') + video_id = url_basename(embed_url) + embed_page = self._download_webpage(embed_url, video_id) + embed_vars_json = self._search_regex( + r'(?s)var embedVars = (\{.*?\})\s*</script>', embed_page, + 'embed vars') + info = self._parse_json(embed_vars_json, video_id) + + formats = [] + for media in info['media']: + if media['mediaPurpose'] == 'play': + formats.append({ + 'url': media['uri'], + 'height': media['height'], + 'width:': media['width'], + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'formats': formats, + 'thumbnail': info.get('thumbUri'), + 'description': self._og_search_description(webpage), + 'duration': int_or_none(info.get('videoLengthInSeconds')), + 'age_limit': parse_age_limit(info.get('audienceRating')), + } diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 8ac38a174..6ff13050d 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -6,6 +6,7 @@ import re from .common import InfoExtractor from ..compat import compat_urlparse from ..utils import ( + determine_ext, ExtractorError, float_or_none, parse_duration, @@ -48,12 +49,22 @@ class NRKIE(InfoExtractor): 'http://v8.psapi.nrk.no/mediaelement/%s' % video_id, video_id, 'Downloading media JSON') - if data['usageRights']['isGeoBlocked']: - raise ExtractorError( - 'NRK har ikke rettigheter til å vise dette programmet utenfor Norge', - expected=True) + media_url = data.get('mediaUrl') - video_url = data['mediaUrl'] + '?hdcore=3.5.0&plugin=aasp-3.5.0.151.81' + if not media_url: + if data['usageRights']['isGeoBlocked']: + raise ExtractorError( + 'NRK har ikke rettigheter til å vise dette programmet utenfor Norge', + expected=True) + + if determine_ext(media_url) == 'f4m': + formats = self._extract_f4m_formats( + media_url + '?hdcore=3.5.0&plugin=aasp-3.5.0.151.81', video_id, f4m_id='hds') + else: + formats = [{ + 'url': media_url, + 'ext': 'flv', + }] duration = parse_duration(data.get('duration')) @@ -67,12 +78,11 @@ class NRKIE(InfoExtractor): return { 'id': video_id, - 'url': video_url, - 'ext': 'flv', 'title': data['title'], 'description': data['description'], 'duration': duration, 'thumbnail': thumbnail, + 'formats': formats, } diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 965940a4b..08275687d 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -147,7 +147,8 @@ class PornHubPlaylistIE(InfoExtractor): entries = [ self.url_result('http://www.pornhub.com/%s' % video_url, 'PornHub') - for video_url in set(re.findall('href="/?(view_video\.php\?viewkey=\d+[^"]*)"', webpage)) + for video_url in set(re.findall( + r'href="/?(view_video\.php\?.*\bviewkey=[\da-z]+[^"]*)"', webpage)) ] playlist = self._parse_json( diff --git a/youtube_dl/extractor/spiegel.py b/youtube_dl/extractor/spiegel.py index 5bd3c0087..39a7aaf9d 100644 --- a/youtube_dl/extractor/spiegel.py +++ b/youtube_dl/extractor/spiegel.py @@ -58,7 +58,8 @@ class SpiegelIE(InfoExtractor): description = self._html_search_meta('description', webpage, 'description') base_url = self._search_regex( - r'var\s+server\s*=\s*"([^"]+)\"', webpage, 'server URL') + [r'server\s*:\s*(["\'])(?P<url>.+?)\1', r'var\s+server\s*=\s*"(?P<url>[^"]+)\"'], + webpage, 'server URL', group='url') xml_url = base_url + video_id + '.xml' idoc = self._download_xml(xml_url, video_id) diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py index 825172806..59832b1ec 100644 --- a/youtube_dl/extractor/udemy.py +++ b/youtube_dl/extractor/udemy.py @@ -1,14 +1,15 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..compat import ( + compat_HTTPError, compat_urllib_parse, compat_urllib_request, ) from ..utils import ( ExtractorError, + float_or_none, + int_or_none, sanitized_Request, ) @@ -18,6 +19,8 @@ class UdemyIE(InfoExtractor): _VALID_URL = r'https?://www\.udemy\.com/(?:[^#]+#/lecture/|lecture/view/?\?lectureId=)(?P<id>\d+)' _LOGIN_URL = 'https://www.udemy.com/join/login-popup/?displayType=ajax&showSkipButton=1' _ORIGIN_URL = 'https://www.udemy.com' + _SUCCESSFULLY_ENROLLED = '>You have enrolled in this course!<' + _ALREADY_ENROLLED = '>You are already taking this course.<' _NETRC_MACHINE = 'udemy' _TESTS = [{ @@ -33,6 +36,29 @@ class UdemyIE(InfoExtractor): 'skip': 'Requires udemy account credentials', }] + def _enroll_course(self, webpage, course_id): + enroll_url = self._search_regex( + r'href=(["\'])(?P<url>https?://(?:www\.)?udemy\.com/course/subscribe/.+?)\1', + webpage, 'enroll url', group='url', + default='https://www.udemy.com/course/subscribe/?courseId=%s' % course_id) + webpage = self._download_webpage(enroll_url, course_id, 'Enrolling in the course') + if self._SUCCESSFULLY_ENROLLED in webpage: + self.to_screen('%s: Successfully enrolled in' % course_id) + elif self._ALREADY_ENROLLED in webpage: + self.to_screen('%s: Already enrolled in' % course_id) + + def _download_lecture(self, course_id, lecture_id): + return self._download_json( + 'https://www.udemy.com/api-2.0/users/me/subscribed-courses/%s/lectures/%s?%s' % ( + course_id, lecture_id, compat_urllib_parse.urlencode({ + 'video_only': '', + 'auto_play': '', + 'fields[lecture]': 'title,description,asset', + 'fields[asset]': 'asset_type,stream_url,thumbnail_url,download_urls,data', + 'instructorPreviewMode': 'False', + })), + lecture_id, 'Downloading lecture JSON') + def _handle_error(self, response): if not isinstance(response, dict): return @@ -54,6 +80,7 @@ class UdemyIE(InfoExtractor): headers['X-Udemy-Client-Id'] = cookie.value elif cookie.name == 'access_token': headers['X-Udemy-Bearer-Token'] = cookie.value + headers['X-Udemy-Authorization'] = 'Bearer %s' % cookie.value if isinstance(url_or_request, compat_urllib_request.Request): for header, value in headers.items(): @@ -71,7 +98,7 @@ class UdemyIE(InfoExtractor): def _login(self): (username, password) = self._get_login_info() if username is None: - self.raise_login_required('Udemy account is required') + return login_popup = self._download_webpage( self._LOGIN_URL, None, 'Downloading login popup') @@ -109,44 +136,76 @@ class UdemyIE(InfoExtractor): def _real_extract(self, url): lecture_id = self._match_id(url) - lecture = self._download_json( - 'https://www.udemy.com/api-1.1/lectures/%s' % lecture_id, - lecture_id, 'Downloading lecture JSON') + webpage = self._download_webpage(url, lecture_id) + + course_id = self._search_regex( + r'data-course-id=["\'](\d+)', webpage, 'course id') + + try: + lecture = self._download_lecture(course_id, lecture_id) + except ExtractorError as e: + # Error could possibly mean we are not enrolled in the course + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + self._enroll_course(webpage, course_id) + lecture_id = self._download_lecture(course_id, lecture_id) + else: + raise + + title = lecture['title'] + description = lecture.get('description') - asset_type = lecture.get('assetType') or lecture.get('asset_type') + asset = lecture['asset'] + + asset_type = asset.get('assetType') or asset.get('asset_type') if asset_type != 'Video': raise ExtractorError( 'Lecture %s is not a video' % lecture_id, expected=True) - asset = lecture['asset'] - stream_url = asset.get('streamUrl') or asset.get('stream_url') - mobj = re.search(r'(https?://www\.youtube\.com/watch\?v=.*)', stream_url) - if mobj: - return self.url_result(mobj.group(1), 'Youtube') + if stream_url: + youtube_url = self._search_regex( + r'(https?://www\.youtube\.com/watch\?v=.*)', stream_url, 'youtube URL', default=None) + if youtube_url: + return self.url_result(youtube_url, 'Youtube') video_id = asset['id'] thumbnail = asset.get('thumbnailUrl') or asset.get('thumbnail_url') - duration = asset['data']['duration'] - - download_url = asset.get('downloadUrl') or asset.get('download_url') - - video = download_url.get('Video') or download_url.get('video') - video_480p = download_url.get('Video480p') or download_url.get('video_480p') - - formats = [ - { - 'url': video_480p[0], - 'format_id': '360p', - }, - { - 'url': video[0], - 'format_id': '720p', - }, - ] - - title = lecture['title'] - description = lecture['description'] + duration = float_or_none(asset.get('data', {}).get('duration')) + outputs = asset.get('data', {}).get('outputs', {}) + + formats = [] + for format_ in asset.get('download_urls', {}).get('Video', []): + video_url = format_.get('file') + if not video_url: + continue + format_id = format_.get('label') + f = { + 'url': format_['file'], + 'height': int_or_none(format_id), + } + if format_id: + # Some videos contain additional metadata (e.g. + # https://www.udemy.com/ios9-swift/learn/#/lecture/3383208) + output = outputs.get(format_id) + if isinstance(output, dict): + f.update({ + 'format_id': '%sp' % (output.get('label') or format_id), + 'width': int_or_none(output.get('width')), + 'height': int_or_none(output.get('height')), + 'vbr': int_or_none(output.get('video_bitrate_in_kbps')), + 'vcodec': output.get('video_codec'), + 'fps': int_or_none(output.get('frame_rate')), + 'abr': int_or_none(output.get('audio_bitrate_in_kbps')), + 'acodec': output.get('audio_codec'), + 'asr': int_or_none(output.get('audio_sample_rate')), + 'tbr': int_or_none(output.get('total_bitrate_in_kbps')), + 'filesize': int_or_none(output.get('file_size_in_bytes')), + }) + else: + f['format_id'] = '%sp' % format_id + formats.append(f) + + self._sort_formats(formats) return { 'id': video_id, @@ -160,9 +219,7 @@ class UdemyIE(InfoExtractor): class UdemyCourseIE(UdemyIE): IE_NAME = 'udemy:course' - _VALID_URL = r'https?://www\.udemy\.com/(?P<coursepath>[\da-z-]+)' - _SUCCESSFULLY_ENROLLED = '>You have enrolled in this course!<' - _ALREADY_ENROLLED = '>You are already taking this course.<' + _VALID_URL = r'https?://www\.udemy\.com/(?P<id>[\da-z-]+)' _TESTS = [] @classmethod @@ -170,24 +227,18 @@ class UdemyCourseIE(UdemyIE): return False if UdemyIE.suitable(url) else super(UdemyCourseIE, cls).suitable(url) def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - course_path = mobj.group('coursepath') + course_path = self._match_id(url) + + webpage = self._download_webpage(url, course_path) response = self._download_json( 'https://www.udemy.com/api-1.1/courses/%s' % course_path, course_path, 'Downloading course JSON') - course_id = int(response['id']) - course_title = response['title'] + course_id = response['id'] + course_title = response.get('title') - webpage = self._download_webpage( - 'https://www.udemy.com/course/subscribe/?courseId=%s' % course_id, - course_id, 'Enrolling in the course') - - if self._SUCCESSFULLY_ENROLLED in webpage: - self.to_screen('%s: Successfully enrolled in' % course_id) - elif self._ALREADY_ENROLLED in webpage: - self.to_screen('%s: Already enrolled in' % course_id) + self._enroll_course(webpage, course_id) response = self._download_json( 'https://www.udemy.com/api-1.1/courses/%s/curriculum' % course_id, diff --git a/youtube_dl/extractor/vodlocker.py b/youtube_dl/extractor/vodlocker.py index be0a2780f..357594a11 100644 --- a/youtube_dl/extractor/vodlocker.py +++ b/youtube_dl/extractor/vodlocker.py @@ -3,11 +3,14 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..compat import compat_urllib_parse -from ..utils import sanitized_Request +from ..utils import ( + ExtractorError, + sanitized_Request, +) class VodlockerIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?vodlocker\.com/(?P<id>[0-9a-zA-Z]+)(?:\..*?)?' + _VALID_URL = r'https?://(?:www\.)?vodlocker\.com/(?:embed-)?(?P<id>[0-9a-zA-Z]+)(?:\..*?)?' _TESTS = [{ 'url': 'http://vodlocker.com/e8wvyzz4sl42', @@ -24,6 +27,12 @@ class VodlockerIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) + if any(p in webpage for p in ( + '>THIS FILE WAS DELETED<', + '>File Not Found<', + 'The file you were looking for could not be found, sorry for any inconvenience.<')): + raise ExtractorError('Video %s does not exist' % video_id, expected=True) + fields = self._hidden_inputs(webpage) if fields['op'] == 'download1': diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index cfe9eed55..9b39505ba 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -258,7 +258,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): |(?: # or the v= param in all its forms (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx) (?:\?|\#!?) # the params delimiter ? or # or #! - (?:.*?&)?? # any other preceding param (like /?s=tuff&v=xxxx) + (?:.*?[&;])?? # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&v=V36LpHqtcDY) v= ) )) @@ -346,6 +346,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, + # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug) '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'vp9'}, '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'vp9'}, @@ -714,6 +715,26 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'url': 'https://www.youtube.com/watch?v=Ms7iBXnlUO8', 'only_matching': True, }, + { + # Video with yt:stretch=17:0 + 'url': 'https://www.youtube.com/watch?v=Q39EVAstoRM', + 'info_dict': { + 'id': 'Q39EVAstoRM', + 'ext': 'mp4', + 'title': 'Clash Of Clans#14 Dicas De Ataque Para CV 4', + 'description': 'md5:ee18a25c350637c8faff806845bddee9', + 'upload_date': '20151107', + 'uploader_id': 'UCCr7TALkRbo3EtFzETQF1LA', + 'uploader': 'CH GAMER DROID', + }, + 'params': { + 'skip_download': True, + }, + }, + { + 'url': 'https://www.youtube.com/watch?feature=player_embedded&amp;v=V36LpHqtcDY', + 'only_matching': True, + } ] def __init__(self, *args, **kwargs): @@ -1459,6 +1480,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): manifest_url = video_info['hlsvp'][0] url_map = self._extract_from_m3u8(manifest_url, video_id) formats = _map_to_format_list(url_map) + # Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming + for a_format in formats: + a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True' else: raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info') @@ -1496,10 +1520,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor): r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">', video_webpage) if stretched_m: - ratio = float(stretched_m.group('w')) / float(stretched_m.group('h')) - for f in formats: - if f.get('vcodec') != 'none': - f['stretched_ratio'] = ratio + w = float(stretched_m.group('w')) + h = float(stretched_m.group('h')) + # yt:stretch may hold invalid ratio data (e.g. for Q39EVAstoRM ratio is 17:0). + # We will only process correct ratios. + if w > 0 and h > 0: + ratio = w / h + for f in formats: + if f.get('vcodec') != 'none': + f['stretched_ratio'] = ratio self._sort_formats(formats) @@ -1538,7 +1567,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor, YoutubePlaylistBaseInfoExtract youtube\.com/ (?: (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/videoseries) - \? (?:.*?&)*? (?:p|a|list)= + \? (?:.*?[&;])*? (?:p|a|list)= | p/ ) ( diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 359e8d300..c46e136bf 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -338,7 +338,7 @@ def parseOpts(overrideArguments=None): video_format.add_option( '-F', '--list-formats', action='store_true', dest='listformats', - help='List all available formats of specified videos') + help='List all available formats of requested videos') video_format.add_option( '--youtube-include-dash-manifest', action='store_true', dest='youtube_include_dash_manifest', default=True, diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index d7b737e21..d0606b4bc 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -663,6 +663,16 @@ def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs): return hc +def handle_youtubedl_headers(headers): + filtered_headers = headers + + if 'Youtubedl-no-compression' in filtered_headers: + filtered_headers = dict((k, v) for k, v in filtered_headers.items() if k.lower() != 'accept-encoding') + del filtered_headers['Youtubedl-no-compression'] + + return filtered_headers + + class YoutubeDLHandler(compat_urllib_request.HTTPHandler): """Handler for HTTP requests and responses. @@ -670,7 +680,7 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler): the standard headers to every HTTP request and handles gzipped and deflated responses from web servers. If compression is to be avoided in a particular request, the original request in the program code only has - to include the HTTP header "Youtubedl-No-Compression", which will be + to include the HTTP header "Youtubedl-no-compression", which will be removed before making the real request. Part of this code was copied from: @@ -731,10 +741,8 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler): # The dict keys are capitalized because of this bug by urllib if h.capitalize() not in req.headers: req.add_header(h, v) - if 'Youtubedl-no-compression' in req.headers: - if 'Accept-encoding' in req.headers: - del req.headers['Accept-encoding'] - del req.headers['Youtubedl-no-compression'] + + req.headers = handle_youtubedl_headers(req.headers) if sys.version_info < (2, 7) and '#' in req.get_full_url(): # Python 2.6 is brain-dead when it comes to fragments