From: remitamine Date: Thu, 3 Dec 2015 19:28:52 +0000 (+0100) Subject: Merge pull request #7659 from remitamine/audimedia X-Git-Url: http://git.cielonegro.org/gitweb.cgi?a=commitdiff_plain;h=24dc1ed715239f85eb3d5f71a707da1dd2bc7773;hp=527ca1da4f778c5ad13ff3c74a8d311adb9aaff3;p=youtube-dl.git Merge pull request #7659 from remitamine/audimedia [audimedia] Add new extractor(closes #7654) --- diff --git a/AUTHORS b/AUTHORS index f465d20ed..cdb56de3b 100644 --- a/AUTHORS +++ b/AUTHORS @@ -146,3 +146,4 @@ Lukáš Lalinský Qijiang Fan Rémy Léone Marco Ferragina +reiv diff --git a/Makefile b/Makefile index fdb1abb60..f826c1685 100644 --- a/Makefile +++ b/Makefile @@ -61,34 +61,34 @@ youtube-dl: youtube_dl/*.py youtube_dl/*/*.py chmod a+x youtube-dl README.md: youtube_dl/*.py youtube_dl/*/*.py - COLUMNS=80 python youtube_dl/__main__.py --help | python devscripts/make_readme.py + COLUMNS=80 $(PYTHON) youtube_dl/__main__.py --help | $(PYTHON) devscripts/make_readme.py CONTRIBUTING.md: README.md - python devscripts/make_contributing.py README.md CONTRIBUTING.md + $(PYTHON) devscripts/make_contributing.py README.md CONTRIBUTING.md supportedsites: - python devscripts/make_supportedsites.py docs/supportedsites.md + $(PYTHON) devscripts/make_supportedsites.py docs/supportedsites.md README.txt: README.md pandoc -f markdown -t plain README.md -o README.txt youtube-dl.1: README.md - python devscripts/prepare_manpage.py >youtube-dl.1.temp.md + $(PYTHON) devscripts/prepare_manpage.py >youtube-dl.1.temp.md pandoc -s -f markdown -t man youtube-dl.1.temp.md -o youtube-dl.1 rm -f youtube-dl.1.temp.md youtube-dl.bash-completion: youtube_dl/*.py youtube_dl/*/*.py devscripts/bash-completion.in - python devscripts/bash-completion.py + $(PYTHON) devscripts/bash-completion.py bash-completion: youtube-dl.bash-completion youtube-dl.zsh: youtube_dl/*.py youtube_dl/*/*.py devscripts/zsh-completion.in - python devscripts/zsh-completion.py + $(PYTHON) devscripts/zsh-completion.py zsh-completion: youtube-dl.zsh youtube-dl.fish: youtube_dl/*.py youtube_dl/*/*.py devscripts/fish-completion.in - python devscripts/fish-completion.py + $(PYTHON) devscripts/fish-completion.py fish-completion: youtube-dl.fish diff --git a/README.md b/README.md index b85e08e78..df419abe8 100644 --- a/README.md +++ b/README.md @@ -319,7 +319,8 @@ which means you can modify it, redistribute it or use it however you like. --all-formats Download all available video formats --prefer-free-formats Prefer free video formats unless a specific one is requested - -F, --list-formats List all available formats + -F, --list-formats List all available formats of specified + videos --youtube-skip-dash-manifest Do not download the DASH manifests and related data on YouTube videos --merge-output-format FORMAT If a merge is required (e.g. diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py index 9a83a73dd..b5a3e1167 100644 --- a/youtube_dl/downloader/hls.py +++ b/youtube_dl/downloader/hls.py @@ -13,6 +13,7 @@ from ..utils import ( encodeArgument, encodeFilename, sanitize_open, + handle_youtubedl_headers, ) @@ -33,9 +34,10 @@ class HlsFD(FileDownloader): if info_dict['http_headers'] and re.match(r'^https?://', url): # Trailing \r\n after each HTTP header is important to prevent warning from ffmpeg/avconv: # [http @ 00000000003d2fa0] No trailing CRLF found in HTTP header. + headers = handle_youtubedl_headers(info_dict['http_headers']) args += [ '-headers', - ''.join('%s: %s\r\n' % (key, val) for key, val in info_dict['http_headers'].items())] + ''.join('%s: %s\r\n' % (key, val) for key, val in headers.items())] args += ['-i', url, '-f', 'mp4', '-c', 'copy', '-bsf:a', 'aac_adtstoasc'] diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index f17c22691..62f32f8c8 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -555,6 +555,10 @@ from .shahid import ShahidIE from .shared import SharedIE from .sharesix import ShareSixIE from .sina import SinaIE +from .skynewsarabia import ( + SkyNewsArabiaIE, + SkyNewsArabiaArticleIE, +) from .slideshare import SlideshareIE from .slutload import SlutloadIE from .smotri import ( diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 33b296eaf..7fb80aa38 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -22,7 +22,8 @@ from ..compat import ( class BBCCoUkIE(InfoExtractor): IE_NAME = 'bbc.co.uk' IE_DESC = 'BBC iPlayer' - _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:(?:programmes/(?!articles/)|iplayer(?:/[^/]+)?/(?:episode/|playlist/))|music/clips[/#])(?P[\da-z]{8})' + _ID_REGEX = r'[pb][\da-z]{7}' + _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:(?:programmes/(?!articles/)|iplayer(?:/[^/]+)?/(?:episode/|playlist/))|music/clips[/#])(?P%s)' % _ID_REGEX _MEDIASELECTOR_URLS = [ # Provides HQ HLS streams with even better quality that pc mediaset but fails @@ -465,7 +466,7 @@ class BBCCoUkIE(InfoExtractor): if not programme_id: programme_id = self._search_regex( - r'"vpid"\s*:\s*"([\da-z]{8})"', webpage, 'vpid', fatal=False, default=None) + r'"vpid"\s*:\s*"(%s)"' % self._ID_REGEX, webpage, 'vpid', fatal=False, default=None) if programme_id: formats, subtitles = self._download_media_selector(programme_id) @@ -780,8 +781,9 @@ class BBCIE(BBCCoUkIE): # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret) programme_id = self._search_regex( - [r'data-video-player-vpid="([\da-z]{8})"', - r']+name="externalIdentifier"[^>]+value="([\da-z]{8})"'], + [r'data-video-player-vpid="(%s)"' % self._ID_REGEX, + r']+name="externalIdentifier"[^>]+value="(%s)"' % self._ID_REGEX, + r'videoId\s*:\s*["\'](%s)["\']' % self._ID_REGEX], webpage, 'vpid', default=None) if programme_id: @@ -816,7 +818,7 @@ class BBCIE(BBCCoUkIE): # Multiple video article (e.g. # http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460) - EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+[\da-z]{8}(?:\b[^"]+)?' + EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+%s(?:\b[^"]+)?' % self._ID_REGEX entries = [] for match in extract_all(r'new\s+SMP\(({.+?})\)'): embed_url = match.get('playerSettings', {}).get('externalEmbedUrl') diff --git a/youtube_dl/extractor/beeg.py b/youtube_dl/extractor/beeg.py index 61bc2f744..e63c2ac00 100644 --- a/youtube_dl/extractor/beeg.py +++ b/youtube_dl/extractor/beeg.py @@ -1,6 +1,11 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..compat import ( + compat_chr, + compat_ord, + compat_urllib_parse_unquote, +) from ..utils import ( int_or_none, parse_iso8601, @@ -29,7 +34,24 @@ class BeegIE(InfoExtractor): video_id = self._match_id(url) video = self._download_json( - 'http://beeg.com/api/v1/video/%s' % video_id, video_id) + 'http://beeg.com/api/v3/video/%s' % video_id, video_id) + + def decrypt_key(key): + # Reverse engineered from http://static.beeg.com/cpl/1067.js + a = '8RPUUCS35ZWp3ADnKcSmpH71ZusrROo' + e = compat_urllib_parse_unquote(key) + return ''.join([ + compat_chr(compat_ord(e[n]) - compat_ord(a[n % len(a)]) % 25) + for n in range(len(e))]) + + def decrypt_url(encrypted_url): + encrypted_url = self._proto_relative_url( + encrypted_url.replace('{DATA_MARKERS}', ''), 'http:') + key = self._search_regex( + r'/key=(.*?)%2Cend=', encrypted_url, 'key', default=None) + if not key: + return encrypted_url + return encrypted_url.replace(key, decrypt_key(key)) formats = [] for format_id, video_url in video.items(): @@ -40,7 +62,7 @@ class BeegIE(InfoExtractor): if not height: continue formats.append({ - 'url': self._proto_relative_url(video_url.replace('{DATA_MARKERS}', ''), 'http:'), + 'url': decrypt_url(video_url), 'format_id': format_id, 'height': int(height), }) diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py index 6c66a1236..1c3644587 100644 --- a/youtube_dl/extractor/bilibili.py +++ b/youtube_dl/extractor/bilibili.py @@ -2,7 +2,6 @@ from __future__ import unicode_literals import re -import itertools import json from .common import InfoExtractor @@ -11,134 +10,108 @@ from ..compat import ( ) from ..utils import ( int_or_none, - unified_strdate, + unescapeHTML, ExtractorError, ) class BiliBiliIE(InfoExtractor): - _VALID_URL = r'http://www\.bilibili\.(?:tv|com)/video/av(?P[0-9]+)/' + _VALID_URL = r'http://www\.bilibili\.(?:tv|com)/video/av(?P\d+)(?:/index_(?P\d+).html)?' _TESTS = [{ 'url': 'http://www.bilibili.tv/video/av1074402/', 'md5': '2c301e4dab317596e837c3e7633e7d86', 'info_dict': { - 'id': '1074402_part1', + 'id': '1554319', 'ext': 'flv', 'title': '【金坷垃】金泡沫', - 'duration': 308, + 'duration': 308313, 'upload_date': '20140420', 'thumbnail': 're:^https?://.+\.jpg', + 'description': 'md5:ce18c2a2d2193f0df2917d270f2e5923', + 'timestamp': 1397983878, + 'uploader': '菊子桑', }, }, { 'url': 'http://www.bilibili.com/video/av1041170/', 'info_dict': { 'id': '1041170', 'title': '【BD1080P】刀语【诸神&异域】', + 'description': '这是个神奇的故事~每个人不留弹幕不给走哦~切利哦!~', + 'uploader': '枫叶逝去', + 'timestamp': 1396501299, }, 'playlist_count': 9, }] def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - if '(此视频不存在或被删除)' in webpage: - raise ExtractorError( - 'The video does not exist or was deleted', expected=True) - - if '>你没有权限浏览! 由于版权相关问题 我们不对您所在的地区提供服务<' in webpage: - raise ExtractorError( - 'The video is not available in your region due to copyright reasons', - expected=True) - - video_code = self._search_regex( - r'(?s)
(.*?)
', webpage, 'video code') - - title = self._html_search_meta( - 'media:title', video_code, 'title', fatal=True) - duration_str = self._html_search_meta( - 'duration', video_code, 'duration') - if duration_str is None: - duration = None - else: - duration_mobj = re.match( - r'^T(?:(?P[0-9]+)H)?(?P[0-9]+)M(?P[0-9]+)S$', - duration_str) - duration = ( - int_or_none(duration_mobj.group('hours'), default=0) * 3600 + - int(duration_mobj.group('minutes')) * 60 + - int(duration_mobj.group('seconds'))) - upload_date = unified_strdate(self._html_search_meta( - 'uploadDate', video_code, fatal=False)) - thumbnail = self._html_search_meta( - 'thumbnailUrl', video_code, 'thumbnail', fatal=False) - - cid = self._search_regex(r'cid=(\d+)', webpage, 'cid') - - entries = [] - - lq_page = self._download_webpage( - 'http://interface.bilibili.com/v_cdn_play?appkey=1&cid=%s' % cid, - video_id, - note='Downloading LQ video info' + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + page_num = mobj.group('page_num') or '1' + + view_data = self._download_json( + 'http://api.bilibili.com/view?type=json&appkey=8e9fc618fbd41e28&id=%s&page=%s' % (video_id, page_num), + video_id) + if 'error' in view_data: + raise ExtractorError('%s said: %s' % (self.IE_NAME, view_data['error']), expected=True) + + cid = view_data['cid'] + title = unescapeHTML(view_data['title']) + + page = self._download_webpage( + 'http://interface.bilibili.com/v_cdn_play?appkey=8e9fc618fbd41e28&cid=%s' % cid, + cid, + 'Downloading page %s/%s' % (page_num, view_data['pages']) ) try: - err_info = json.loads(lq_page) + err_info = json.loads(page) raise ExtractorError( 'BiliBili said: ' + err_info['error_text'], expected=True) except ValueError: pass - lq_doc = compat_etree_fromstring(lq_page) - lq_durls = lq_doc.findall('./durl') + doc = compat_etree_fromstring(page) - hq_doc = self._download_xml( - 'http://interface.bilibili.com/playurl?appkey=1&cid=%s' % cid, - video_id, - note='Downloading HQ video info', - fatal=False, - ) - if hq_doc is not False: - hq_durls = hq_doc.findall('./durl') - assert len(lq_durls) == len(hq_durls) - else: - hq_durls = itertools.repeat(None) + entries = [] - i = 1 - for lq_durl, hq_durl in zip(lq_durls, hq_durls): + for durl in doc.findall('./durl'): + size = durl.find('./filesize|./size') formats = [{ - 'format_id': 'lq', - 'quality': 1, - 'url': lq_durl.find('./url').text, - 'filesize': int_or_none( - lq_durl.find('./size'), get_attr='text'), + 'url': durl.find('./url').text, + 'filesize': int_or_none(size.text) if size else None, + 'ext': 'flv', }] - if hq_durl is not None: - formats.append({ - 'format_id': 'hq', - 'quality': 2, - 'ext': 'flv', - 'url': hq_durl.find('./url').text, - 'filesize': int_or_none( - hq_durl.find('./size'), get_attr='text'), - }) - self._sort_formats(formats) + backup_urls = durl.find('./backup_url') + if backup_urls is not None: + for backup_url in backup_urls.findall('./url'): + formats.append({'url': backup_url.text}) + formats.reverse() entries.append({ - 'id': '%s_part%d' % (video_id, i), + 'id': '%s_part%s' % (cid, durl.find('./order').text), 'title': title, + 'duration': int_or_none(durl.find('./length').text) // 1000, 'formats': formats, - 'duration': duration, - 'upload_date': upload_date, - 'thumbnail': thumbnail, }) - i += 1 - - return { - '_type': 'multi_video', - 'entries': entries, - 'id': video_id, - 'title': title + info = { + 'id': str(cid), + 'title': title, + 'description': view_data.get('description'), + 'thumbnail': view_data.get('pic'), + 'uploader': view_data.get('author'), + 'timestamp': int_or_none(view_data.get('created')), + 'view_count': view_data.get('play'), + 'duration': int_or_none(doc.find('./timelength').text), } + + if len(entries) == 1: + entries[0].update(info) + return entries[0] + else: + info.update({ + '_type': 'multi_video', + 'id': video_id, + 'entries': entries, + }) + return info diff --git a/youtube_dl/extractor/bloomberg.py b/youtube_dl/extractor/bloomberg.py index 11ace91dd..ebeef8f2a 100644 --- a/youtube_dl/extractor/bloomberg.py +++ b/youtube_dl/extractor/bloomberg.py @@ -6,7 +6,7 @@ from .common import InfoExtractor class BloombergIE(InfoExtractor): - _VALID_URL = r'https?://www\.bloomberg\.com/news/[^/]+/[^/]+/(?P[^/?#]+)' + _VALID_URL = r'https?://(?:www\.)?bloomberg\.com/(?:[^/]+/)*(?P[^/?#]+)' _TESTS = [{ 'url': 'http://www.bloomberg.com/news/videos/b/aaeae121-5949-481e-a1ce-4562db6f5df2', @@ -20,22 +20,36 @@ class BloombergIE(InfoExtractor): }, { 'url': 'http://www.bloomberg.com/news/articles/2015-11-12/five-strange-things-that-have-been-happening-in-financial-markets', 'only_matching': True, + }, { + 'url': 'http://www.bloomberg.com/politics/videos/2015-11-25/karl-rove-on-jeb-bush-s-struggles-stopping-trump', + 'only_matching': True, }] def _real_extract(self, url): name = self._match_id(url) webpage = self._download_webpage(url, name) - video_id = self._search_regex(r'"bmmrId":"(.+?)"', webpage, 'id') + video_id = self._search_regex( + r'["\']bmmrId["\']\s*:\s*(["\'])(?P.+?)\1', + webpage, 'id', group='url') title = re.sub(': Video$', '', self._og_search_title(webpage)) embed_info = self._download_json( 'http://www.bloomberg.com/api/embed?id=%s' % video_id, video_id) formats = [] for stream in embed_info['streams']: - if stream["muxing_format"] == "TS": - formats.extend(self._extract_m3u8_formats(stream['url'], video_id)) + stream_url = stream.get('url') + if not stream_url: + continue + if stream['muxing_format'] == 'TS': + m3u8_formats = self._extract_m3u8_formats( + stream_url, video_id, 'mp4', m3u8_id='hls', fatal=False) + if m3u8_formats: + formats.extend(m3u8_formats) else: - formats.extend(self._extract_f4m_formats(stream['url'], video_id)) + f4m_formats = self._extract_f4m_formats( + stream_url, video_id, f4m_id='hds', fatal=False) + if f4m_formats: + formats.extend(f4m_formats) self._sort_formats(formats) return { diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index eb9bfa3d1..6ab2d68d6 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -167,7 +167,7 @@ class InfoExtractor(object): "ext" will be calculated from URL if missing automatic_captions: Like 'subtitles', used by the YoutubeIE for automatically generated captions - duration: Length of the video in seconds, as an integer. + duration: Length of the video in seconds, as an integer or float. view_count: How many users have watched the video on the platform. like_count: Number of positive ratings of the video dislike_count: Number of negative ratings of the video diff --git a/youtube_dl/extractor/cspan.py b/youtube_dl/extractor/cspan.py index fbefd37d0..7b685d157 100644 --- a/youtube_dl/extractor/cspan.py +++ b/youtube_dl/extractor/cspan.py @@ -9,6 +9,7 @@ from ..utils import ( find_xpath_attr, smuggle_url, determine_ext, + ExtractorError, ) from .senateisvp import SenateISVPIE @@ -18,33 +19,32 @@ class CSpanIE(InfoExtractor): IE_DESC = 'C-SPAN' _TESTS = [{ 'url': 'http://www.c-span.org/video/?313572-1/HolderonV', - 'md5': '8e44ce11f0f725527daccc453f553eb0', + 'md5': '94b29a4f131ff03d23471dd6f60b6a1d', 'info_dict': { 'id': '315139', 'ext': 'mp4', 'title': 'Attorney General Eric Holder on Voting Rights Act Decision', - 'description': 'Attorney General Eric Holder spoke to reporters following the Supreme Court decision in Shelby County v. Holder in which the court ruled that the preclearance provisions of the Voting Rights Act could not be enforced until Congress established new guidelines for review.', + 'description': 'Attorney General Eric Holder speaks to reporters following the Supreme Court decision in [Shelby County v. Holder], in which the court ruled that the preclearance provisions of the Voting Rights Act could not be enforced.', }, 'skip': 'Regularly fails on travis, for unknown reasons', }, { 'url': 'http://www.c-span.org/video/?c4486943/cspan-international-health-care-models', - # For whatever reason, the served video alternates between - # two different ones + 'md5': '8e5fbfabe6ad0f89f3012a7943c1287b', 'info_dict': { - 'id': '340723', + 'id': 'c4486943', 'ext': 'mp4', - 'title': 'International Health Care Models', + 'title': 'CSPAN - International Health Care Models', 'description': 'md5:7a985a2d595dba00af3d9c9f0783c967', } }, { 'url': 'http://www.c-span.org/video/?318608-1/gm-ignition-switch-recall', - 'md5': '446562a736c6bf97118e389433ed88d4', + 'md5': '2ae5051559169baadba13fc35345ae74', 'info_dict': { 'id': '342759', 'ext': 'mp4', 'title': 'General Motors Ignition Switch Recall', 'duration': 14848, - 'description': 'md5:70c7c3b8fa63fa60d42772440596034c' + 'description': 'md5:118081aedd24bf1d3b68b3803344e7f3' }, }, { # Video from senate.gov @@ -57,67 +57,77 @@ class CSpanIE(InfoExtractor): }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - page_id = mobj.group('id') - webpage = self._download_webpage(url, page_id) - video_id = self._search_regex(r'progid=\'?([0-9]+)\'?>', webpage, 'video id') + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + matches = re.search(r'data-(prog|clip)id=\'([0-9]+)\'', webpage) + if matches: + video_type, video_id = matches.groups() + if video_type == 'prog': + video_type = 'program' + else: + senate_isvp_url = SenateISVPIE._search_iframe_url(webpage) + if senate_isvp_url: + title = self._og_search_title(webpage) + surl = smuggle_url(senate_isvp_url, {'force_title': title}) + return self.url_result(surl, 'SenateISVP', video_id, title) - description = self._html_search_regex( - [ - # The full description - r'
(.*?)(.*?)

' - ], - webpage, 'description', flags=re.DOTALL, default=None) + def get_text_attr(d, attr): + return d.get(attr, {}).get('#text') - info_url = 'http://c-spanvideo.org/videoLibrary/assets/player/ajax-player.php?os=android&html5=program&id=' + video_id - data = self._download_json(info_url, video_id) + data = self._download_json( + 'http://www.c-span.org/assets/player/ajax-player.php?os=android&html5=%s&id=%s' % (video_type, video_id), + video_id)['video'] + if data['@status'] != 'Success': + raise ExtractorError('%s said: %s' % (self.IE_NAME, get_text_attr(data, 'error')), expected=True) doc = self._download_xml( - 'http://www.c-span.org/common/services/flashXml.php?programid=' + video_id, + 'http://www.c-span.org/common/services/flashXml.php?%sid=%s' % (video_type, video_id), video_id) + description = self._html_search_meta('description', webpage) + title = find_xpath_attr(doc, './/string', 'name', 'title').text thumbnail = find_xpath_attr(doc, './/string', 'name', 'poster').text - senate_isvp_url = SenateISVPIE._search_iframe_url(webpage) - if senate_isvp_url: - surl = smuggle_url(senate_isvp_url, {'force_title': title}) - return self.url_result(surl, 'SenateISVP', video_id, title) - - files = data['video']['files'] - try: - capfile = data['video']['capfile']['#text'] - except KeyError: - capfile = None + files = data['files'] + capfile = get_text_attr(data, 'capfile') - entries = [{ - 'id': '%s_%d' % (video_id, partnum + 1), - 'title': ( - title if len(files) == 1 else - '%s part %d' % (title, partnum + 1)), - 'url': unescapeHTML(f['path']['#text']), - 'description': description, - 'thumbnail': thumbnail, - 'duration': int_or_none(f.get('length', {}).get('#text')), - 'subtitles': { - 'en': [{ - 'url': capfile, - 'ext': determine_ext(capfile, 'dfxp') - }], - } if capfile else None, - } for partnum, f in enumerate(files)] + entries = [] + for partnum, f in enumerate(files): + formats = [] + for quality in f['qualities']: + formats.append({ + 'format_id': '%s-%sp' % (get_text_attr(quality, 'bitrate'), get_text_attr(quality, 'height')), + 'url': unescapeHTML(get_text_attr(quality, 'file')), + 'height': int_or_none(get_text_attr(quality, 'height')), + 'tbr': int_or_none(get_text_attr(quality, 'bitrate')), + }) + self._sort_formats(formats) + entries.append({ + 'id': '%s_%d' % (video_id, partnum + 1), + 'title': ( + title if len(files) == 1 else + '%s part %d' % (title, partnum + 1)), + 'formats': formats, + 'description': description, + 'thumbnail': thumbnail, + 'duration': int_or_none(get_text_attr(f, 'length')), + 'subtitles': { + 'en': [{ + 'url': capfile, + 'ext': determine_ext(capfile, 'dfxp') + }], + } if capfile else None, + }) if len(entries) == 1: entry = dict(entries[0]) - entry['id'] = video_id + entry['id'] = 'c' + video_id if video_type == 'clip' else video_id return entry else: return { '_type': 'playlist', 'entries': entries, 'title': title, - 'id': video_id, + 'id': 'c' + video_id if video_type == 'clip' else video_id, } diff --git a/youtube_dl/extractor/dbtv.py b/youtube_dl/extractor/dbtv.py index 212217625..133cdc50b 100644 --- a/youtube_dl/extractor/dbtv.py +++ b/youtube_dl/extractor/dbtv.py @@ -13,8 +13,8 @@ from ..utils import ( class DBTVIE(InfoExtractor): - _VALID_URL = r'http://dbtv\.no/(?P[0-9]+)#(?P.+)' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?dbtv\.no/(?:(?:lazyplayer|player)/)?(?P[0-9]+)(?:#(?P.+))?' + _TESTS = [{ 'url': 'http://dbtv.no/3649835190001#Skulle_teste_ut_fornøyelsespark,_men_kollegaen_var_bare_opptatt_av_bikinikroppen', 'md5': 'b89953ed25dacb6edb3ef6c6f430f8bc', 'info_dict': { @@ -30,12 +30,18 @@ class DBTVIE(InfoExtractor): 'view_count': int, 'categories': list, } - } + }, { + 'url': 'http://dbtv.no/3649835190001', + 'only_matching': True, + }, { + 'url': 'http://www.dbtv.no/lazyplayer/4631135248001', + 'only_matching': True, + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') - display_id = mobj.group('display_id') + display_id = mobj.group('display_id') or video_id data = self._download_json( 'http://api.dbtv.no/discovery/%s' % video_id, display_id) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index fd854411b..321eec59e 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -164,7 +164,7 @@ class FacebookIE(InfoExtractor): if not video_title: video_title = self._html_search_regex( r'(?s)(.*?)', - webpage, 'alternative title', fatal=False) + webpage, 'alternative title', default=None) video_title = limit_length(video_title, 80) if not video_title: video_title = 'Facebook video #%s' % video_id diff --git a/youtube_dl/extractor/gametrailers.py b/youtube_dl/extractor/gametrailers.py index a6ab795ae..c3f031d9c 100644 --- a/youtube_dl/extractor/gametrailers.py +++ b/youtube_dl/extractor/gametrailers.py @@ -1,19 +1,62 @@ from __future__ import unicode_literals -from .mtv import MTVServicesInfoExtractor +from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_age_limit, + url_basename, +) -class GametrailersIE(MTVServicesInfoExtractor): - _VALID_URL = r'http://www\.gametrailers\.com/(?Pvideos|reviews|full-episodes)/(?P.*?)/(?P.*)' +class GametrailersIE(InfoExtractor): + _VALID_URL = r'http://www\.gametrailers\.com/videos/view/[^/]+/(?P<id>.+)' + _TEST = { - 'url': 'http://www.gametrailers.com/videos/zbvr8i/mirror-s-edge-2-e3-2013--debut-trailer', - 'md5': '4c8e67681a0ea7ec241e8c09b3ea8cf7', + 'url': 'http://www.gametrailers.com/videos/view/gametrailers-com/116437-Just-Cause-3-Review', + 'md5': 'f28c4efa0bdfaf9b760f6507955b6a6a', 'info_dict': { - 'id': '70e9a5d7-cf25-4a10-9104-6f3e7342ae0d', + 'id': '2983958', 'ext': 'mp4', - 'title': 'E3 2013: Debut Trailer', - 'description': 'Faith is back! Check out the World Premiere trailer for Mirror\'s Edge 2 straight from the EA Press Conference at E3 2013!', + 'display_id': '116437-Just-Cause-3-Review', + 'title': 'Just Cause 3 - Review', + 'description': 'It\'s a lot of fun to shoot at things and then watch them explode in Just Cause 3, but should there be more to the experience than that?', }, } - _FEED_URL = 'http://www.gametrailers.com/feeds/mrss' + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + title = self._html_search_regex( + r'<title>(.+?)\|', webpage, 'title').strip() + embed_url = self._proto_relative_url( + self._search_regex( + r'src=\'(//embed.gametrailers.com/embed/[^\']+)\'', webpage, + 'embed url'), + scheme='http:') + video_id = url_basename(embed_url) + embed_page = self._download_webpage(embed_url, video_id) + embed_vars_json = self._search_regex( + r'(?s)var embedVars = (\{.*?\})\s*</script>', embed_page, + 'embed vars') + info = self._parse_json(embed_vars_json, video_id) + + formats = [] + for media in info['media']: + if media['mediaPurpose'] == 'play': + formats.append({ + 'url': media['uri'], + 'height': media['height'], + 'width:': media['width'], + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'formats': formats, + 'thumbnail': info.get('thumbUri'), + 'description': self._og_search_description(webpage), + 'duration': int_or_none(info.get('videoLengthInSeconds')), + 'age_limit': parse_age_limit(info.get('audienceRating')), + } diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 8ac38a174..6ff13050d 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -6,6 +6,7 @@ import re from .common import InfoExtractor from ..compat import compat_urlparse from ..utils import ( + determine_ext, ExtractorError, float_or_none, parse_duration, @@ -48,12 +49,22 @@ class NRKIE(InfoExtractor): 'http://v8.psapi.nrk.no/mediaelement/%s' % video_id, video_id, 'Downloading media JSON') - if data['usageRights']['isGeoBlocked']: - raise ExtractorError( - 'NRK har ikke rettigheter til å vise dette programmet utenfor Norge', - expected=True) + media_url = data.get('mediaUrl') - video_url = data['mediaUrl'] + '?hdcore=3.5.0&plugin=aasp-3.5.0.151.81' + if not media_url: + if data['usageRights']['isGeoBlocked']: + raise ExtractorError( + 'NRK har ikke rettigheter til å vise dette programmet utenfor Norge', + expected=True) + + if determine_ext(media_url) == 'f4m': + formats = self._extract_f4m_formats( + media_url + '?hdcore=3.5.0&plugin=aasp-3.5.0.151.81', video_id, f4m_id='hds') + else: + formats = [{ + 'url': media_url, + 'ext': 'flv', + }] duration = parse_duration(data.get('duration')) @@ -67,12 +78,11 @@ class NRKIE(InfoExtractor): return { 'id': video_id, - 'url': video_url, - 'ext': 'flv', 'title': data['title'], 'description': data['description'], 'duration': duration, 'thumbnail': thumbnail, + 'formats': formats, } diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 965940a4b..08275687d 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -147,7 +147,8 @@ class PornHubPlaylistIE(InfoExtractor): entries = [ self.url_result('http://www.pornhub.com/%s' % video_url, 'PornHub') - for video_url in set(re.findall('href="/?(view_video\.php\?viewkey=\d+[^"]*)"', webpage)) + for video_url in set(re.findall( + r'href="/?(view_video\.php\?.*\bviewkey=[\da-z]+[^"]*)"', webpage)) ] playlist = self._parse_json( diff --git a/youtube_dl/extractor/skynewsarabia.py b/youtube_dl/extractor/skynewsarabia.py new file mode 100644 index 000000000..f09fee102 --- /dev/null +++ b/youtube_dl/extractor/skynewsarabia.py @@ -0,0 +1,117 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + parse_iso8601, + parse_duration, +) + + +class SkyNewArabiaBaseIE(InfoExtractor): + _IMAGE_BASE_URL = 'http://www.skynewsarabia.com/web/images' + + def _call_api(self, path, value): + return self._download_json('http://api.skynewsarabia.com/web/rest/v2/%s/%s.json' % (path, value), value) + + def _get_limelight_media_id(self, url): + return self._search_regex(r'/media/[^/]+/([a-z0-9]{32})', url, 'limelight media id') + + def _get_image_url(self, image_path_template, width='1600', height='1200'): + return self._IMAGE_BASE_URL + image_path_template.format(width=width, height=height) + + def _extract_video_info(self, video_data): + video_id = compat_str(video_data['id']) + topic = video_data.get('topicTitle') + return { + '_type': 'url_transparent', + 'url': 'limelight:media:%s' % self._get_limelight_media_id(video_data['videoUrl'][0]['url']), + 'id': video_id, + 'title': video_data['headline'], + 'description': video_data.get('summary'), + 'thumbnail': self._get_image_url(video_data['mediaAsset']['imageUrl']), + 'timestamp': parse_iso8601(video_data.get('date')), + 'duration': parse_duration(video_data.get('runTime')), + 'tags': video_data.get('tags', []), + 'categories': [topic] if topic else [], + 'webpage_url': 'http://www.skynewsarabia.com/web/video/%s' % video_id, + 'ie_key': 'LimelightMedia', + } + + +class SkyNewsArabiaIE(SkyNewArabiaBaseIE): + IE_NAME = 'skynewsarabia:video' + _VALID_URL = r'https?://(?:www\.)?skynewsarabia\.com/web/video/(?P<id>[0-9]+)' + _TEST = { + 'url': 'http://www.skynewsarabia.com/web/video/794902/%D9%86%D8%B5%D9%81-%D9%85%D9%84%D9%8A%D9%88%D9%86-%D9%85%D8%B5%D8%A8%D8%A7%D8%AD-%D8%B4%D8%AC%D8%B1%D8%A9-%D9%83%D8%B1%D9%8A%D8%B3%D9%85%D8%A7%D8%B3', + 'info_dict': { + 'id': '794902', + 'ext': 'flv', + 'title': 'نصف مليون مصباح على شجرة كريسماس', + 'description': 'md5:22f1b27f0850eeb10c7e59b1f16eb7c6', + 'upload_date': '20151128', + 'timestamp': 1448697198, + 'duration': 2119, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + video_data = self._call_api('video', video_id) + return self._extract_video_info(video_data) + + +class SkyNewsArabiaArticleIE(SkyNewArabiaBaseIE): + IE_NAME = 'skynewsarabia:video' + _VALID_URL = r'https?://(?:www\.)?skynewsarabia\.com/web/article/(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'http://www.skynewsarabia.com/web/article/794549/%D8%A7%D9%94%D8%AD%D8%AF%D8%A7%D8%AB-%D8%A7%D9%84%D8%B4%D8%B1%D9%82-%D8%A7%D9%84%D8%A7%D9%94%D9%88%D8%B3%D8%B7-%D8%AE%D8%B1%D9%8A%D8%B7%D8%A9-%D8%A7%D9%84%D8%A7%D9%94%D9%84%D8%B9%D8%A7%D8%A8-%D8%A7%D9%84%D8%B0%D9%83%D9%8A%D8%A9', + 'info_dict': { + 'id': '794549', + 'ext': 'flv', + 'title': 'بالفيديو.. ألعاب ذكية تحاكي واقع المنطقة', + 'description': 'md5:0c373d29919a851e080ee4edd0c5d97f', + 'upload_date': '20151126', + 'timestamp': 1448559336, + 'duration': 281.6, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + }, { + 'url': 'http://www.skynewsarabia.com/web/article/794844/%D8%A7%D8%B3%D8%AA%D9%87%D8%AF%D8%A7%D9%81-%D9%82%D9%88%D8%A7%D8%B1%D8%A8-%D8%A7%D9%94%D8%B3%D9%84%D8%AD%D8%A9-%D9%84%D9%85%D9%8A%D9%84%D9%8A%D8%B4%D9%8A%D8%A7%D8%AA-%D8%A7%D9%84%D8%AD%D9%88%D8%AB%D9%8A-%D9%88%D8%B5%D8%A7%D9%84%D8%AD', + 'info_dict': { + 'id': '794844', + 'title': 'إحباط تهريب أسلحة لميليشيات الحوثي وصالح بجنوب اليمن', + 'description': 'md5:5c927b8b2e805796e7f693538d96fc7e', + }, + 'playlist_mincount': 2, + }] + + def _real_extract(self, url): + article_id = self._match_id(url) + article_data = self._call_api('article', article_id) + media_asset = article_data['mediaAsset'] + if media_asset['type'] == 'VIDEO': + topic = article_data.get('topicTitle') + return { + '_type': 'url_transparent', + 'url': 'limelight:media:%s' % self._get_limelight_media_id(media_asset['videoUrl'][0]['url']), + 'id': article_id, + 'title': article_data['headline'], + 'description': article_data.get('summary'), + 'thumbnail': self._get_image_url(media_asset['imageUrl']), + 'timestamp': parse_iso8601(article_data.get('date')), + 'tags': article_data.get('tags', []), + 'categories': [topic] if topic else [], + 'webpage_url': url, + 'ie_key': 'LimelightMedia', + } + entries = [self._extract_video_info(item) for item in article_data.get('inlineItems', []) if item['type'] == 'VIDEO'] + return self.playlist_result(entries, article_id, article_data['headline'], article_data.get('summary')) diff --git a/youtube_dl/extractor/spiegel.py b/youtube_dl/extractor/spiegel.py index 5bd3c0087..39a7aaf9d 100644 --- a/youtube_dl/extractor/spiegel.py +++ b/youtube_dl/extractor/spiegel.py @@ -58,7 +58,8 @@ class SpiegelIE(InfoExtractor): description = self._html_search_meta('description', webpage, 'description') base_url = self._search_regex( - r'var\s+server\s*=\s*"([^"]+)\"', webpage, 'server URL') + [r'server\s*:\s*(["\'])(?P<url>.+?)\1', r'var\s+server\s*=\s*"(?P<url>[^"]+)\"'], + webpage, 'server URL', group='url') xml_url = base_url + video_id + '.xml' idoc = self._download_xml(xml_url, video_id) diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py index 825172806..59832b1ec 100644 --- a/youtube_dl/extractor/udemy.py +++ b/youtube_dl/extractor/udemy.py @@ -1,14 +1,15 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..compat import ( + compat_HTTPError, compat_urllib_parse, compat_urllib_request, ) from ..utils import ( ExtractorError, + float_or_none, + int_or_none, sanitized_Request, ) @@ -18,6 +19,8 @@ class UdemyIE(InfoExtractor): _VALID_URL = r'https?://www\.udemy\.com/(?:[^#]+#/lecture/|lecture/view/?\?lectureId=)(?P<id>\d+)' _LOGIN_URL = 'https://www.udemy.com/join/login-popup/?displayType=ajax&showSkipButton=1' _ORIGIN_URL = 'https://www.udemy.com' + _SUCCESSFULLY_ENROLLED = '>You have enrolled in this course!<' + _ALREADY_ENROLLED = '>You are already taking this course.<' _NETRC_MACHINE = 'udemy' _TESTS = [{ @@ -33,6 +36,29 @@ class UdemyIE(InfoExtractor): 'skip': 'Requires udemy account credentials', }] + def _enroll_course(self, webpage, course_id): + enroll_url = self._search_regex( + r'href=(["\'])(?P<url>https?://(?:www\.)?udemy\.com/course/subscribe/.+?)\1', + webpage, 'enroll url', group='url', + default='https://www.udemy.com/course/subscribe/?courseId=%s' % course_id) + webpage = self._download_webpage(enroll_url, course_id, 'Enrolling in the course') + if self._SUCCESSFULLY_ENROLLED in webpage: + self.to_screen('%s: Successfully enrolled in' % course_id) + elif self._ALREADY_ENROLLED in webpage: + self.to_screen('%s: Already enrolled in' % course_id) + + def _download_lecture(self, course_id, lecture_id): + return self._download_json( + 'https://www.udemy.com/api-2.0/users/me/subscribed-courses/%s/lectures/%s?%s' % ( + course_id, lecture_id, compat_urllib_parse.urlencode({ + 'video_only': '', + 'auto_play': '', + 'fields[lecture]': 'title,description,asset', + 'fields[asset]': 'asset_type,stream_url,thumbnail_url,download_urls,data', + 'instructorPreviewMode': 'False', + })), + lecture_id, 'Downloading lecture JSON') + def _handle_error(self, response): if not isinstance(response, dict): return @@ -54,6 +80,7 @@ class UdemyIE(InfoExtractor): headers['X-Udemy-Client-Id'] = cookie.value elif cookie.name == 'access_token': headers['X-Udemy-Bearer-Token'] = cookie.value + headers['X-Udemy-Authorization'] = 'Bearer %s' % cookie.value if isinstance(url_or_request, compat_urllib_request.Request): for header, value in headers.items(): @@ -71,7 +98,7 @@ class UdemyIE(InfoExtractor): def _login(self): (username, password) = self._get_login_info() if username is None: - self.raise_login_required('Udemy account is required') + return login_popup = self._download_webpage( self._LOGIN_URL, None, 'Downloading login popup') @@ -109,44 +136,76 @@ class UdemyIE(InfoExtractor): def _real_extract(self, url): lecture_id = self._match_id(url) - lecture = self._download_json( - 'https://www.udemy.com/api-1.1/lectures/%s' % lecture_id, - lecture_id, 'Downloading lecture JSON') + webpage = self._download_webpage(url, lecture_id) + + course_id = self._search_regex( + r'data-course-id=["\'](\d+)', webpage, 'course id') + + try: + lecture = self._download_lecture(course_id, lecture_id) + except ExtractorError as e: + # Error could possibly mean we are not enrolled in the course + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + self._enroll_course(webpage, course_id) + lecture_id = self._download_lecture(course_id, lecture_id) + else: + raise + + title = lecture['title'] + description = lecture.get('description') - asset_type = lecture.get('assetType') or lecture.get('asset_type') + asset = lecture['asset'] + + asset_type = asset.get('assetType') or asset.get('asset_type') if asset_type != 'Video': raise ExtractorError( 'Lecture %s is not a video' % lecture_id, expected=True) - asset = lecture['asset'] - stream_url = asset.get('streamUrl') or asset.get('stream_url') - mobj = re.search(r'(https?://www\.youtube\.com/watch\?v=.*)', stream_url) - if mobj: - return self.url_result(mobj.group(1), 'Youtube') + if stream_url: + youtube_url = self._search_regex( + r'(https?://www\.youtube\.com/watch\?v=.*)', stream_url, 'youtube URL', default=None) + if youtube_url: + return self.url_result(youtube_url, 'Youtube') video_id = asset['id'] thumbnail = asset.get('thumbnailUrl') or asset.get('thumbnail_url') - duration = asset['data']['duration'] - - download_url = asset.get('downloadUrl') or asset.get('download_url') - - video = download_url.get('Video') or download_url.get('video') - video_480p = download_url.get('Video480p') or download_url.get('video_480p') - - formats = [ - { - 'url': video_480p[0], - 'format_id': '360p', - }, - { - 'url': video[0], - 'format_id': '720p', - }, - ] - - title = lecture['title'] - description = lecture['description'] + duration = float_or_none(asset.get('data', {}).get('duration')) + outputs = asset.get('data', {}).get('outputs', {}) + + formats = [] + for format_ in asset.get('download_urls', {}).get('Video', []): + video_url = format_.get('file') + if not video_url: + continue + format_id = format_.get('label') + f = { + 'url': format_['file'], + 'height': int_or_none(format_id), + } + if format_id: + # Some videos contain additional metadata (e.g. + # https://www.udemy.com/ios9-swift/learn/#/lecture/3383208) + output = outputs.get(format_id) + if isinstance(output, dict): + f.update({ + 'format_id': '%sp' % (output.get('label') or format_id), + 'width': int_or_none(output.get('width')), + 'height': int_or_none(output.get('height')), + 'vbr': int_or_none(output.get('video_bitrate_in_kbps')), + 'vcodec': output.get('video_codec'), + 'fps': int_or_none(output.get('frame_rate')), + 'abr': int_or_none(output.get('audio_bitrate_in_kbps')), + 'acodec': output.get('audio_codec'), + 'asr': int_or_none(output.get('audio_sample_rate')), + 'tbr': int_or_none(output.get('total_bitrate_in_kbps')), + 'filesize': int_or_none(output.get('file_size_in_bytes')), + }) + else: + f['format_id'] = '%sp' % format_id + formats.append(f) + + self._sort_formats(formats) return { 'id': video_id, @@ -160,9 +219,7 @@ class UdemyIE(InfoExtractor): class UdemyCourseIE(UdemyIE): IE_NAME = 'udemy:course' - _VALID_URL = r'https?://www\.udemy\.com/(?P<coursepath>[\da-z-]+)' - _SUCCESSFULLY_ENROLLED = '>You have enrolled in this course!<' - _ALREADY_ENROLLED = '>You are already taking this course.<' + _VALID_URL = r'https?://www\.udemy\.com/(?P<id>[\da-z-]+)' _TESTS = [] @classmethod @@ -170,24 +227,18 @@ class UdemyCourseIE(UdemyIE): return False if UdemyIE.suitable(url) else super(UdemyCourseIE, cls).suitable(url) def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - course_path = mobj.group('coursepath') + course_path = self._match_id(url) + + webpage = self._download_webpage(url, course_path) response = self._download_json( 'https://www.udemy.com/api-1.1/courses/%s' % course_path, course_path, 'Downloading course JSON') - course_id = int(response['id']) - course_title = response['title'] + course_id = response['id'] + course_title = response.get('title') - webpage = self._download_webpage( - 'https://www.udemy.com/course/subscribe/?courseId=%s' % course_id, - course_id, 'Enrolling in the course') - - if self._SUCCESSFULLY_ENROLLED in webpage: - self.to_screen('%s: Successfully enrolled in' % course_id) - elif self._ALREADY_ENROLLED in webpage: - self.to_screen('%s: Already enrolled in' % course_id) + self._enroll_course(webpage, course_id) response = self._download_json( 'https://www.udemy.com/api-1.1/courses/%s/curriculum' % course_id, diff --git a/youtube_dl/extractor/vodlocker.py b/youtube_dl/extractor/vodlocker.py index be0a2780f..357594a11 100644 --- a/youtube_dl/extractor/vodlocker.py +++ b/youtube_dl/extractor/vodlocker.py @@ -3,11 +3,14 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..compat import compat_urllib_parse -from ..utils import sanitized_Request +from ..utils import ( + ExtractorError, + sanitized_Request, +) class VodlockerIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?vodlocker\.com/(?P<id>[0-9a-zA-Z]+)(?:\..*?)?' + _VALID_URL = r'https?://(?:www\.)?vodlocker\.com/(?:embed-)?(?P<id>[0-9a-zA-Z]+)(?:\..*?)?' _TESTS = [{ 'url': 'http://vodlocker.com/e8wvyzz4sl42', @@ -24,6 +27,12 @@ class VodlockerIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) + if any(p in webpage for p in ( + '>THIS FILE WAS DELETED<', + '>File Not Found<', + 'The file you were looking for could not be found, sorry for any inconvenience.<')): + raise ExtractorError('Video %s does not exist' % video_id, expected=True) + fields = self._hidden_inputs(webpage) if fields['op'] == 'download1': diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index cfe9eed55..9b39505ba 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -258,7 +258,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): |(?: # or the v= param in all its forms (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx) (?:\?|\#!?) # the params delimiter ? or # or #! - (?:.*?&)?? # any other preceding param (like /?s=tuff&v=xxxx) + (?:.*?[&;])?? # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&v=V36LpHqtcDY) v= ) )) @@ -346,6 +346,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, + # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug) '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'vp9'}, '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'vp9'}, @@ -714,6 +715,26 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'url': 'https://www.youtube.com/watch?v=Ms7iBXnlUO8', 'only_matching': True, }, + { + # Video with yt:stretch=17:0 + 'url': 'https://www.youtube.com/watch?v=Q39EVAstoRM', + 'info_dict': { + 'id': 'Q39EVAstoRM', + 'ext': 'mp4', + 'title': 'Clash Of Clans#14 Dicas De Ataque Para CV 4', + 'description': 'md5:ee18a25c350637c8faff806845bddee9', + 'upload_date': '20151107', + 'uploader_id': 'UCCr7TALkRbo3EtFzETQF1LA', + 'uploader': 'CH GAMER DROID', + }, + 'params': { + 'skip_download': True, + }, + }, + { + 'url': 'https://www.youtube.com/watch?feature=player_embedded&amp;v=V36LpHqtcDY', + 'only_matching': True, + } ] def __init__(self, *args, **kwargs): @@ -1459,6 +1480,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): manifest_url = video_info['hlsvp'][0] url_map = self._extract_from_m3u8(manifest_url, video_id) formats = _map_to_format_list(url_map) + # Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming + for a_format in formats: + a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True' else: raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info') @@ -1496,10 +1520,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor): r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">', video_webpage) if stretched_m: - ratio = float(stretched_m.group('w')) / float(stretched_m.group('h')) - for f in formats: - if f.get('vcodec') != 'none': - f['stretched_ratio'] = ratio + w = float(stretched_m.group('w')) + h = float(stretched_m.group('h')) + # yt:stretch may hold invalid ratio data (e.g. for Q39EVAstoRM ratio is 17:0). + # We will only process correct ratios. + if w > 0 and h > 0: + ratio = w / h + for f in formats: + if f.get('vcodec') != 'none': + f['stretched_ratio'] = ratio self._sort_formats(formats) @@ -1538,7 +1567,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor, YoutubePlaylistBaseInfoExtract youtube\.com/ (?: (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/videoseries) - \? (?:.*?&)*? (?:p|a|list)= + \? (?:.*?[&;])*? (?:p|a|list)= | p/ ) ( diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 079fe7e8a..c46e136bf 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -338,7 +338,7 @@ def parseOpts(overrideArguments=None): video_format.add_option( '-F', '--list-formats', action='store_true', dest='listformats', - help='List all available formats') + help='List all available formats of requested videos') video_format.add_option( '--youtube-include-dash-manifest', action='store_true', dest='youtube_include_dash_manifest', default=True, diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index d7b737e21..d0606b4bc 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -663,6 +663,16 @@ def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs): return hc +def handle_youtubedl_headers(headers): + filtered_headers = headers + + if 'Youtubedl-no-compression' in filtered_headers: + filtered_headers = dict((k, v) for k, v in filtered_headers.items() if k.lower() != 'accept-encoding') + del filtered_headers['Youtubedl-no-compression'] + + return filtered_headers + + class YoutubeDLHandler(compat_urllib_request.HTTPHandler): """Handler for HTTP requests and responses. @@ -670,7 +680,7 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler): the standard headers to every HTTP request and handles gzipped and deflated responses from web servers. If compression is to be avoided in a particular request, the original request in the program code only has - to include the HTTP header "Youtubedl-No-Compression", which will be + to include the HTTP header "Youtubedl-no-compression", which will be removed before making the real request. Part of this code was copied from: @@ -731,10 +741,8 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler): # The dict keys are capitalized because of this bug by urllib if h.capitalize() not in req.headers: req.add_header(h, v) - if 'Youtubedl-no-compression' in req.headers: - if 'Accept-encoding' in req.headers: - del req.headers['Accept-encoding'] - del req.headers['Youtubedl-no-compression'] + + req.headers = handle_youtubedl_headers(req.headers) if sys.version_info < (2, 7) and '#' in req.get_full_url(): # Python 2.6 is brain-dead when it comes to fragments diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 5ecff39fb..bd0de9f53 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2015.11.24' +__version__ = '2015.11.27.1'