X-Git-Url: http://git.cielonegro.org/gitweb.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fnbc.py;h=6b7da114961c2e355cddfec59a573bcbe3211b71;hb=62666af99fb55e3ba535ce630e8ce0aed1b5b0e8;hp=e67025ff6daa9b366b880a15fe167db266c00266;hpb=79ba9140dc8fcf5883b7473596e8f20cba6b479f;p=youtube-dl.git diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py index e67025ff6..6b7da1149 100644 --- a/youtube_dl/extractor/nbc.py +++ b/youtube_dl/extractor/nbc.py @@ -67,6 +67,23 @@ class NBCIE(InfoExtractor): # This video has expired but with an escaped embedURL 'url': 'http://www.nbc.com/parenthood/episode-guide/season-5/just-like-at-home/515', 'only_matching': True, + }, + { + # HLS streams requires the 'hdnea3' cookie + 'url': 'http://www.nbc.com/Kings/video/goliath/n1806', + 'info_dict': { + 'id': 'n1806', + 'ext': 'mp4', + 'title': 'Goliath', + 'description': 'When an unknown soldier saves the life of the King\'s son in battle, he\'s thrust into the limelight and politics of the kingdom.', + 'timestamp': 1237100400, + 'upload_date': '20090315', + 'uploader': 'NBCU-COM', + }, + 'params': { + 'skip_download': True, + }, + 'skip': 'Only works from US', } ] @@ -134,6 +151,9 @@ class NBCSportsIE(InfoExtractor): 'ext': 'flv', 'title': 'Tom Izzo, Michigan St. has \'so much respect\' for Duke', 'description': 'md5:ecb459c9d59e0766ac9c7d5d0eda8113', + 'uploader': 'NBCU-SPORTS', + 'upload_date': '20150330', + 'timestamp': 1427726529, } } @@ -172,7 +192,7 @@ class CSNNEIE(InfoExtractor): class NBCNewsIE(ThePlatformIE): - _VALID_URL = r'''(?x)https?://(?:www\.)?nbcnews\.com/ + _VALID_URL = r'''(?x)https?://(?:www\.)?(?:nbcnews|today)\.com/ (?:video/.+?/(?P\d+)| ([^/]+/)*(?P[^/?]+)) ''' @@ -230,10 +250,27 @@ class NBCNewsIE(ThePlatformIE): }, 'expected_warnings': ['http-6000 is not available'] }, + { + 'url': 'http://www.today.com/video/see-the-aurora-borealis-from-space-in-stunning-new-nasa-video-669831235788', + 'md5': '118d7ca3f0bea6534f119c68ef539f71', + 'info_dict': { + 'id': '669831235788', + 'ext': 'mp4', + 'title': 'See the aurora borealis from space in stunning new NASA video', + 'description': 'md5:74752b7358afb99939c5f8bb2d1d04b1', + 'upload_date': '20160420', + 'timestamp': 1461152093, + }, + }, { 'url': 'http://www.nbcnews.com/watch/dateline/full-episode--deadly-betrayal-386250819952', 'only_matching': True, }, + { + # From http://www.vulture.com/2016/06/letterman-couldnt-care-less-about-late-night.html + 'url': 'http://www.nbcnews.com/widget/video-embed/701714499682', + 'only_matching': True, + }, ] def _real_extract(self, url): @@ -257,15 +294,17 @@ class NBCNewsIE(ThePlatformIE): webpage = self._download_webpage(url, display_id) info = None bootstrap_json = self._search_regex( - r'(?m)var\s+(?:bootstrapJson|playlistData)\s*=\s*({.+});?\s*$', + [r'(?m)(?:var\s+(?:bootstrapJson|playlistData)|NEWS\.videoObj)\s*=\s*({.+});?\s*$', + r'videoObj\s*:\s*({.+})', r'data-video="([^"]+)"'], webpage, 'bootstrap json', default=None) - if bootstrap_json: - bootstrap = self._parse_json(bootstrap_json, display_id) + bootstrap = self._parse_json( + bootstrap_json, display_id, transform_source=unescapeHTML) + if 'results' in bootstrap: info = bootstrap['results'][0]['video'] + elif 'video' in bootstrap: + info = bootstrap['video'] else: - player_instance_json = self._search_regex( - r'videoObj\s*:\s*({.+})', webpage, 'player instance') - info = self._parse_json(player_instance_json, display_id) + info = bootstrap video_id = info['mpxId'] title = info['title'] @@ -295,7 +334,7 @@ class NBCNewsIE(ThePlatformIE): formats.extend(tp_formats) subtitles = self._merge_subtitles(subtitles, tp_subtitles) else: - tbr = int_or_none(video_asset.get('bitRate'), 1000) + tbr = int_or_none(video_asset.get('bitRate') or video_asset.get('bitrate'), 1000) format_id = 'http%s' % ('-%d' % tbr if tbr else '') video_url = update_url_query( video_url, {'format': 'redirect'}) @@ -321,10 +360,9 @@ class NBCNewsIE(ThePlatformIE): 'id': video_id, 'title': title, 'description': info.get('description'), - 'thumbnail': info.get('description'), 'thumbnail': info.get('thumbnail'), 'duration': int_or_none(info.get('duration')), - 'timestamp': parse_iso8601(info.get('pubDate')), + 'timestamp': parse_iso8601(info.get('pubDate') or info.get('pub_date')), 'formats': formats, 'subtitles': subtitles, }