From: Sergey M. Date: Mon, 29 Dec 2014 14:21:39 +0000 (+0600) Subject: Merge pull request #4543 from akretz/cnn_fix X-Git-Url: http://git.cielonegro.org/gitweb.cgi?a=commitdiff_plain;h=974739aab5935ddf19c005ea084c83a45c855348;hp=9532d723715f1d1a08c126ba8af6940113b88d3b;p=youtube-dl.git Merge pull request #4543 from akretz/cnn_fix [cnn] Add support for articles with videos (fixes #4541) --- diff --git a/AUTHORS b/AUTHORS index 6ea958fce..bb4d8b4d1 100644 --- a/AUTHORS +++ b/AUTHORS @@ -95,3 +95,4 @@ Adrian Kretz Mathias Rav Petr Kutalek Will Glynn +Max Reimann diff --git a/test/helper.py b/test/helper.py index 8a820526a..96d58b7c1 100644 --- a/test/helper.py +++ b/test/helper.py @@ -99,7 +99,7 @@ def gettestcases(include_onlymatching=False): md5 = lambda s: hashlib.md5(s.encode('utf-8')).hexdigest() -def expect_info_dict(self, expected_dict, got_dict): +def expect_info_dict(self, got_dict, expected_dict): for info_field, expected in expected_dict.items(): if isinstance(expected, compat_str) and expected.startswith('re:'): got = got_dict.get(info_field) diff --git a/test/test_download.py b/test/test_download.py index a009aa475..412f3dbce 100644 --- a/test/test_download.py +++ b/test/test_download.py @@ -155,7 +155,7 @@ def generator(test_case): if is_playlist: self.assertEqual(res_dict['_type'], 'playlist') self.assertTrue('entries' in res_dict) - expect_info_dict(self, test_case.get('info_dict', {}), res_dict) + expect_info_dict(self, res_dict, test_case.get('info_dict', {})) if 'playlist_mincount' in test_case: assertGreaterEqual( @@ -204,7 +204,7 @@ def generator(test_case): with io.open(info_json_fn, encoding='utf-8') as infof: info_dict = json.load(infof) - expect_info_dict(self, tc.get('info_dict', {}), info_dict) + expect_info_dict(self, info_dict, tc.get('info_dict', {})) finally: try_rm_tcs_files() if is_playlist and res_dict is not None and res_dict.get('entries'): diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index ba12e3263..ab0f76862 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -7,6 +7,7 @@ from .adobetv import AdobeTVIE from .adultswim import AdultSwimIE from .aftonbladet import AftonbladetIE from .aljazeera import AlJazeeraIE +from .alphaporno import AlphaPornoIE from .anitube import AnitubeIE from .anysex import AnySexIE from .aol import AolIE @@ -109,6 +110,7 @@ from .elpais import ElPaisIE from .empflix import EMPFlixIE from .engadget import EngadgetIE from .eporner import EpornerIE +from .eroprofile import EroProfileIE from .escapist import EscapistIE from .everyonesmixtape import EveryonesMixtapeIE from .exfm import ExfmIE @@ -407,6 +409,7 @@ from .ted import TEDIE from .telebruxelles import TeleBruxellesIE from .telecinco import TelecincoIE from .telemb import TeleMBIE +from .teletask import TeleTaskIE from .tenplay import TenPlayIE from .testurl import TestURLIE from .tf1 import TF1IE diff --git a/youtube_dl/extractor/alphaporno.py b/youtube_dl/extractor/alphaporno.py new file mode 100644 index 000000000..c34719d1f --- /dev/null +++ b/youtube_dl/extractor/alphaporno.py @@ -0,0 +1,77 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + parse_iso8601, + parse_duration, + parse_filesize, + int_or_none, +) + + +class AlphaPornoIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?alphaporno\.com/videos/(?P[^/]+)' + _TEST = { + 'url': 'http://www.alphaporno.com/videos/sensual-striptease-porn-with-samantha-alexandra/', + 'md5': 'feb6d3bba8848cd54467a87ad34bd38e', + 'info_dict': { + 'id': '258807', + 'display_id': 'sensual-striptease-porn-with-samantha-alexandra', + 'ext': 'mp4', + 'title': 'Sensual striptease porn with Samantha Alexandra', + 'thumbnail': 're:https?://.*\.jpg$', + 'timestamp': 1418694611, + 'upload_date': '20141216', + 'duration': 387, + 'filesize_approx': 54120000, + 'tbr': 1145, + 'categories': list, + 'age_limit': 18, + } + } + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + video_id = self._search_regex( + r"video_id\s*:\s*'([^']+)'", webpage, 'video id', default=None) + + video_url = self._search_regex( + r"video_url\s*:\s*'([^']+)'", webpage, 'video url') + ext = self._html_search_meta( + 'encodingFormat', webpage, 'ext', default='.mp4')[1:] + + title = self._search_regex( + [r'', + r'class="title" itemprop="name">([^<]+)<'], + webpage, 'title') + thumbnail = self._html_search_meta('thumbnail', webpage, 'thumbnail') + timestamp = parse_iso8601(self._html_search_meta( + 'uploadDate', webpage, 'upload date')) + duration = parse_duration(self._html_search_meta( + 'duration', webpage, 'duration')) + filesize_approx = parse_filesize(self._html_search_meta( + 'contentSize', webpage, 'file size')) + bitrate = int_or_none(self._html_search_meta( + 'bitrate', webpage, 'bitrate')) + categories = self._html_search_meta( + 'keywords', webpage, 'categories', default='').split(',') + + age_limit = self._rta_search(webpage) + + return { + 'id': video_id, + 'display_id': display_id, + 'url': video_url, + 'ext': ext, + 'title': title, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + 'duration': duration, + 'filesize_approx': filesize_approx, + 'tbr': bitrate, + 'categories': categories, + 'age_limit': age_limit, + } diff --git a/youtube_dl/extractor/archiveorg.py b/youtube_dl/extractor/archiveorg.py index 34ce8429b..9fc35a42b 100644 --- a/youtube_dl/extractor/archiveorg.py +++ b/youtube_dl/extractor/archiveorg.py @@ -1,42 +1,48 @@ from __future__ import unicode_literals -import json -import re - from .common import InfoExtractor -from ..utils import ( - unified_strdate, -) +from ..utils import unified_strdate class ArchiveOrgIE(InfoExtractor): IE_NAME = 'archive.org' IE_DESC = 'archive.org videos' - _VALID_URL = r'(?:https?://)?(?:www\.)?archive\.org/details/(?P[^?/]+)(?:[?].*)?$' - _TEST = { - "url": "http://archive.org/details/XD300-23_68HighlightsAResearchCntAugHumanIntellect", - 'file': 'XD300-23_68HighlightsAResearchCntAugHumanIntellect.ogv', + _VALID_URL = r'https?://(?:www\.)?archive\.org/details/(?P[^?/]+)(?:[?].*)?$' + _TESTS = [{ + 'url': 'http://archive.org/details/XD300-23_68HighlightsAResearchCntAugHumanIntellect', 'md5': '8af1d4cf447933ed3c7f4871162602db', 'info_dict': { - "title": "1968 Demo - FJCC Conference Presentation Reel #1", - "description": "Reel 1 of 3: Also known as the \"Mother of All Demos\", Doug Engelbart's presentation at the Fall Joint Computer Conference in San Francisco, December 9, 1968 titled \"A Research Center for Augmenting Human Intellect.\" For this presentation, Doug and his team astonished the audience by not only relating their research, but demonstrating it live. This was the debut of the mouse, interactive computing, hypermedia, computer supported software engineering, video teleconferencing, etc. See also Doug's 1968 Demo page for more background, highlights, links, and the detailed paper published in this conference proceedings. Filmed on 3 reels: Reel 1 | Reel 2 | Reel 3", - "upload_date": "19681210", - "uploader": "SRI International" + 'id': 'XD300-23_68HighlightsAResearchCntAugHumanIntellect', + 'ext': 'ogv', + 'title': '1968 Demo - FJCC Conference Presentation Reel #1', + 'description': 'md5:1780b464abaca9991d8968c877bb53ed', + 'upload_date': '19681210', + 'uploader': 'SRI International' + } + }, { + 'url': 'https://archive.org/details/Cops1922', + 'md5': '18f2a19e6d89af8425671da1cf3d4e04', + 'info_dict': { + 'id': 'Cops1922', + 'ext': 'ogv', + 'title': 'Buster Keaton\'s "Cops" (1922)', + 'description': 'md5:70f72ee70882f713d4578725461ffcc3', } - } + }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) json_url = url + ('?' if '?' in url else '&') + 'output=json' - json_data = self._download_webpage(json_url, video_id) - data = json.loads(json_data) + data = self._download_json(json_url, video_id) + + def get_optional(data_dict, field): + return data_dict['metadata'].get(field, [None])[0] - title = data['metadata']['title'][0] - description = data['metadata']['description'][0] - uploader = data['metadata']['creator'][0] - upload_date = unified_strdate(data['metadata']['date'][0]) + title = get_optional(data, 'title') + description = get_optional(data, 'description') + uploader = get_optional(data, 'creator') + upload_date = unified_strdate(get_optional(data, 'date')) formats = [ { diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index 219631b9b..929dd3cc5 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -37,7 +37,7 @@ class ArteTvIE(InfoExtractor): config_xml_url, video_id, note='Downloading configuration') formats = [{ - 'forma_id': q.attrib['quality'], + 'format_id': q.attrib['quality'], # The playpath starts at 'mp4:', if we don't manually # split the url, rtmpdump will incorrectly parse them 'url': q.text.split('mp4:', 1)[0], @@ -133,7 +133,7 @@ class ArteTVPlus7IE(InfoExtractor): 'width': int_or_none(f.get('width')), 'height': int_or_none(f.get('height')), 'tbr': int_or_none(f.get('bitrate')), - 'quality': qfunc(f['quality']), + 'quality': qfunc(f.get('quality')), 'source_preference': source_pref, } diff --git a/youtube_dl/extractor/bbccouk.py b/youtube_dl/extractor/bbccouk.py index 2d2f742ae..f690dc803 100644 --- a/youtube_dl/extractor/bbccouk.py +++ b/youtube_dl/extractor/bbccouk.py @@ -71,7 +71,20 @@ class BBCCoUkIE(SubtitlesInfoExtractor): 'skip_download': True, }, 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only', - }, + }, { + 'url': 'http://www.bbc.co.uk/programmes/b04v20dw', + 'info_dict': { + 'id': 'b04v209v', + 'ext': 'flv', + 'title': 'Pete Tong, The Essential New Tune Special', + 'description': "Pete has a very special mix - all of 2014's Essential New Tunes!", + 'duration': 10800, + }, + 'params': { + # rtmp download + 'skip_download': True, + } + } ] def _extract_asx_playlist(self, connection, programme_id): @@ -203,6 +216,59 @@ class BBCCoUkIE(SubtitlesInfoExtractor): return formats, subtitles + def _download_playlist(self, playlist_id): + try: + playlist = self._download_json( + 'http://www.bbc.co.uk/programmes/%s/playlist.json' % playlist_id, + playlist_id, 'Downloading playlist JSON') + + version = playlist.get('defaultAvailableVersion') + if version: + smp_config = version['smpConfig'] + title = smp_config['title'] + description = smp_config['summary'] + for item in smp_config['items']: + kind = item['kind'] + if kind != 'programme' and kind != 'radioProgramme': + continue + programme_id = item.get('vpid') + duration = int(item.get('duration')) + formats, subtitles = self._download_media_selector(programme_id) + return programme_id, title, description, duration, formats, subtitles + except ExtractorError as ee: + if not isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404: + raise + + # fallback to legacy playlist + playlist = self._download_xml( + 'http://www.bbc.co.uk/iplayer/playlist/%s' % playlist_id, + playlist_id, 'Downloading legacy playlist XML') + + no_items = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}noItems') + if no_items is not None: + reason = no_items.get('reason') + if reason == 'preAvailability': + msg = 'Episode %s is not yet available' % playlist_id + elif reason == 'postAvailability': + msg = 'Episode %s is no longer available' % playlist_id + elif reason == 'noMedia': + msg = 'Episode %s is not currently available' % playlist_id + else: + msg = 'Episode %s is not available: %s' % (playlist_id, reason) + raise ExtractorError(msg, expected=True) + + for item in self._extract_items(playlist): + kind = item.get('kind') + if kind != 'programme' and kind != 'radioProgramme': + continue + title = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}title').text + description = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}summary').text + programme_id = item.get('identifier') + duration = int(item.get('duration')) + formats, subtitles = self._download_media_selector(programme_id) + + return programme_id, title, description, duration, formats, subtitles + def _real_extract(self, url): group_id = self._match_id(url) @@ -219,32 +285,7 @@ class BBCCoUkIE(SubtitlesInfoExtractor): duration = player['duration'] formats, subtitles = self._download_media_selector(programme_id) else: - playlist = self._download_xml( - 'http://www.bbc.co.uk/iplayer/playlist/%s' % group_id, - group_id, 'Downloading playlist XML') - - no_items = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}noItems') - if no_items is not None: - reason = no_items.get('reason') - if reason == 'preAvailability': - msg = 'Episode %s is not yet available' % group_id - elif reason == 'postAvailability': - msg = 'Episode %s is no longer available' % group_id - elif reason == 'noMedia': - msg = 'Episode %s is not currently available' % group_id - else: - msg = 'Episode %s is not available: %s' % (group_id, reason) - raise ExtractorError(msg, expected=True) - - for item in self._extract_items(playlist): - kind = item.get('kind') - if kind != 'programme' and kind != 'radioProgramme': - continue - title = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}title').text - description = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}summary').text - programme_id = item.get('identifier') - duration = int(item.get('duration')) - formats, subtitles = self._download_media_selector(programme_id) + programme_id, title, description, duration, formats, subtitles = self._download_playlist(group_id) if self._downloader.params.get('listsubtitles', False): self._list_available_subtitles(programme_id, subtitles) diff --git a/youtube_dl/extractor/eroprofile.py b/youtube_dl/extractor/eroprofile.py new file mode 100644 index 000000000..79e2fbd39 --- /dev/null +++ b/youtube_dl/extractor/eroprofile.py @@ -0,0 +1,45 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class EroProfileIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?eroprofile\.com/m/videos/view/(?P[^/]+)' + _TEST = { + 'url': 'http://www.eroprofile.com/m/videos/view/sexy-babe-softcore', + 'md5': 'c26f351332edf23e1ea28ce9ec9de32f', + 'info_dict': { + 'id': '3733775', + 'display_id': 'sexy-babe-softcore', + 'ext': 'm4v', + 'title': 'sexy babe softcore', + 'thumbnail': 're:https?://.*\.jpg', + 'age_limit': 18, + } + } + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + video_id = self._search_regex( + [r"glbUpdViews\s*\('\d*','(\d+)'", r'p/report/video/(\d+)'], + webpage, 'video id', default=None) + + video_url = self._search_regex( + r'([^<]+)', webpage, 'title') + thumbnail = self._search_regex( + r'onclick="showVideoPlayer\(\)">\d+)' - _TEST = { - 'url': 'http://www.gameone.de/tv/288', - 'md5': '136656b7fb4c9cb4a8e2d500651c499b', - 'info_dict': { - 'id': '288', - 'ext': 'mp4', - 'title': 'Game One - Folge 288', - 'duration': 1238, - 'thumbnail': 'http://s3.gameone.de/gameone/assets/video_metas/teaser_images/000/643/636/big/640x360.jpg', - 'description': 'FIFA-Pressepokal 2014, Star Citizen, Kingdom Come: Deliverance, Project Cars, Schöner Trants Nerdquiz Folge 2 Runde 1', - 'age_limit': 16, - 'upload_date': '20140513', - 'timestamp': 1399980122, + _TESTS = [ + { + 'url': 'http://www.gameone.de/tv/288', + 'md5': '136656b7fb4c9cb4a8e2d500651c499b', + 'info_dict': { + 'id': '288', + 'ext': 'mp4', + 'title': 'Game One - Folge 288', + 'duration': 1238, + 'thumbnail': 'http://s3.gameone.de/gameone/assets/video_metas/teaser_images/000/643/636/big/640x360.jpg', + 'description': 'FIFA-Pressepokal 2014, Star Citizen, Kingdom Come: Deliverance, Project Cars, Schöner Trants Nerdquiz Folge 2 Runde 1', + 'age_limit': 16, + 'upload_date': '20140513', + 'timestamp': 1399980122, + } + }, + { + 'url': 'http://gameone.de/tv/220', + 'md5': '5227ca74c4ae6b5f74c0510a7c48839e', + 'info_dict': { + 'id': '220', + 'ext': 'mp4', + 'upload_date': '20120918', + 'description': 'Jet Set Radio HD, Tekken Tag Tournament 2, Source Filmmaker', + 'timestamp': 1347971451, + 'title': 'Game One - Folge 220', + 'duration': 896.62, + 'age_limit': 16, + } } - } + + ] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -66,13 +85,13 @@ class GameOneIE(InfoExtractor): video_id, 'Downloading media:content') rendition_items = content.findall('.//rendition') - duration = int(rendition_items[0].get('duration')) + duration = float_or_none(rendition_items[0].get('duration')) formats = [ { 'url': re.sub(r'.*/(r2)', RAW_MP4_URL + r'\1', r.find('./src').text), - 'width': int(r.get('width')), - 'height': int(r.get('height')), - 'tbr': int(r.get('bitrate')), + 'width': int_or_none(r.get('width')), + 'height': int_or_none(r.get('height')), + 'tbr': int_or_none(r.get('bitrate')), } for r in rendition_items ] @@ -105,7 +124,8 @@ class GameOnePlaylistIE(InfoExtractor): webpage = self._download_webpage('http://www.gameone.de/tv', 'TV') max_id = max(map(int, re.findall(r'(.+?)', - webpage, 'video title') + raw_title = self._html_search_regex( + r'(?s)(.+?)', + webpage, 'video title') title = raw_title.partition('-')[0].strip() - vid = self._html_search_regex(r'var vid ?= ?["\'](\d+)["\']', webpage, - 'video path') - data = _fetch_data(vid, mytv) - - QUALITIES = ('ori', 'super', 'high', 'nor') - vid_ids = [data['data'][q + 'Vid'] - for q in QUALITIES - if data['data'][q + 'Vid'] != 0] - if not vid_ids: - raise ExtractorError('No formats available for this video') + vid = self._html_search_regex( + r'var vid ?= ?["\'](\d+)["\']', + webpage, 'video path') + vid_data = _fetch_data(vid, mytv) - # For now, we just pick the highest available quality - vid_id = vid_ids[-1] + formats_json = {} + for format_id in ('nor', 'high', 'super', 'ori', 'h2644k', 'h2654k'): + vid_id = vid_data['data'].get('%sVid' % format_id) + if not vid_id: + continue + vid_id = compat_str(vid_id) + formats_json[format_id] = vid_data if vid == vid_id else _fetch_data(vid_id, mytv) - format_data = data if vid == vid_id else _fetch_data(vid_id, mytv) - part_count = format_data['data']['totalBlocks'] - allot = format_data['allot'] - prot = format_data['prot'] - clipsURL = format_data['data']['clipsURL'] - su = format_data['data']['su'] + part_count = vid_data['data']['totalBlocks'] playlist = [] for i in range(part_count): - part_url = ('http://%s/?prot=%s&file=%s&new=%s' % - (allot, prot, clipsURL[i], su[i])) - part_str = self._download_webpage( - part_url, video_id, - note='Downloading part %d of %d' % (i + 1, part_count)) - - part_info = part_str.split('|') - video_url = '%s%s?key=%s' % (part_info[0], su[i], part_info[3]) - - video_info = { - 'id': '%s_part%02d' % (video_id, i + 1), + formats = [] + for format_id, format_data in formats_json.items(): + allot = format_data['allot'] + prot = format_data['prot'] + + data = format_data['data'] + clips_url = data['clipsURL'] + su = data['su'] + + part_str = self._download_webpage( + 'http://%s/?prot=%s&file=%s&new=%s' % + (allot, prot, clips_url[i], su[i]), + video_id, + 'Downloading %s video URL part %d of %d' + % (format_id, i + 1, part_count)) + + part_info = part_str.split('|') + video_url = '%s%s?key=%s' % (part_info[0], su[i], part_info[3]) + + formats.append({ + 'url': video_url, + 'format_id': format_id, + 'filesize': data['clipsBytes'][i], + 'width': data['width'], + 'height': data['height'], + 'fps': data['fps'], + }) + self._sort_formats(formats) + + playlist.append({ + 'id': '%s_part%d' % (video_id, i + 1), 'title': title, - 'url': video_url, - 'ext': 'mp4', - } - playlist.append(video_info) + 'duration': vid_data['data']['clipsDuration'][i], + 'formats': formats, + }) if len(playlist) == 1: info = playlist[0] diff --git a/youtube_dl/extractor/sportdeutschland.py b/youtube_dl/extractor/sportdeutschland.py index 2f57f5b7c..1a57aebf1 100644 --- a/youtube_dl/extractor/sportdeutschland.py +++ b/youtube_dl/extractor/sportdeutschland.py @@ -60,9 +60,10 @@ class SportDeutschlandIE(InfoExtractor): categories = list(data.get('section', {}).get('tags', {}).values()) asset = data['asset'] + assets_info = self._download_json(asset['url'], video_id) formats = [] - smil_url = asset['video'] + smil_url = assets_info['video'] if '.smil' in smil_url: m3u8_url = smil_url.replace('.smil', '.m3u8') formats.extend( diff --git a/youtube_dl/extractor/sunporno.py b/youtube_dl/extractor/sunporno.py index 263f09b46..8a333f1d2 100644 --- a/youtube_dl/extractor/sunporno.py +++ b/youtube_dl/extractor/sunporno.py @@ -28,23 +28,27 @@ class SunPornoIE(InfoExtractor): } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - title = self._html_search_regex(r'([^<]+)', webpage, 'title') - description = self._html_search_meta('description', webpage, 'description') + title = self._html_search_regex( + r'([^<]+)', webpage, 'title') + description = self._html_search_meta( + 'description', webpage, 'description') thumbnail = self._html_search_regex( r'poster="([^"]+)"', webpage, 'thumbnail', fatal=False) duration = parse_duration(self._search_regex( - r'Duration:\s*(\d+:\d+)\s*<', webpage, 'duration', fatal=False)) + r'itemprop="duration">\s*(\d+:\d+)\s*<', + webpage, 'duration', fatal=False)) view_count = int_or_none(self._html_search_regex( - r'class="views">\s*(\d+)\s*<', webpage, 'view count', fatal=False)) + r'class="views">\s*(\d+)\s*<', + webpage, 'view count', fatal=False)) comment_count = int_or_none(self._html_search_regex( - r'(\d+) Comments?', webpage, 'comment count', fatal=False)) + r'(\d+) Comments?', + webpage, 'comment count', fatal=False)) formats = [] quality = qualities(['mp4', 'flv']) diff --git a/youtube_dl/extractor/teletask.py b/youtube_dl/extractor/teletask.py new file mode 100644 index 000000000..e54145105 --- /dev/null +++ b/youtube_dl/extractor/teletask.py @@ -0,0 +1,53 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import unified_strdate + + +class TeleTaskIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?tele-task\.de/archive/video/html5/(?P[0-9]+)' + _TEST = { + 'url': 'http://www.tele-task.de/archive/video/html5/26168/', + 'info_dict': { + 'title': 'Duplicate Detection', + }, + 'playlist': [{ + 'md5': '290ef69fb2792e481169c3958dbfbd57', + 'info_dict': { + 'id': '26168-speaker', + 'ext': 'mp4', + 'title': 'Duplicate Detection', + 'upload_date': '20141218', + } + }, { + 'md5': 'e1e7218c5f0e4790015a437fcf6c71b4', + 'info_dict': { + 'id': '26168-slides', + 'ext': 'mp4', + 'title': 'Duplicate Detection', + 'upload_date': '20141218', + } + }] + } + + def _real_extract(self, url): + lecture_id = self._match_id(url) + + webpage = self._download_webpage(url, lecture_id) + + title = self._html_search_regex( + r'itemprop="name">([^<]+)', webpage, 'title') + upload_date = unified_strdate(self._html_search_regex( + r'Date:([^<]+)', webpage, 'date', fatal=False)) + + entries = [{ + 'id': '%s-%s' % (lecture_id, format_id), + 'url': video_url, + 'title': title, + 'upload_date': upload_date, + } for format_id, video_url in re.findall( + r'