X-Git-Url: http://git.cielonegro.org/gitweb.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fgeneric.py;h=41884ed7a5b89e021eb6515dc8b8accdd2816136;hb=cb454b333d91718a0c2b36c34c8b0d6858ff9505;hp=328301de396e5dd289b139808754ef20e1af652b;hpb=4e262a8838d487362a85eb8b8693d2fa84899f17;p=youtube-dl.git diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 328301de3..41884ed7a 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -17,12 +17,14 @@ from ..utils import ( ExtractorError, float_or_none, HEADRequest, + is_html, orderedSet, parse_xml, smuggle_url, unescapeHTML, unified_strdate, unsmuggle_url, + UnsupportedError, url_basename, ) from .brightcove import BrightcoveIE @@ -130,12 +132,13 @@ class GenericIE(InfoExtractor): # ooyala video { 'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219', - 'md5': '5644c6ca5d5782c1d0d350dad9bd840c', + 'md5': '166dd577b433b4d4ebfee10b0824d8ff', 'info_dict': { 'id': 'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ', 'ext': 'mp4', 'title': '2cc213299525360.mov', # that's what we get }, + 'add_ie': ['Ooyala'], }, # google redirect { @@ -145,7 +148,7 @@ class GenericIE(InfoExtractor): 'ext': 'mp4', 'upload_date': '20130224', 'uploader_id': 'TheVerge', - 'description': 'Chris Ziegler takes a look at the Alcatel OneTouch Fire and the ZTE Open; two of the first Firefox OS handsets to be officially announced.', + 'description': 're:^Chris Ziegler takes a look at the\.*', 'uploader': 'The Verge', 'title': 'First Firefox OS phones side-by-side', }, @@ -180,6 +183,14 @@ class GenericIE(InfoExtractor): 'description': 'Episode 18: President Barack Obama sits down with Zach Galifianakis for his most memorable interview yet.', }, }, + # BBC iPlayer embeds + { + 'url': 'http://www.bbc.co.uk/blogs/adamcurtis/posts/BUGGER', + 'info_dict': { + 'title': 'BBC - Blogs - Adam Curtis - BUGGER', + }, + 'playlist_mincount': 18, + }, # RUTV embed { 
'url': 'http://www.rg.ru/2014/03/15/reg-dfo/anklav-anons.html', @@ -351,7 +362,7 @@ class GenericIE(InfoExtractor): 'info_dict': { 'id': 'http://phihag.de/2014/youtube-dl/rss2.xml', 'title': 'Zero Punctuation', - 'description': 're:' + 'description': 're:.*groundbreaking video review series.*' }, 'playlist_mincount': 11, }, @@ -467,8 +478,40 @@ class GenericIE(InfoExtractor): 'expected_warnings': [ 'URL could be a direct video link, returning it as such.' ] + }, + # Cinchcast embed + { + 'url': 'http://undergroundwellness.com/podcasts/306-5-steps-to-permanent-gut-healing/', + 'info_dict': { + 'id': '7141703', + 'ext': 'mp3', + 'upload_date': '20141126', + 'title': 'Jack Tips: 5 Steps to Permanent Gut Healing', + } + }, + # Cinerama player + { + 'url': 'http://www.abc.net.au/7.30/content/2015/s4164797.htm', + 'info_dict': { + 'id': '730m_DandD_1901_512k', + 'ext': 'mp4', + 'uploader': 'www.abc.net.au', + 'title': 'Game of Thrones with dice - Dungeons and Dragons fantasy role-playing game gets new life - 19/01/2015', + } + }, + # embedded viddler video + { + 'url': 'http://deadspin.com/i-cant-stop-watching-john-wall-chop-the-nuggets-with-th-1681801597', + 'info_dict': { + 'id': '4d03aad9', + 'ext': 'mp4', + 'uploader': 'deadspin', + 'title': 'WALL-TO-GORTAT', + 'timestamp': 1422285291, + 'upload_date': '20150126', + }, + 'add_ie': ['Viddler'], } - ] def report_following_redirect(self, new_url): @@ -628,7 +671,7 @@ class GenericIE(InfoExtractor): # Maybe it's a direct link to a video? # Be careful not to download the whole thing! 
first_bytes = full_response.read(512) - if not re.match(r'^\s*<', first_bytes.decode('utf-8', 'replace')): + if not is_html(first_bytes): self._downloader.report_warning( 'URL could be a direct video link, returning it as such.') upload_date = unified_strdate( @@ -689,9 +732,9 @@ class GenericIE(InfoExtractor): r'^(?:https?://)?([^/]*)/.*', url, 'video uploader') # Helper method - def _playlist_from_matches(matches, getter, ie=None): + def _playlist_from_matches(matches, getter=None, ie=None): urlrs = orderedSet( - self.url_result(self._proto_relative_url(getter(m)), ie) + self.url_result(self._proto_relative_url(getter(m) if getter else m), ie) for m in matches) return self.playlist_result( urlrs, playlist_id=video_id, playlist_title=video_title) @@ -830,9 +873,16 @@ class GenericIE(InfoExtractor): if mobj is not None: return self.url_result(mobj.group('url')) + # Look for embedded Viddler player + mobj = re.search( + r'<(?:iframe[^>]+?src|param[^>]+?value)=(["\'])(?P<url>(?:https?:)?//(?:www\.)?viddler\.com/(?:embed|player)/.+?)\1', + webpage) + if mobj is not None: + return self.url_result(mobj.group('url')) + # Look for Ooyala videos - mobj = (re.search(r'player.ooyala.com/[^"?]+\?[^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) or - re.search(r'OO.Player.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage)) + mobj = (re.search(r'player\.ooyala\.com/[^"?]+\?[^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) or + re.search(r'OO\.Player\.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage)) if mobj is not None: return OoyalaIE._build_url_result(mobj.group('ec')) @@ -895,6 +945,11 @@ class GenericIE(InfoExtractor): return _playlist_from_matches( matches, getter=unescapeHTML, ie='FunnyOrDie') + # Look for BBC iPlayer embed + matches = re.findall(r'setPlaylist\("(https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)', webpage) + if matches: + return _playlist_from_matches(matches, ie='BBCCoUk') + # Look for embedded RUTV player rutv_url = RUTVIE._extract_url(webpage) if 
rutv_url: @@ -902,7 +957,7 @@ class GenericIE(InfoExtractor): # Look for embedded TED player mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>http://embed\.ted\.com/.+?)\1', webpage) + r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed(?:-ssl)?\.ted\.com/.+?)\1', webpage) if mobj is not None: return self.url_result(mobj.group('url'), 'TED') @@ -962,6 +1017,13 @@ class GenericIE(InfoExtractor): if mobj is not None: return self.url_result(mobj.group('url'), 'SBS') + # Look for embedded Cinchcast player + mobj = re.search( + r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.cinchcast\.com/.+?)\1', + webpage) + if mobj is not None: + return self.url_result(mobj.group('url'), 'Cinchcast') + mobj = re.search( r'<iframe[^>]+?src=(["\'])(?P<url>https?://m(?:lb)?\.mlb\.com/shared/video/embed/embed\.html\?.+?)\1', webpage) @@ -1014,6 +1076,10 @@ class GenericIE(InfoExtractor): \s*{[^}]+? ["']?clip["']?\s*:\s*\{\s* ["']?url["']?\s*:\s*["']([^"']+)["'] ''', webpage)) + if not found: + # Cinerama player + found = re.findall( + r"cinerama\.embedPlayer\(\s*\'[^']+\',\s*'([^']+)'", webpage) if not found: # Try to find twitter cards info found = filter_video(re.findall( @@ -1041,7 +1107,7 @@ class GenericIE(InfoExtractor): 'url': new_url, } if not found: - raise ExtractorError('Unsupported URL: %s' % url) + raise UnsupportedError(url) entries = [] for video_url in found: