X-Git-Url: http://git.cielonegro.org/gitweb.cgi?a=blobdiff_plain;ds=sidebyside;f=youtube_dl%2Fextractor%2Fgeneric.py;h=dc5755d1240a493ba7a9ab85e16c67ab4881edac;hb=ed9a25dd612fb06d9cf007a6491ac9982535a8f9;hp=fbbc79a574ca03f1e483738a726f2fde0bf6b21d;hpb=f076b63821100af17379ed8b470e508846bb8432;p=youtube-dl.git diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index fbbc79a57..dc5755d12 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -26,6 +26,7 @@ from ..utils import ( unsmuggle_url, UnsupportedError, url_basename, + xpath_text, ) from .brightcove import BrightcoveIE from .ooyala import OoyalaIE @@ -473,6 +474,7 @@ class GenericIE(InfoExtractor): { 'url': 'http://discourse.ubuntu.com/t/unity-8-desktop-mode-windows-on-mir/1986', 'info_dict': { + 'id': '1986', 'title': 'Unity 8 desktop-mode windows on Mir! - Ubuntu Discourse', }, 'playlist_mincount': 2, @@ -524,6 +526,98 @@ class GenericIE(InfoExtractor): 'upload_date': '20150126', }, 'add_ie': ['Viddler'], + }, + # jwplayer YouTube + { + 'url': 'http://media.nationalarchives.gov.uk/index.php/webinar-using-discovery-national-archives-online-catalogue/', + 'info_dict': { + 'id': 'Mrj4DVp2zeA', + 'ext': 'mp4', + 'upload_date': '20150212', + 'uploader': 'The National Archives UK', + 'description': 'md5:a236581cd2449dd2df4f93412f3f01c6', + 'uploader_id': 'NationalArchives08', + 'title': 'Webinar: Using Discovery, The National Archives’ online catalogue', + }, + }, + # rtl.nl embed + { + 'url': 'http://www.rtlnieuws.nl/nieuws/buitenland/aanslagen-kopenhagen', + 'playlist_mincount': 5, + 'info_dict': { + 'id': 'aanslagen-kopenhagen', + 'title': 'Aanslagen Kopenhagen | RTL Nieuws', + } + }, + # Zapiks embed + { + 'url': 'http://www.skipass.com/news/116090-bon-appetit-s5ep3-baqueira-mi-cor.html', + 'info_dict': { + 'id': '118046', + 'ext': 'mp4', + 'title': 'EP3S5 - Bon Appétit - Baqueira Mi Corazon !', + } + }, + # Kaltura embed + { + 'url': 'http://www.monumentalnetwork.com/videos/john-carlson-postgame-2-25-15', + 'info_dict': { + 'id': '1_eergr3h1', + 'ext': 'mp4', + 'upload_date': '20150226', + 'uploader_id': 'MonumentalSports-Kaltura@perfectsensedigital.com', + 'timestamp': int, + 'title': 'John Carlson Postgame 2/25/15', + }, + }, + # Eagle.Platform embed (generic URL) + { + 'url': 'http://lenta.ru/news/2015/03/06/navalny/', + 'info_dict': { + 'id': '227304', + 'ext': 'mp4', + 'title': 'Навальный вышел на свободу', + 'description': 'md5:d97861ac9ae77377f3f20eaf9d04b4f5', + 'thumbnail': 're:^https?://.*\.jpg$', + 'duration': 87, + 'view_count': int, + 'age_limit': 0, + }, + }, + # ClipYou (Eagle.Platform) embed (custom URL) + { + 'url': 'http://muz-tv.ru/play/7129/', + 'info_dict': { + 'id': '12820', + 'ext': 'mp4', + 'title': "'O Sole Mio", + 'thumbnail': 're:^https?://.*\.jpg$', + 'duration': 216, + 'view_count': int, + }, + }, + # Pladform embed + { + 'url': 'http://muz-tv.ru/kinozal/view/7400/', + 'info_dict': { + 'id': '100183293', + 'ext': 'mp4', + 'title': 'Тайны перевала Дятлова • Тайна перевала Дятлова 1 серия 2 часть', + 'description': 'Документальный сериал-расследование одной из самых жутких тайн ХХ века', + 'thumbnail': 're:^https?://.*\.jpg$', + 'duration': 694, + 'age_limit': 0, + }, + }, + # RSS feed with enclosure + { + 'url': 'http://podcastfeeds.nbcnews.com/audio/podcast/MSNBC-MADDOW-NETCAST-M4V.xml', + 'info_dict': { + 'id': 'pdv_maddow_netcast_m4v-02-27-2015-201624', + 'ext': 'm4v', + 'upload_date': '20150228', + 'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624', + } } ] @@ -536,11 +630,24 @@ class GenericIE(InfoExtractor): playlist_desc_el = doc.find('./channel/description') playlist_desc = None if playlist_desc_el is None else playlist_desc_el.text - entries = [{ - '_type': 'url', - 'url': e.find('link').text, - 'title': e.find('title').text, - } for e in doc.findall('./channel/item')] + entries = [] + for it in doc.findall('./channel/item'): + next_url = xpath_text(it, 'link', fatal=False) + if not next_url: + enclosure_nodes = it.findall('./enclosure') + for e in enclosure_nodes: + next_url = e.attrib.get('url') + if next_url: + break + + if not next_url: + continue + + entries.append({ + '_type': 'url', + 'url': next_url, + 'title': it.find('title').text, + }) return { '_type': 'playlist', @@ -769,6 +876,13 @@ class GenericIE(InfoExtractor): 'entries': entries, } + # Look for embedded rtl.nl player + matches = re.findall( + r']+?src=(["\'])(?P(?:https?:)?//player\.vimeo\.com/video/.+?)\1', webpage) @@ -776,7 +890,6 @@ class GenericIE(InfoExtractor): player_url = unescapeHTML(mobj.group('url')) surl = smuggle_url(player_url, {'Referer': url}) return self.url_result(surl) - # Look for embedded (swf embed) Vimeo player mobj = re.search( r']+?src="((?:https?:)?//(?:www\.)?vimeo\.com/moogaloop\.swf.+?)"', webpage) @@ -1034,7 +1147,12 @@ class GenericIE(InfoExtractor): # Look for embedded sbs.com.au player mobj = re.search( - r']+?src=(["\'])(?Phttps?://(?:www\.)sbs\.com\.au/ondemand/video/single/.+?)\1', + r'''(?x) + (?: + ]+?src= + ) + (["\'])(?Phttps?://(?:www\.)?sbs\.com\.au/ondemand/video/.+?)\1''', webpage) if mobj is not None: return self.url_result(mobj.group('url'), 'SBS') @@ -1064,7 +1182,39 @@ class GenericIE(InfoExtractor): if mobj is not None: return self.url_result(mobj.group('url'), 'Livestream') + # Look for Zapiks embed + mobj = re.search( + r']+src="(?Phttps?://(?:www\.)?zapiks\.fr/index\.php\?.+?)"', webpage) + if mobj is not None: + return self.url_result(mobj.group('url'), 'Zapiks') + + # Look for Kaltura embeds + mobj = re.search( + r"(?s)kWidget\.(?:thumb)?[Ee]mbed\(\{.*?'wid'\s*:\s*'_?(?P[^']+)',.*?'entry_id'\s*:\s*'(?P[^']+)',", webpage) + if mobj is not None: + return self.url_result('kaltura:%(partner_id)s:%(id)s' % mobj.groupdict(), 'Kaltura') + + # Look for Eagle.Platform embeds + mobj = re.search( + r']+src="(?Phttps?://.+?\.media\.eagleplatform\.com/index/player\?.+?)"', webpage) + if mobj is not None: + return self.url_result(mobj.group('url'), 'EaglePlatform') + + # Look for ClipYou (uses Eagle.Platform) embeds + mobj = re.search( + r']+src="https?://(?Pmedia\.clipyou\.ru)/index/player\?.*\brecord_id=(?P\d+).*"', webpage) + if mobj is not None: + return self.url_result('eagleplatform:%(host)s:%(id)s' % mobj.groupdict(), 'EaglePlatform') + + # Look for Pladform embeds + mobj = re.search( + r']+src="(?Phttps?://out\.pladform\.ru/player\?.+?)"', webpage) + if mobj is not None: + return self.url_result(mobj.group('url'), 'Pladform') + def check_video(vurl): + if YoutubeIE.suitable(vurl): + return True vpath = compat_urlparse.urlparse(vurl).path vext = determine_ext(vpath) return '.' in vpath and vext not in ('swf', 'png', 'jpg', 'srt', 'sbv', 'sub', 'vtt', 'ttml') @@ -1082,7 +1232,8 @@ class GenericIE(InfoExtractor): JWPlayerOptions| jwplayer\s*\(\s*["'][^'"]+["']\s*\)\s*\.setup ) - .*?file\s*:\s*["\'](.*?)["\']''', webpage)) + .*? + ['"]?file['"]?\s*:\s*["\'](.*?)["\']''', webpage)) if not found: # Broaden the search a little bit found = filter_video(re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)) @@ -1117,10 +1268,16 @@ class GenericIE(InfoExtractor): # HTML5 video found = re.findall(r'(?s).*?]*)?\s+src=["\'](.*?)["\']', webpage) if not found: + REDIRECT_REGEX = r'[0-9]{,2};\s*(?:URL|url)=\'?([^\'"]+)' found = re.search( r'(?i)