float_or_none,
HEADRequest,
is_html,
+ js_to_json,
orderedSet,
sanitized_Request,
smuggle_url,
UnsupportedError,
xpath_text,
)
+from .commonprotocols import RtmpIE
from .brightcove import (
BrightcoveLegacyIE,
BrightcoveNewIE,
from .twentymin import TwentyMinutenIE
from .ustream import UstreamIE
from .openload import OpenloadIE
+from .videopress import VideoPressIE
+from .rutube import RutubeIE
+from .limelight import LimelightBaseIE
class GenericIE(InfoExtractor):
},
}],
},
+ {
+ # Brightcove with UUID in videoPlayer
+ 'url': 'http://www8.hp.com/cn/zh/home.html',
+ 'info_dict': {
+ 'id': '5255815316001',
+ 'ext': 'mp4',
+ 'title': 'Sprocket Video - China',
+ 'description': 'Sprocket Video - China',
+ 'uploader': 'HP-Video Gallery',
+ 'timestamp': 1482263210,
+ 'upload_date': '20161220',
+ 'uploader_id': '1107601872001',
+ },
+ 'params': {
+ 'skip_download': True, # m3u8 download
+ },
+ 'skip': 'video rotates...weekly?',
+ },
+ {
+ # Brightcove:new type [2].
+ 'url': 'http://www.delawaresportszone.com/video-st-thomas-more-earns-first-trip-to-basketball-semis',
+ 'md5': '2b35148fcf48da41c9fb4591650784f3',
+ 'info_dict': {
+ 'id': '5348741021001',
+ 'ext': 'mp4',
+ 'upload_date': '20170306',
+ 'uploader_id': '4191638492001',
+ 'timestamp': 1488769918,
+ 'title': 'VIDEO: St. Thomas More earns first trip to basketball semis',
+
+ },
+ },
+ {
+ # Alternative brightcove <video> attributes
+ 'url': 'http://www.programme-tv.net/videos/extraits/81095-guillaume-canet-evoque-les-rumeurs-d-infidelite-de-marion-cotillard-avec-brad-pitt-dans-vivement-dimanche/',
+ 'info_dict': {
+ 'id': '81095-guillaume-canet-evoque-les-rumeurs-d-infidelite-de-marion-cotillard-avec-brad-pitt-dans-vivement-dimanche',
+ 'title': "Guillaume Canet évoque les rumeurs d'infidélité de Marion Cotillard avec Brad Pitt dans Vivement Dimanche, Extraits : toutes les vidéos avec Télé-Loisirs",
+ },
+ 'playlist': [{
+ 'md5': '732d22ba3d33f2f3fc253c39f8f36523',
+ 'info_dict': {
+ 'id': '5311302538001',
+ 'ext': 'mp4',
+ 'title': "Guillaume Canet évoque les rumeurs d'infidélité de Marion Cotillard avec Brad Pitt dans Vivement Dimanche",
+ 'description': "Guillaume Canet évoque les rumeurs d'infidélité de Marion Cotillard avec Brad Pitt dans Vivement Dimanche (France 2, 5 février 2017)",
+ 'timestamp': 1486321708,
+ 'upload_date': '20170205',
+ 'uploader_id': '800000640001',
+ },
+ 'only_matching': True,
+ }],
+ },
+ {
+ # Brightcove with UUID in videoPlayer
+ 'url': 'http://www8.hp.com/cn/zh/home.html',
+ 'info_dict': {
+ 'id': '5255815316001',
+ 'ext': 'mp4',
+ 'title': 'Sprocket Video - China',
+ 'description': 'Sprocket Video - China',
+ 'uploader': 'HP-Video Gallery',
+ 'timestamp': 1482263210,
+ 'upload_date': '20161220',
+ 'uploader_id': '1107601872001',
+ },
+ 'params': {
+ 'skip_download': True, # m3u8 download
+ },
+ },
# ooyala video
{
'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219',
'skip_download': True,
}
},
+ # YouTube <object> embed
+ {
+ 'url': 'http://www.improbable.com/2017/04/03/untrained-modern-youths-and-ancient-masters-in-selfie-portraits/',
+ 'md5': '516718101ec834f74318df76259fb3cc',
+ 'info_dict': {
+ 'id': 'msN87y-iEx0',
+ 'ext': 'webm',
+ 'title': 'Feynman: Mirrors FUN TO IMAGINE 6',
+ 'upload_date': '20080526',
+ 'description': 'md5:0ffc78ea3f01b2e2c247d5f8d1d3c18d',
+ 'uploader': 'Christopher Sykes',
+ 'uploader_id': 'ChristopherJSykes',
+ },
+ 'add_ie': ['Youtube'],
+ },
# Camtasia studio
{
'url': 'http://www.ll.mit.edu/workshops/education/videocourses/antennas/lecture1/video/',
},
# LazyYT
{
- 'url': 'http://discourse.ubuntu.com/t/unity-8-desktop-mode-windows-on-mir/1986',
+ 'url': 'https://skiplagged.com/',
'info_dict': {
- 'id': '1986',
- 'title': 'Unity 8 desktop-mode windows on Mir! - Ubuntu Discourse',
+ 'id': 'skiplagged',
+ 'title': 'Skiplagged: The smart way to find cheap flights',
},
- 'playlist_mincount': 2,
+ 'playlist_mincount': 1,
+ 'add_ie': ['Youtube'],
},
# Cinchcast embed
{
'title': 'Webinar: Using Discovery, The National Archives’ online catalogue',
},
},
+ # jwplayer rtmp
+ {
+ 'url': 'http://www.suffolk.edu/sjc/',
+ 'info_dict': {
+ 'id': 'sjclive',
+ 'ext': 'flv',
+ 'title': 'Massachusetts Supreme Judicial Court Oral Arguments',
+ 'uploader': 'www.suffolk.edu',
+ },
+ 'params': {
+ 'skip_download': True,
+ }
+ },
+ # Complex jwplayer
+ {
+ 'url': 'http://www.indiedb.com/games/king-machine/videos',
+ 'info_dict': {
+ 'id': 'videos',
+ 'ext': 'mp4',
+ 'title': 'king machine trailer 1',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ },
+ },
+ {
+ # JWPlayer config passed as variable
+ 'url': 'http://www.txxx.com/videos/3326530/ariele/',
+ 'info_dict': {
+ 'id': '3326530_hq',
+ 'ext': 'mp4',
+ 'title': 'ARIELE | Tube Cup',
+ 'uploader': 'www.txxx.com',
+ 'age_limit': 18,
+ },
+ 'params': {
+ 'skip_download': True,
+ }
+ },
# rtl.nl embed
{
'url': 'http://www.rtlnieuws.nl/nieuws/buitenland/aanslagen-kopenhagen',
'title': 'Os Guinness // Is It Fools Talk? // Unbelievable? Conference 2014',
},
},
- # Kaltura embed protected with referrer
- {
- 'url': 'http://www.disney.nl/disney-channel/filmpjes/achter-de-schermen#/videoId/violetta-achter-de-schermen-ruggero',
- 'info_dict': {
- 'id': '1_g4fbemnq',
- 'ext': 'mp4',
- 'title': 'Violetta - Achter De Schermen - Ruggero',
- 'description': 'Achter de schermen met Ruggero',
- 'timestamp': 1435133761,
- 'upload_date': '20150624',
- 'uploader_id': 'echojecka',
- },
- },
# Kaltura embed with single quotes
{
'url': 'http://fod.infobase.com/p_ViewPlaylist.aspx?AssignmentID=NUN8ZY',
},
'add_ie': ['Kaltura'],
},
+ {
+ # Kaltura iframe embed
+ 'url': 'http://www.gsd.harvard.edu/event/i-m-pei-a-centennial-celebration/',
+ 'md5': 'ae5ace8eb09dc1a35d03b579a9c2cc44',
+ 'info_dict': {
+ 'id': '0_f2cfbpwy',
+ 'ext': 'mp4',
+ 'title': 'I. M. Pei: A Centennial Celebration',
+ 'description': 'md5:1db8f40c69edc46ca180ba30c567f37c',
+ 'upload_date': '20170403',
+ 'uploader_id': 'batchUser',
+ 'timestamp': 1491232186,
+ },
+ 'add_ie': ['Kaltura'],
+ },
# Eagle.Platform embed (generic URL)
{
'url': 'http://lenta.ru/news/2015/03/06/navalny/',
'skip_download': True,
},
'add_ie': [TwentyMinutenIE.ie_key()],
- }
+ },
+ {
+ # VideoPress embed
+ 'url': 'https://en.support.wordpress.com/videopress/',
+ 'info_dict': {
+ 'id': 'OcobLTqC',
+ 'ext': 'm4v',
+ 'title': 'IMG_5786',
+ 'timestamp': 1435711927,
+ 'upload_date': '20150701',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ 'add_ie': [VideoPressIE.ie_key()],
+ },
+ {
+ # Rutube embed
+ 'url': 'http://magazzino.friday.ru/videos/vipuski/kazan-2',
+ 'info_dict': {
+ 'id': '9b3d5bee0a8740bf70dfd29d3ea43541',
+ 'ext': 'flv',
+ 'title': 'Магаззино: Казань 2',
+ 'description': 'md5:99bccdfac2269f0e8fdbc4bbc9db184a',
+ 'uploader': 'Магаззино',
+ 'upload_date': '20170228',
+ 'uploader_id': '996642',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ 'add_ie': [RutubeIE.ie_key()],
+ },
+ {
+ # ThePlatform embedded with whitespaces in URLs
+ 'url': 'http://www.golfchannel.com/topics/shows/golftalkcentral.htm',
+ 'only_matching': True,
+ },
+ {
+ # Senate ISVP iframe https
+ 'url': 'https://www.hsgac.senate.gov/hearings/canadas-fast-track-refugee-plan-unanswered-questions-and-implications-for-us-national-security',
+ 'md5': 'fb8c70b0b515e5037981a2492099aab8',
+ 'info_dict': {
+ 'id': 'govtaff020316',
+ 'ext': 'mp4',
+ 'title': 'Integrated Senate Video Player',
+ },
+ 'add_ie': [SenateISVPIE.ie_key()],
+ },
+ {
+ # Limelight embeds (1 channel embed + 4 media embeds)
+ 'url': 'http://www.sedona.com/FacilitatorTraining2017',
+ 'info_dict': {
+ 'id': 'FacilitatorTraining2017',
+ 'title': 'Facilitator Training 2017',
+ },
+ 'playlist_mincount': 5,
+ },
# {
# # TODO: find another test
# # http://schema.org/VideoObject
continue
entries.append({
- '_type': 'url',
+ '_type': 'url_transparent',
'url': next_url,
'title': it.find('title').text,
})
video_description = self._og_search_description(webpage, default=None)
video_thumbnail = self._og_search_thumbnail(webpage, default=None)
- # Helper method
- def _playlist_from_matches(matches, getter=None, ie=None):
- urlrs = orderedSet(
- self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
- for m in matches)
- return self.playlist_result(
- urlrs, playlist_id=video_id, playlist_title=video_title)
-
# Look for Brightcove Legacy Studio embeds
bc_urls = BrightcoveLegacyIE._extract_brightcove_urls(webpage)
if bc_urls:
- self.to_screen('Brightcove video detected.')
entries = [{
'_type': 'url',
'url': smuggle_url(bc_url, {'Referer': url}),
}
# Look for Brightcove New Studio embeds
- bc_urls = BrightcoveNewIE._extract_urls(webpage)
+ bc_urls = BrightcoveNewIE._extract_urls(self, webpage)
if bc_urls:
- return _playlist_from_matches(bc_urls, ie='BrightcoveNew')
+ return self.playlist_from_matches(bc_urls, video_id, video_title, ie='BrightcoveNew')
# Look for ThePlatform embeds
tp_urls = ThePlatformIE._extract_urls(webpage)
if tp_urls:
- return _playlist_from_matches(tp_urls, ie='ThePlatform')
+ return self.playlist_from_matches(tp_urls, video_id, video_title, ie='ThePlatform')
# Look for Vessel embeds
vessel_urls = VesselIE._extract_urls(webpage)
if vessel_urls:
- return _playlist_from_matches(vessel_urls, ie=VesselIE.ie_key())
+ return self.playlist_from_matches(vessel_urls, video_id, video_title, ie=VesselIE.ie_key())
# Look for embedded rtl.nl player
matches = re.findall(
r'<iframe[^>]+?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+(?:video_)?embed[^"]+)"',
webpage)
if matches:
- return _playlist_from_matches(matches, ie='RtlNl')
+ return self.playlist_from_matches(matches, video_id, video_title, ie='RtlNl')
vimeo_urls = VimeoIE._extract_urls(url, webpage)
if vimeo_urls:
- return _playlist_from_matches(vimeo_urls, ie=VimeoIE.ie_key())
+ return self.playlist_from_matches(vimeo_urls, video_id, video_title, ie=VimeoIE.ie_key())
vid_me_embed_url = self._search_regex(
r'src=[\'"](https?://vid\.me/[^\'"]+)[\'"]',
data-video-url=|
<embed[^>]+?src=|
embedSWF\(?:\s*|
+ <object[^>]+data=|
new\s+SWFObject\(
)
(["\'])
(?:embed|v|p)/.+?)
\1''', webpage)
if matches:
- return _playlist_from_matches(
- matches, lambda m: unescapeHTML(m[1]))
+ return self.playlist_from_matches(
+ matches, video_id, video_title, lambda m: unescapeHTML(m[1]))
# Look for lazyYT YouTube embed
matches = re.findall(
r'class="lazyYT" data-youtube-id="([^"]+)"', webpage)
if matches:
- return _playlist_from_matches(matches, lambda m: unescapeHTML(m))
+ return self.playlist_from_matches(matches, video_id, video_title, lambda m: unescapeHTML(m))
# Look for Wordpress "YouTube Video Importer" plugin
matches = re.findall(r'''(?x)<div[^>]+
class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+
data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage)
if matches:
- return _playlist_from_matches(matches, lambda m: m[-1])
+ return self.playlist_from_matches(matches, video_id, video_title, lambda m: m[-1])
matches = DailymotionIE._extract_urls(webpage)
if matches:
- return _playlist_from_matches(matches)
+ return self.playlist_from_matches(matches, video_id, video_title)
# Look for embedded Dailymotion playlist player (#3822)
m = re.search(
playlists = re.findall(
r'list\[\]=/playlist/([^/]+)/', unescapeHTML(m.group('url')))
if playlists:
- return _playlist_from_matches(
- playlists, lambda p: '//dailymotion.com/playlist/%s' % p)
+ return self.playlist_from_matches(
+ playlists, video_id, video_title, lambda p: '//dailymotion.com/playlist/%s' % p)
# Look for embedded Wistia player
match = re.search(
if mobj is not None:
embeds = self._parse_json(mobj.group(1), video_id, fatal=False)
if embeds:
- return _playlist_from_matches(
- embeds, getter=lambda v: OoyalaIE._url_for_embed_code(smuggle_url(v['provider_video_id'], {'domain': url})), ie='Ooyala')
+ return self.playlist_from_matches(
+ embeds, video_id, video_title,
+ getter=lambda v: OoyalaIE._url_for_embed_code(smuggle_url(v['provider_video_id'], {'domain': url})), ie='Ooyala')
# Look for Aparat videos
mobj = re.search(r'<iframe .*?src="(http://www\.aparat\.com/video/[^"]+)"', webpage)
# Look for funnyordie embed
matches = re.findall(r'<iframe[^>]+?src="(https?://(?:www\.)?funnyordie\.com/embed/[^"]+)"', webpage)
if matches:
- return _playlist_from_matches(
- matches, getter=unescapeHTML, ie='FunnyOrDie')
+ return self.playlist_from_matches(
+ matches, video_id, video_title, getter=unescapeHTML, ie='FunnyOrDie')
# Look for BBC iPlayer embed
matches = re.findall(r'setPlaylist\("(https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)', webpage)
if matches:
- return _playlist_from_matches(matches, ie='BBCCoUk')
+ return self.playlist_from_matches(matches, video_id, video_title, ie='BBCCoUk')
# Look for embedded RUTV player
rutv_url = RUTVIE._extract_url(webpage)
# Look for embedded SportBox player
sportbox_urls = SportBoxEmbedIE._extract_urls(webpage)
if sportbox_urls:
- return _playlist_from_matches(sportbox_urls, ie='SportBoxEmbed')
+ return self.playlist_from_matches(sportbox_urls, video_id, video_title, ie='SportBoxEmbed')
# Look for embedded XHamster player
xhamster_urls = XHamsterEmbedIE._extract_urls(webpage)
if xhamster_urls:
- return _playlist_from_matches(xhamster_urls, ie='XHamsterEmbed')
+ return self.playlist_from_matches(xhamster_urls, video_id, video_title, ie='XHamsterEmbed')
# Look for embedded TNAFlixNetwork player
tnaflix_urls = TNAFlixNetworkEmbedIE._extract_urls(webpage)
if tnaflix_urls:
- return _playlist_from_matches(tnaflix_urls, ie=TNAFlixNetworkEmbedIE.ie_key())
+ return self.playlist_from_matches(tnaflix_urls, video_id, video_title, ie=TNAFlixNetworkEmbedIE.ie_key())
# Look for embedded PornHub player
pornhub_urls = PornHubIE._extract_urls(webpage)
if pornhub_urls:
- return _playlist_from_matches(pornhub_urls, ie=PornHubIE.ie_key())
+ return self.playlist_from_matches(pornhub_urls, video_id, video_title, ie=PornHubIE.ie_key())
# Look for embedded DrTuber player
drtuber_urls = DrTuberIE._extract_urls(webpage)
if drtuber_urls:
- return _playlist_from_matches(drtuber_urls, ie=DrTuberIE.ie_key())
+ return self.playlist_from_matches(drtuber_urls, video_id, video_title, ie=DrTuberIE.ie_key())
# Look for embedded RedTube player
redtube_urls = RedTubeIE._extract_urls(webpage)
if redtube_urls:
- return _playlist_from_matches(redtube_urls, ie=RedTubeIE.ie_key())
+ return self.playlist_from_matches(redtube_urls, video_id, video_title, ie=RedTubeIE.ie_key())
# Look for embedded Tvigle player
mobj = re.search(
# Look for embedded soundcloud player
soundcloud_urls = SoundcloudIE._extract_urls(webpage)
if soundcloud_urls:
- return _playlist_from_matches(soundcloud_urls, getter=unescapeHTML, ie=SoundcloudIE.ie_key())
+ return self.playlist_from_matches(soundcloud_urls, video_id, video_title, getter=unescapeHTML, ie=SoundcloudIE.ie_key())
# Look for tunein player
tunein_urls = TuneInBaseIE._extract_urls(webpage)
if tunein_urls:
- return _playlist_from_matches(tunein_urls)
+ return self.playlist_from_matches(tunein_urls, video_id, video_title)
# Look for embedded mtvservices player
mtvservices_url = MTVServicesEmbeddedIE._extract_url(webpage)
return self.url_result(piksel_url, PikselIE.ie_key())
# Look for Limelight embeds
+ limelight_urls = LimelightBaseIE._extract_urls(webpage, url)
+ if limelight_urls:
+ return self.playlist_result(
+ limelight_urls, video_id, video_title, video_description)
+
mobj = re.search(r'LimelightPlayer\.doLoad(Media|Channel|ChannelList)\(["\'](?P<id>[a-z0-9]{32})', webpage)
if mobj:
lm = {
'Channel': 'channel',
'ChannelList': 'channel_list',
}
- return self.url_result('limelight:%s:%s' % (
- lm[mobj.group(1)], mobj.group(2)), 'Limelight%s' % mobj.group(1), mobj.group(2))
+ return self.url_result(smuggle_url('limelight:%s:%s' % (
+ lm[mobj.group(1)], mobj.group(2)), {'source_url': url}),
+ 'Limelight%s' % mobj.group(1), mobj.group(2))
mobj = re.search(
r'''(?sx)
value=(["\'])(?:(?!\3).)*mediaId=(?P<id>[a-z0-9]{32})
''', webpage)
if mobj:
- return self.url_result('limelight:media:%s' % mobj.group('id'))
+ return self.url_result(smuggle_url(
+ 'limelight:media:%s' % mobj.group('id'),
+ {'source_url': url}), 'LimelightMedia', mobj.group('id'))
# Look for AdobeTVVideo embeds
mobj = re.search(
# Look for DBTV embeds
dbtv_urls = DBTVIE._extract_urls(webpage)
if dbtv_urls:
- return _playlist_from_matches(dbtv_urls, ie=DBTVIE.ie_key())
+ return self.playlist_from_matches(dbtv_urls, video_id, video_title, ie=DBTVIE.ie_key())
# Look for Videa embeds
videa_urls = VideaIE._extract_urls(webpage)
if videa_urls:
- return _playlist_from_matches(videa_urls, ie=VideaIE.ie_key())
+ return self.playlist_from_matches(videa_urls, video_id, video_title, ie=VideaIE.ie_key())
# Look for 20 minuten embeds
twentymin_urls = TwentyMinutenIE._extract_urls(webpage)
if twentymin_urls:
- return _playlist_from_matches(
- twentymin_urls, ie=TwentyMinutenIE.ie_key())
+ return self.playlist_from_matches(
+ twentymin_urls, video_id, video_title, ie=TwentyMinutenIE.ie_key())
# Look for Openload embeds
openload_urls = OpenloadIE._extract_urls(webpage)
if openload_urls:
- return _playlist_from_matches(
- openload_urls, ie=OpenloadIE.ie_key())
+ return self.playlist_from_matches(
+ openload_urls, video_id, video_title, ie=OpenloadIE.ie_key())
+
+ # Look for VideoPress embeds
+ videopress_urls = VideoPressIE._extract_urls(webpage)
+ if videopress_urls:
+ return self.playlist_from_matches(
+ videopress_urls, video_id, video_title, ie=VideoPressIE.ie_key())
+
+ # Look for Rutube embeds
+ rutube_urls = RutubeIE._extract_urls(webpage)
+ if rutube_urls:
+ return self.playlist_from_matches(
+ rutube_urls, ie=RutubeIE.ie_key())
# Looking for http://schema.org/VideoObject
json_ld = self._search_json_ld(
self._sort_formats(entry['formats'])
return self.playlist_result(entries)
+ jwplayer_data = self._find_jwplayer_data(
+ webpage, video_id, transform_source=js_to_json)
+ if jwplayer_data:
+ info = self._parse_jwplayer_data(
+ jwplayer_data, video_id, require_title=False, base_url=url)
+ if not info.get('title'):
+ info['title'] = video_title
+ return info
+
def check_video(vurl):
if YoutubeIE.suitable(vurl):
return True
+ if RtmpIE.suitable(vurl):
+ return True
vpath = compat_urlparse.urlparse(vurl).path
vext = determine_ext(vpath)
- return '.' in vpath and vext not in ('swf', 'png', 'jpg', 'srt', 'sbv', 'sub', 'vtt', 'ttml', 'js')
+ return '.' in vpath and vext not in ('swf', 'png', 'jpg', 'srt', 'sbv', 'sub', 'vtt', 'ttml', 'js', 'xml')
def filter_video(urls):
return list(filter(check_video, urls))
found = re.search(REDIRECT_REGEX, refresh_header)
if found:
new_url = compat_urlparse.urljoin(url, unescapeHTML(found.group(1)))
- self.report_following_redirect(new_url)
- return {
- '_type': 'url',
- 'url': new_url,
- }
+ if new_url != url:
+ self.report_following_redirect(new_url)
+ return {
+ '_type': 'url',
+ 'url': new_url,
+ }
+ else:
+ found = None
if not found:
# twitter:player is a https URL to iframe player that may or may not
'age_limit': age_limit,
}
+ if RtmpIE.suitable(video_url):
+ entry_info_dict.update({
+ '_type': 'url_transparent',
+ 'ie_key': RtmpIE.ie_key(),
+ 'url': video_url,
+ })
+ entries.append(entry_info_dict)
+ continue
+
ext = determine_ext(video_url)
if ext == 'smil':
entry_info_dict['formats'] = self._extract_smil_formats(video_url, video_id)