+version <unreleased>
+
+Extractors
++ [generic] Support complex JWPlayer embedded videos (#12030)
+
+
version 2017.02.16
Core
from __future__ import unicode_literals
-from .jwplatform import JWPlatformBaseIE
+from .common import InfoExtractor
from ..utils import (
unified_strdate,
clean_html,
)
-class ArchiveOrgIE(JWPlatformBaseIE):
+class ArchiveOrgIE(InfoExtractor):
IE_NAME = 'archive.org'
IE_DESC = 'archive.org videos'
_VALID_URL = r'https?://(?:www\.)?archive\.org/(?:details|embed)/(?P<id>[^/?#]+)(?:[?].*)?$'
fix_xml_ampersands,
float_or_none,
int_or_none,
+ js_to_json,
parse_iso8601,
RegexNotFoundError,
sanitize_filename,
})
return formats
+ @staticmethod
+ def _find_jwplayer_data(webpage):
+ mobj = re.search(
+ r'jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)\.setup\s*\((?P<options>[^)]+)\)',
+ webpage)
+ if mobj:
+ return mobj.group('options')
+
+ def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
+ jwplayer_data = self._parse_json(
+ self._find_jwplayer_data(webpage), video_id,
+ transform_source=js_to_json)
+ return self._parse_jwplayer_data(
+ jwplayer_data, video_id, *args, **kwargs)
+
+ def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
+ m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
+ # JWPlayer backward compatibility: flattened playlists
+ # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
+ if 'playlist' not in jwplayer_data:
+ jwplayer_data = {'playlist': [jwplayer_data]}
+
+ entries = []
+
+ # JWPlayer backward compatibility: single playlist item
+ # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
+ if not isinstance(jwplayer_data['playlist'], list):
+ jwplayer_data['playlist'] = [jwplayer_data['playlist']]
+
+ for video_data in jwplayer_data['playlist']:
+ # JWPlayer backward compatibility: flattened sources
+ # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
+ if 'sources' not in video_data:
+ video_data['sources'] = [video_data]
+
+ this_video_id = video_id or video_data['mediaid']
+
+ formats = []
+ for source in video_data['sources']:
+ source_url = self._proto_relative_url(source['file'])
+ if base_url:
+ source_url = compat_urlparse.urljoin(base_url, source_url)
+ source_type = source.get('type') or ''
+ ext = mimetype2ext(source_type) or determine_ext(source_url)
+ if source_type == 'hls' or ext == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ source_url, this_video_id, 'mp4', 'm3u8_native', m3u8_id=m3u8_id, fatal=False))
+ elif ext == 'mpd':
+ formats.extend(self._extract_mpd_formats(
+ source_url, this_video_id, mpd_id=mpd_id, fatal=False))
+ # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
+ elif source_type.startswith('audio') or ext in ('oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
+ formats.append({
+ 'url': source_url,
+ 'vcodec': 'none',
+ 'ext': ext,
+ })
+ else:
+ height = int_or_none(source.get('height'))
+ if height is None:
+ # Often no height is provided but there is a label in
+ # format like 1080p.
+ height = int_or_none(self._search_regex(
+ r'^(\d{3,})[pP]$', source.get('label') or '',
+ 'height', default=None))
+ a_format = {
+ 'url': source_url,
+ 'width': int_or_none(source.get('width')),
+ 'height': height,
+ 'ext': ext,
+ }
+ if source_url.startswith('rtmp'):
+ a_format['ext'] = 'flv'
+
+ # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
+ # of jwplayer.flash.swf
+ rtmp_url_parts = re.split(
+ r'((?:mp4|mp3|flv):)', source_url, 1)
+ if len(rtmp_url_parts) == 3:
+ rtmp_url, prefix, play_path = rtmp_url_parts
+ a_format.update({
+ 'url': rtmp_url,
+ 'play_path': prefix + play_path,
+ })
+ if rtmp_params:
+ a_format.update(rtmp_params)
+ formats.append(a_format)
+ self._sort_formats(formats)
+
+ subtitles = {}
+ tracks = video_data.get('tracks')
+ if tracks and isinstance(tracks, list):
+ for track in tracks:
+ if track.get('kind') != 'captions':
+ continue
+ track_url = urljoin(base_url, track.get('file'))
+ if not track_url:
+ continue
+ subtitles.setdefault(track.get('label') or 'en', []).append({
+ 'url': self._proto_relative_url(track_url)
+ })
+
+ entries.append({
+ 'id': this_video_id,
+ 'title': video_data['title'] if require_title else video_data.get('title'),
+ 'description': video_data.get('description'),
+ 'thumbnail': self._proto_relative_url(video_data.get('image')),
+ 'timestamp': int_or_none(video_data.get('pubdate')),
+ 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
+ 'subtitles': subtitles,
+ 'formats': formats,
+ })
+ if len(entries) == 1:
+ return entries[0]
+ else:
+ return self.playlist_result(entries)
+
def _live_title(self, name):
""" Generate the title for a live video """
now = datetime.datetime.now()
float_or_none,
HEADRequest,
is_html,
+ js_to_json,
orderedSet,
sanitized_Request,
smuggle_url,
'skip_download': True,
}
},
+ # Complex jwplayer
+ {
+ 'url': 'http://www.indiedb.com/games/king-machine/videos',
+ 'info_dict': {
+ 'id': 'videos',
+ 'ext': 'mp4',
+ 'title': 'king machine trailer 1',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ },
+ },
# rtl.nl embed
{
'url': 'http://www.rtlnieuws.nl/nieuws/buitenland/aanslagen-kopenhagen',
self._sort_formats(entry['formats'])
return self.playlist_result(entries)
+ jwplayer_data_str = self._find_jwplayer_data(webpage)
+ if jwplayer_data_str:
+ try:
+ jwplayer_data = self._parse_json(
+ jwplayer_data_str, video_id, transform_source=js_to_json)
+ return self._parse_jwplayer_data(jwplayer_data, video_id)
+ except ExtractorError:
+ pass
+
def check_video(vurl):
if YoutubeIE.suitable(vurl):
return True
import re
from .common import InfoExtractor
-from ..compat import compat_urlparse
-from ..utils import (
- determine_ext,
- float_or_none,
- int_or_none,
- js_to_json,
- mimetype2ext,
- urljoin,
-)
-class JWPlatformBaseIE(InfoExtractor):
- @staticmethod
- def _find_jwplayer_data(webpage):
- # TODO: Merge this with JWPlayer-related codes in generic.py
-
- mobj = re.search(
- r'jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)\.setup\s*\((?P<options>[^)]+)\)',
- webpage)
- if mobj:
- return mobj.group('options')
-
- def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
- jwplayer_data = self._parse_json(
- self._find_jwplayer_data(webpage), video_id,
- transform_source=js_to_json)
- return self._parse_jwplayer_data(
- jwplayer_data, video_id, *args, **kwargs)
-
- def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
- m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
- # JWPlayer backward compatibility: flattened playlists
- # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
- if 'playlist' not in jwplayer_data:
- jwplayer_data = {'playlist': [jwplayer_data]}
-
- entries = []
-
- # JWPlayer backward compatibility: single playlist item
- # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
- if not isinstance(jwplayer_data['playlist'], list):
- jwplayer_data['playlist'] = [jwplayer_data['playlist']]
-
- for video_data in jwplayer_data['playlist']:
- # JWPlayer backward compatibility: flattened sources
- # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
- if 'sources' not in video_data:
- video_data['sources'] = [video_data]
-
- this_video_id = video_id or video_data['mediaid']
-
- formats = []
- for source in video_data['sources']:
- source_url = self._proto_relative_url(source['file'])
- if base_url:
- source_url = compat_urlparse.urljoin(base_url, source_url)
- source_type = source.get('type') or ''
- ext = mimetype2ext(source_type) or determine_ext(source_url)
- if source_type == 'hls' or ext == 'm3u8':
- formats.extend(self._extract_m3u8_formats(
- source_url, this_video_id, 'mp4', 'm3u8_native', m3u8_id=m3u8_id, fatal=False))
- elif ext == 'mpd':
- formats.extend(self._extract_mpd_formats(
- source_url, this_video_id, mpd_id=mpd_id, fatal=False))
- # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
- elif source_type.startswith('audio') or ext in ('oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
- formats.append({
- 'url': source_url,
- 'vcodec': 'none',
- 'ext': ext,
- })
- else:
- height = int_or_none(source.get('height'))
- if height is None:
- # Often no height is provided but there is a label in
- # format like 1080p.
- height = int_or_none(self._search_regex(
- r'^(\d{3,})[pP]$', source.get('label') or '',
- 'height', default=None))
- a_format = {
- 'url': source_url,
- 'width': int_or_none(source.get('width')),
- 'height': height,
- 'ext': ext,
- }
- if source_url.startswith('rtmp'):
- a_format['ext'] = 'flv'
-
- # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
- # of jwplayer.flash.swf
- rtmp_url_parts = re.split(
- r'((?:mp4|mp3|flv):)', source_url, 1)
- if len(rtmp_url_parts) == 3:
- rtmp_url, prefix, play_path = rtmp_url_parts
- a_format.update({
- 'url': rtmp_url,
- 'play_path': prefix + play_path,
- })
- if rtmp_params:
- a_format.update(rtmp_params)
- formats.append(a_format)
- self._sort_formats(formats)
-
- subtitles = {}
- tracks = video_data.get('tracks')
- if tracks and isinstance(tracks, list):
- for track in tracks:
- if track.get('kind') != 'captions':
- continue
- track_url = urljoin(base_url, track.get('file'))
- if not track_url:
- continue
- subtitles.setdefault(track.get('label') or 'en', []).append({
- 'url': self._proto_relative_url(track_url)
- })
-
- entries.append({
- 'id': this_video_id,
- 'title': video_data['title'] if require_title else video_data.get('title'),
- 'description': video_data.get('description'),
- 'thumbnail': self._proto_relative_url(video_data.get('image')),
- 'timestamp': int_or_none(video_data.get('pubdate')),
- 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
- 'subtitles': subtitles,
- 'formats': formats,
- })
- if len(entries) == 1:
- return entries[0]
- else:
- return self.playlist_result(entries)
-
-
-class JWPlatformIE(JWPlatformBaseIE):
+class JWPlatformIE(InfoExtractor):
_VALID_URL = r'(?:https?://content\.jwplatform\.com/(?:feeds|players|jw6)/|jwplatform:)(?P<id>[a-zA-Z0-9]{8})'
_TEST = {
'url': 'http://content.jwplatform.com/players/nPripu9l-ALJ3XQCI.js',
# coding: utf-8
from __future__ import unicode_literals
-from .jwplatform import JWPlatformBaseIE
+from .common import InfoExtractor
from ..utils import (
ExtractorError,
js_to_json,
)
-class OnDemandKoreaIE(JWPlatformBaseIE):
+class OnDemandKoreaIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?ondemandkorea\.com/(?P<id>[^/]+)\.html'
_TEST = {
'url': 'http://www.ondemandkorea.com/ask-us-anything-e43.html',
comment_count = self._extract_count(
r'All Comments\s*<span>\(([\d,.]+)\)', webpage, 'comment')
- """
- video_variables = {}
- for video_variablename, quote, video_variable in re.findall(
- r'(player_quality_[0-9]{3,4}p\w+)\s*=\s*(["\'])(.+?)\2;', webpage):
- video_variables[video_variablename] = video_variable
-
- video_urls = []
- for encoded_video_url in re.findall(
- r'player_quality_[0-9]{3,4}p\s*=(.+?);', webpage):
- for varname, varval in video_variables.items():
- encoded_video_url = encoded_video_url.replace(varname, varval)
- video_urls.append(re.sub(r'[\s+]', '', encoded_video_url))
-
- if webpage.find('"encrypted":true') != -1:
- password = compat_urllib_parse_unquote_plus(
- self._search_regex(r'"video_title":"([^"]+)', webpage, 'password'))
- video_urls = list(map(lambda s: aes_decrypt_text(s, password, 32).decode('utf-8'), video_urls))
-
- formats = []
- for video_url in video_urls:
- path = compat_urllib_parse_urlparse(video_url).path
- extension = os.path.splitext(path)[1][1:]
- format = path.split('/')[5].split('_')[:2]
- format = '-'.join(format)
-
- m = re.match(r'^(?P<height>[0-9]+)[pP]-(?P<tbr>[0-9]+)[kK]$', format)
- if m is None:
- height = None
- tbr = None
- else:
- height = int(m.group('height'))
- tbr = int(m.group('tbr'))
-
- formats.append({
- 'url': video_url,
- 'ext': extension,
- 'format': format,
- 'format_id': format,
- 'tbr': tbr,
- 'height': height,
- })
- self._sort_formats(formats)
- """
-
page_params = self._parse_json(self._search_regex(
r'page_params\.zoneDetails\[([\'"])[^\'"]+\1\]\s*=\s*(?P<data>{[^}]+})',
webpage, 'page parameters', group='data', default='{}'),
import re
-from .jwplatform import JWPlatformBaseIE
+from .common import InfoExtractor
from ..utils import (
str_to_int,
)
-class PornoXOIE(JWPlatformBaseIE):
+class PornoXOIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?pornoxo\.com/videos/(?P<id>\d+)/(?P<display_id>[^/]+)\.html'
_TEST = {
'url': 'http://www.pornoxo.com/videos/7564/striptease-from-sexy-secretary.html',
from __future__ import unicode_literals
from .common import InfoExtractor
-from .jwplatform import JWPlatformBaseIE
from ..compat import compat_str
-class RENTVIE(JWPlatformBaseIE):
+class RENTVIE(InfoExtractor):
_VALID_URL = r'(?:rentv:|https?://(?:www\.)?ren\.tv/(?:player|video/epizod)/)(?P<id>\d+)'
_TESTS = [{
'url': 'http://ren.tv/video/epizod/118577',
import re
-from .jwplatform import JWPlatformBaseIE
+from .common import InfoExtractor
from ..utils import (
js_to_json,
get_element_by_class,
)
-class RudoIE(JWPlatformBaseIE):
+class RudoIE(InfoExtractor):
_VALID_URL = r'https?://rudo\.video/vod/(?P<id>[0-9a-zA-Z]+)'
_TEST = {
# coding: utf-8
from __future__ import unicode_literals
-from .jwplatform import JWPlatformBaseIE
+from .common import InfoExtractor
from ..utils import js_to_json
-class ScreencastOMaticIE(JWPlatformBaseIE):
+class ScreencastOMaticIE(InfoExtractor):
_VALID_URL = r'https?://screencast-o-matic\.com/watch/(?P<id>[0-9a-zA-Z]+)'
_TEST = {
'url': 'http://screencast-o-matic.com/watch/c2lD3BeOPl',
import re
-from .jwplatform import JWPlatformBaseIE
+from .common import InfoExtractor
from ..utils import (
float_or_none,
parse_iso8601,
)
-class SendtoNewsIE(JWPlatformBaseIE):
+class SendtoNewsIE(InfoExtractor):
_VALID_URL = r'https?://embed\.sendtonews\.com/player2/embedplayer\.php\?.*\bSC=(?P<id>[0-9A-Za-z-]+)'
_TEST = {
import re
-from .jwplatform import JWPlatformBaseIE
+from .common import InfoExtractor
from ..utils import remove_end
-class ThisAVIE(JWPlatformBaseIE):
+class ThisAVIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?thisav\.com/video/(?P<id>[0-9]+)/.*'
_TESTS = [{
'url': 'http://www.thisav.com/video/47734/%98%26sup1%3B%83%9E%83%82---just-fit.html',
# coding: utf-8
from __future__ import unicode_literals
-from .jwplatform import JWPlatformBaseIE
+from .common import InfoExtractor
from ..utils import (
clean_html,
get_element_by_class,
)
-class TVNoeIE(JWPlatformBaseIE):
+class TVNoeIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?tvnoe\.cz/video/(?P<id>[0-9]+)'
_TEST = {
'url': 'http://www.tvnoe.cz/video/10362',
import re
-from .jwplatform import JWPlatformBaseIE
+from .common import InfoExtractor
from ..utils import (
decode_packed_codes,
js_to_json,
)
-class VidziIE(JWPlatformBaseIE):
+class VidziIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?vidzi\.tv/(?:embed-)?(?P<id>[0-9a-zA-Z]+)'
_TESTS = [{
'url': 'http://vidzi.tv/cghql9yq6emu.html',
from __future__ import unicode_literals
+from .common import InfoExtractor
from .youtube import YoutubeIE
-from .jwplatform import JWPlatformBaseIE
-class WimpIE(JWPlatformBaseIE):
+class WimpIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?wimp\.com/(?P<id>[^/]+)'
_TESTS = [{
'url': 'http://www.wimp.com/maru-is-exhausted/',