youtube_dl/extractor/appletrailers.py

   1 from __future__ import unicode_literals
   2
   3 import re
   4 import json
   5
   6 from .common import InfoExtractor
   7 from ..compat import compat_urlparse
   8 from ..utils import (
   9     int_or_none,
  10     parse_duration,
  11     unified_strdate,
  12 )
  13
  14
  15 class AppleTrailersIE(InfoExtractor):
  16     IE_NAME = 'appletrailers'
  17     _VALID_URL = r'https?://(?:www\.|movie)?trailers\.apple\.com/(?:trailers|ca)/(?P<company>[^/]+)/(?P<movie>[^/]+)'
  18     _TESTS = [{
  19         'url': 'http://trailers.apple.com/trailers/wb/manofsteel/',
  20         'info_dict': {
  21             'id': '5111',
  22             'title': 'Man of Steel',
  23         },
  24         'playlist': [
  25             {
  26                 'md5': 'd97a8e575432dbcb81b7c3acb741f8a8',
  27                 'info_dict': {
  28                     'id': 'manofsteel-trailer4',
  29                     'ext': 'mov',
  30                     'duration': 111,
  31                     'title': 'Trailer 4',
  32                     'upload_date': '20130523',
  33                     'uploader_id': 'wb',
  34                 },
  35             },
  36             {
  37                 'md5': 'b8017b7131b721fb4e8d6f49e1df908c',
  38                 'info_dict': {
  39                     'id': 'manofsteel-trailer3',
  40                     'ext': 'mov',
  41                     'duration': 182,
  42                     'title': 'Trailer 3',
  43                     'upload_date': '20130417',
  44                     'uploader_id': 'wb',
  45                 },
  46             },
  47             {
  48                 'md5': 'd0f1e1150989b9924679b441f3404d48',
  49                 'info_dict': {
  50                     'id': 'manofsteel-trailer',
  51                     'ext': 'mov',
  52                     'duration': 148,
  53                     'title': 'Trailer',
  54                     'upload_date': '20121212',
  55                     'uploader_id': 'wb',
  56                 },
  57             },
  58             {
  59                 'md5': '5fe08795b943eb2e757fa95cb6def1cb',
  60                 'info_dict': {
  61                     'id': 'manofsteel-teaser',
  62                     'ext': 'mov',
  63                     'duration': 93,
  64                     'title': 'Teaser',
  65                     'upload_date': '20120721',
  66                     'uploader_id': 'wb',
  67                 },
  68             },
  69         ]
  70     }, {
  71         'url': 'http://trailers.apple.com/trailers/magnolia/blackthorn/',
  72         'info_dict': {
  73             'id': 'blackthorn',
  74         },
  75         'playlist_mincount': 2,
  76         'expected_warnings': ['Unable to download JSON metadata'],
  77     }, {
  78         # json data only available from http://trailers.apple.com/trailers/feeds/data/15881.json
  79         'url': 'http://trailers.apple.com/trailers/fox/kungfupanda3/',
  80         'info_dict': {
  81             'id': '15881',
  82             'title': 'Kung Fu Panda 3',
  83         },
  84         'playlist_mincount': 4,
  85     }, {
  86         'url': 'http://trailers.apple.com/ca/metropole/autrui/',
  87         'only_matching': True,
  88     }, {
  89         'url': 'http://movietrailers.apple.com/trailers/focus_features/kuboandthetwostrings/',
  90         'only_matching': True,
  91     }]
  92
  93     _JSON_RE = r'iTunes.playURL\((.*?)\);'
  94
  95     def _real_extract(self, url):
  96         mobj = re.match(self._VALID_URL, url)
  97         movie = mobj.group('movie')
  98         uploader_id = mobj.group('company')
  99
 100         webpage = self._download_webpage(url, movie)
 101         film_id = self._search_regex(r"FilmId\s*=\s*'(\d+)'", webpage, 'film id')
 102         film_data = self._download_json(
 103             'http://trailers.apple.com/trailers/feeds/data/%s.json' % film_id,
 104             film_id, fatal=False)
 105
 106         if film_data:
 107             entries = []
 108             for clip in film_data.get('clips', []):
 109                 clip_title = clip['title']
 110
 111                 formats = []
 112                 for version, version_data in clip.get('versions', {}).items():
 113                     for size, size_data in version_data.get('sizes', {}).items():
 114                         src = size_data.get('src')
 115                         if not src:
 116                             continue
 117                         formats.append({
 118                             'format_id': '%s-%s' % (version, size),
 119                             'url': re.sub(r'_(\d+p.mov)', r'_h\1', src),
 120                             'width': int_or_none(size_data.get('width')),
 121                             'height': int_or_none(size_data.get('height')),
 122                             'language': version[:2],
 123                         })
 124                 self._sort_formats(formats)
 125
 126                 entries.append({
 127                     'id': movie + '-' + re.sub(r'[^a-zA-Z0-9]', '', clip_title).lower(),
 128                     'formats': formats,
 129                     'title': clip_title,
 130                     'thumbnail': clip.get('screen') or clip.get('thumb'),
 131                     'duration': parse_duration(clip.get('runtime') or clip.get('faded')),
 132                     'upload_date': unified_strdate(clip.get('posted')),
 133                     'uploader_id': uploader_id,
 134                 })
 135
 136             page_data = film_data.get('page', {})
 137             return self.playlist_result(entries, film_id, page_data.get('movie_title'))
 138
 139         playlist_url = compat_urlparse.urljoin(url, 'includes/playlists/itunes.inc')
 140
 141         def fix_html(s):
 142             s = re.sub(r'(?s)<script[^<]*?>.*?</script>', '', s)
 143             s = re.sub(r'<img ([^<]*?)/?>', r'<img \1/>', s)
 144             # The ' in the onClick attributes are not escaped, it couldn't be parsed
 145             # like: http://trailers.apple.com/trailers/wb/gravity/
 146
 147             def _clean_json(m):
 148                 return 'iTunes.playURL(%s);' % m.group(1).replace('\'', '&#39;')
 149             s = re.sub(self._JSON_RE, _clean_json, s)
 150             s = '<html>%s</html>' % s
 151             return s
 152         doc = self._download_xml(playlist_url, movie, transform_source=fix_html)
 153
 154         playlist = []
 155         for li in doc.findall('./div/ul/li'):
 156             on_click = li.find('.//a').attrib['onClick']
 157             trailer_info_json = self._search_regex(self._JSON_RE,
 158                                                    on_click, 'trailer info')
 159             trailer_info = json.loads(trailer_info_json)
 160             first_url = trailer_info.get('url')
 161             if not first_url:
 162                 continue
 163             title = trailer_info['title']
 164             video_id = movie + '-' + re.sub(r'[^a-zA-Z0-9]', '', title).lower()
 165             thumbnail = li.find('.//img').attrib['src']
 166             upload_date = trailer_info['posted'].replace('-', '')
 167
 168             runtime = trailer_info['runtime']
 169             m = re.search(r'(?P<minutes>[0-9]+):(?P<seconds>[0-9]{1,2})', runtime)
 170             duration = None
 171             if m:
 172                 duration = 60 * int(m.group('minutes')) + int(m.group('seconds'))
 173
 174             trailer_id = first_url.split('/')[-1].rpartition('_')[0].lower()
 175             settings_json_url = compat_urlparse.urljoin(url, 'includes/settings/%s.json' % trailer_id)
 176             settings = self._download_json(settings_json_url, trailer_id, 'Downloading settings json')
 177
 178             formats = []
 179             for format in settings['metadata']['sizes']:
 180                 # The src is a file pointing to the real video file
 181                 format_url = re.sub(r'_(\d*p.mov)', r'_h\1', format['src'])
 182                 formats.append({
 183                     'url': format_url,
 184                     'format': format['type'],
 185                     'width': int_or_none(format['width']),
 186                     'height': int_or_none(format['height']),
 187                 })
 188
 189             self._sort_formats(formats)
 190
 191             playlist.append({
 192                 '_type': 'video',
 193                 'id': video_id,
 194                 'formats': formats,
 195                 'title': title,
 196                 'duration': duration,
 197                 'thumbnail': thumbnail,
 198                 'upload_date': upload_date,
 199                 'uploader_id': uploader_id,
 200                 'http_headers': {
 201                     'User-Agent': 'QuickTime compatible (youtube-dl)',
 202                 },
 203             })
 204
 205         return {
 206             '_type': 'playlist',
 207             'id': movie,
 208             'entries': playlist,
 209         }
 210
 211
 212 class AppleTrailersSectionIE(InfoExtractor):
 213     IE_NAME = 'appletrailers:section'
 214     _SECTIONS = {
 215         'justadded': {
 216             'feed_path': 'just_added',
 217             'title': 'Just Added',
 218         },
 219         'exclusive': {
 220             'feed_path': 'exclusive',
 221             'title': 'Exclusive',
 222         },
 223         'justhd': {
 224             'feed_path': 'just_hd',
 225             'title': 'Just HD',
 226         },
 227         'mostpopular': {
 228             'feed_path': 'most_pop',
 229             'title': 'Most Popular',
 230         },
 231         'moviestudios': {
 232             'feed_path': 'studios',
 233             'title': 'Movie Studios',
 234         },
 235     }
 236     _VALID_URL = r'https?://(?:www\.)?trailers\.apple\.com/#section=(?P<id>%s)' % '|'.join(_SECTIONS)
 237     _TESTS = [{
 238         'url': 'http://trailers.apple.com/#section=justadded',
 239         'info_dict': {
 240             'title': 'Just Added',
 241             'id': 'justadded',
 242         },
 243         'playlist_mincount': 80,
 244     }, {
 245         'url': 'http://trailers.apple.com/#section=exclusive',
 246         'info_dict': {
 247             'title': 'Exclusive',
 248             'id': 'exclusive',
 249         },
 250         'playlist_mincount': 80,
 251     }, {
 252         'url': 'http://trailers.apple.com/#section=justhd',
 253         'info_dict': {
 254             'title': 'Just HD',
 255             'id': 'justhd',
 256         },
 257         'playlist_mincount': 80,
 258     }, {
 259         'url': 'http://trailers.apple.com/#section=mostpopular',
 260         'info_dict': {
 261             'title': 'Most Popular',
 262             'id': 'mostpopular',
 263         },
 264         'playlist_mincount': 80,
 265     }, {
 266         'url': 'http://trailers.apple.com/#section=moviestudios',
 267         'info_dict': {
 268             'title': 'Movie Studios',
 269             'id': 'moviestudios',
 270         },
 271         'playlist_mincount': 80,
 272     }]
 273
 274     def _real_extract(self, url):
 275         section = self._match_id(url)
 276         section_data = self._download_json(
 277             'http://trailers.apple.com/trailers/home/feeds/%s.json' % self._SECTIONS[section]['feed_path'],
 278             section)
 279         entries = [
 280             self.url_result('http://trailers.apple.com' + e['location'])
 281             for e in section_data]
 282         return self.playlist_result(entries, section, self._SECTIONS[section]['title'])