youtube_dl/extractor/raywenderlich.py

   1 from __future__ import unicode_literals
   2
   3 import re
   4
   5 from .common import InfoExtractor
   6 from .vimeo import VimeoIE
   7 from ..compat import compat_str
   8 from ..utils import (
   9     ExtractorError,
  10     int_or_none,
  11     merge_dicts,
  12     try_get,
  13     unescapeHTML,
  14     unified_timestamp,
  15     urljoin,
  16 )
  17
  18
  19 class RayWenderlichIE(InfoExtractor):
  20     _VALID_URL = r'''(?x)
  21                     https?://
  22                         (?:
  23                             videos\.raywenderlich\.com/courses|
  24                             (?:www\.)?raywenderlich\.com
  25                         )/
  26                         (?P<course_id>[^/]+)/lessons/(?P<id>\d+)
  27                     '''
  28
  29     _TESTS = [{
  30         'url': 'https://www.raywenderlich.com/3530-testing-in-ios/lessons/1',
  31         'info_dict': {
  32             'id': '248377018',
  33             'ext': 'mp4',
  34             'title': 'Introduction',
  35             'description': 'md5:804d031b3efa9fcb49777d512d74f722',
  36             'timestamp': 1513906277,
  37             'upload_date': '20171222',
  38             'duration': 133,
  39             'uploader': 'Ray Wenderlich',
  40             'uploader_id': 'user3304672',
  41         },
  42         'params': {
  43             'noplaylist': True,
  44             'skip_download': True,
  45         },
  46         'add_ie': [VimeoIE.ie_key()],
  47         'expected_warnings': ['HTTP Error 403: Forbidden'],
  48     }, {
  49         'url': 'https://videos.raywenderlich.com/courses/105-testing-in-ios/lessons/1',
  50         'only_matching': True,
  51     }]
  52
  53     @staticmethod
  54     def _extract_video_id(data, lesson_id):
  55         if not data:
  56             return
  57         groups = try_get(data, lambda x: x['groups'], list) or []
  58         if not groups:
  59             return
  60         for group in groups:
  61             if not isinstance(group, dict):
  62                 continue
  63             contents = try_get(data, lambda x: x['contents'], list) or []
  64             for content in contents:
  65                 if not isinstance(content, dict):
  66                     continue
  67                 ordinal = int_or_none(content.get('ordinal'))
  68                 if ordinal != lesson_id:
  69                     continue
  70                 video_id = content.get('identifier')
  71                 if video_id:
  72                     return compat_str(video_id)
  73
  74     def _real_extract(self, url):
  75         mobj = re.match(self._VALID_URL, url)
  76         course_id, lesson_id = mobj.group('course_id', 'id')
  77         display_id = '%s/%s' % (course_id, lesson_id)
  78
  79         webpage = self._download_webpage(url, display_id)
  80
  81         thumbnail = self._og_search_thumbnail(
  82             webpage, default=None) or self._html_search_meta(
  83             'twitter:image', webpage, 'thumbnail')
  84
  85         if '>Subscribe to unlock' in webpage:
  86             raise ExtractorError(
  87                 'This content is only available for subscribers',
  88                 expected=True)
  89
  90         info = {
  91             'thumbnail': thumbnail,
  92         }
  93
  94         vimeo_id = self._search_regex(
  95             r'data-vimeo-id=["\'](\d+)', webpage, 'vimeo id', default=None)
  96
  97         if not vimeo_id:
  98             data = self._parse_json(
  99                 self._search_regex(
 100                     r'data-collection=(["\'])(?P<data>{.+?})\1', webpage,
 101                     'data collection', default='{}', group='data'),
 102                 display_id, transform_source=unescapeHTML, fatal=False)
 103             video_id = self._extract_video_id(
 104                 data, lesson_id) or self._search_regex(
 105                 r'/videos/(\d+)/', thumbnail, 'video id')
 106             headers = {
 107                 'Referer': url,
 108                 'X-Requested-With': 'XMLHttpRequest',
 109             }
 110             csrf_token = self._html_search_meta(
 111                 'csrf-token', webpage, 'csrf token', default=None)
 112             if csrf_token:
 113                 headers['X-CSRF-Token'] = csrf_token
 114             video = self._download_json(
 115                 'https://videos.raywenderlich.com/api/v1/videos/%s.json'
 116                 % video_id, display_id, headers=headers)['video']
 117             vimeo_id = video['clips'][0]['provider_id']
 118             info.update({
 119                 '_type': 'url_transparent',
 120                 'title': video.get('name'),
 121                 'description': video.get('description') or video.get(
 122                     'meta_description'),
 123                 'duration': int_or_none(video.get('duration')),
 124                 'timestamp': unified_timestamp(video.get('created_at')),
 125             })
 126
 127         return merge_dicts(info, self.url_result(
 128             VimeoIE._smuggle_referrer(
 129                 'https://player.vimeo.com/video/%s' % vimeo_id, url),
 130             ie=VimeoIE.ie_key(), video_id=vimeo_id))
 131
 132
 133 class RayWenderlichCourseIE(InfoExtractor):
 134     _VALID_URL = r'''(?x)
 135                     https?://
 136                         (?:
 137                             videos\.raywenderlich\.com/courses|
 138                             (?:www\.)?raywenderlich\.com
 139                         )/
 140                         (?P<id>[^/]+)
 141                     '''
 142
 143     _TEST = {
 144         'url': 'https://www.raywenderlich.com/3530-testing-in-ios',
 145         'info_dict': {
 146             'title': 'Testing in iOS',
 147             'id': '3530-testing-in-ios',
 148         },
 149         'params': {
 150             'noplaylist': False,
 151         },
 152         'playlist_count': 29,
 153     }
 154
 155     @classmethod
 156     def suitable(cls, url):
 157         return False if RayWenderlichIE.suitable(url) else super(
 158             RayWenderlichCourseIE, cls).suitable(url)
 159
 160     def _real_extract(self, url):
 161         course_id = self._match_id(url)
 162
 163         webpage = self._download_webpage(url, course_id)
 164
 165         entries = []
 166         lesson_urls = set()
 167         for lesson_url in re.findall(
 168                 r'<a[^>]+\bhref=["\'](/%s/lessons/\d+)' % course_id, webpage):
 169             if lesson_url in lesson_urls:
 170                 continue
 171             lesson_urls.add(lesson_url)
 172             entries.append(self.url_result(
 173                 urljoin(url, lesson_url), ie=RayWenderlichIE.ie_key()))
 174
 175         title = self._og_search_title(
 176             webpage, default=None) or self._html_search_meta(
 177             'twitter:title', webpage, 'title', default=None)
 178
 179         return self.playlist_result(entries, course_id, title)