youtube_dl/extractor/camdemy.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 import re
   5
   6 from .common import InfoExtractor
   7 from ..compat import (
   8     compat_urllib_parse_urlencode,
   9     compat_urlparse,
  10 )
  11 from ..utils import (
  12     clean_html,
  13     parse_duration,
  14     str_to_int,
  15     unified_strdate,
  16 )
  17
  18
  19 class CamdemyIE(InfoExtractor):
  20     _VALID_URL = r'https?://(?:www\.)?camdemy\.com/media/(?P<id>\d+)'
  21     _TESTS = [{
  22         # single file
  23         'url': 'http://www.camdemy.com/media/5181/',
  24         'md5': '5a5562b6a98b37873119102e052e311b',
  25         'info_dict': {
  26             'id': '5181',
  27             'ext': 'mp4',
  28             'title': 'Ch1-1 Introduction, Signals (02-23-2012)',
  29             'thumbnail': r're:^https?://.*\.jpg$',
  30             'creator': 'ss11spring',
  31             'duration': 1591,
  32             'upload_date': '20130114',
  33             'view_count': int,
  34         }
  35     }, {
  36         # With non-empty description
  37         # webpage returns "No permission or not login"
  38         'url': 'http://www.camdemy.com/media/13885',
  39         'md5': '4576a3bb2581f86c61044822adbd1249',
  40         'info_dict': {
  41             'id': '13885',
  42             'ext': 'mp4',
  43             'title': 'EverCam + Camdemy QuickStart',
  44             'thumbnail': r're:^https?://.*\.jpg$',
  45             'description': 'md5:2a9f989c2b153a2342acee579c6e7db6',
  46             'creator': 'evercam',
  47             'duration': 318,
  48         }
  49     }, {
  50         # External source (YouTube)
  51         'url': 'http://www.camdemy.com/media/14842',
  52         'info_dict': {
  53             'id': '2vsYQzNIsJo',
  54             'ext': 'mp4',
  55             'title': 'Excel 2013 Tutorial - How to add Password Protection',
  56             'description': 'Excel 2013 Tutorial for Beginners - How to add Password Protection',
  57             'upload_date': '20130211',
  58             'uploader': 'Hun Kim',
  59             'uploader_id': 'hunkimtutorials',
  60         },
  61         'params': {
  62             'skip_download': True,
  63         },
  64     }]
  65
  66     def _real_extract(self, url):
  67         video_id = self._match_id(url)
  68
  69         webpage = self._download_webpage(url, video_id)
  70
  71         src_from = self._html_search_regex(
  72             r"class=['\"]srcFrom['\"][^>]*>Sources?(?:\s+from)?\s*:\s*<a[^>]+(?:href|title)=(['\"])(?P<url>(?:(?!\1).)+)\1",
  73             webpage, 'external source', default=None, group='url')
  74         if src_from:
  75             return self.url_result(src_from)
  76
  77         oembed_obj = self._download_json(
  78             'http://www.camdemy.com/oembed/?format=json&url=' + url, video_id)
  79
  80         title = oembed_obj['title']
  81         thumb_url = oembed_obj['thumbnail_url']
  82         video_folder = compat_urlparse.urljoin(thumb_url, 'video/')
  83         file_list_doc = self._download_xml(
  84             compat_urlparse.urljoin(video_folder, 'fileList.xml'),
  85             video_id, 'Downloading filelist XML')
  86         file_name = file_list_doc.find('./video/item/fileName').text
  87         video_url = compat_urlparse.urljoin(video_folder, file_name)
  88
  89         # Some URLs return "No permission or not login" in a webpage despite being
  90         # freely available via oembed JSON URL (e.g. http://www.camdemy.com/media/13885)
  91         upload_date = unified_strdate(self._search_regex(
  92             r'>published on ([^<]+)<', webpage,
  93             'upload date', default=None))
  94         view_count = str_to_int(self._search_regex(
  95             r'role=["\']viewCnt["\'][^>]*>([\d,.]+) views',
  96             webpage, 'view count', default=None))
  97         description = self._html_search_meta(
  98             'description', webpage, default=None) or clean_html(
  99             oembed_obj.get('description'))
 100
 101         return {
 102             'id': video_id,
 103             'url': video_url,
 104             'title': title,
 105             'thumbnail': thumb_url,
 106             'description': description,
 107             'creator': oembed_obj.get('author_name'),
 108             'duration': parse_duration(oembed_obj.get('duration')),
 109             'upload_date': upload_date,
 110             'view_count': view_count,
 111         }
 112
 113
 114 class CamdemyFolderIE(InfoExtractor):
 115     _VALID_URL = r'https?://(?:www\.)?camdemy\.com/folder/(?P<id>\d+)'
 116     _TESTS = [{
 117         # links with trailing slash
 118         'url': 'http://www.camdemy.com/folder/450',
 119         'info_dict': {
 120             'id': '450',
 121             'title': '信號與系統 2012 & 2011 (Signals and Systems)',
 122         },
 123         'playlist_mincount': 145
 124     }, {
 125         # links without trailing slash
 126         # and multi-page
 127         'url': 'http://www.camdemy.com/folder/853',
 128         'info_dict': {
 129             'id': '853',
 130             'title': '科學計算 - 使用 Matlab'
 131         },
 132         'playlist_mincount': 20
 133     }, {
 134         # with displayMode parameter. For testing the codes to add parameters
 135         'url': 'http://www.camdemy.com/folder/853/?displayMode=defaultOrderByOrg',
 136         'info_dict': {
 137             'id': '853',
 138             'title': '科學計算 - 使用 Matlab'
 139         },
 140         'playlist_mincount': 20
 141     }]
 142
 143     def _real_extract(self, url):
 144         folder_id = self._match_id(url)
 145
 146         # Add displayMode=list so that all links are displayed in a single page
 147         parsed_url = list(compat_urlparse.urlparse(url))
 148         query = dict(compat_urlparse.parse_qsl(parsed_url[4]))
 149         query.update({'displayMode': 'list'})
 150         parsed_url[4] = compat_urllib_parse_urlencode(query)
 151         final_url = compat_urlparse.urlunparse(parsed_url)
 152
 153         page = self._download_webpage(final_url, folder_id)
 154         matches = re.findall(r"href='(/media/\d+/?)'", page)
 155
 156         entries = [self.url_result('http://www.camdemy.com' + media_path)
 157                    for media_path in matches]
 158
 159         folder_title = self._html_search_meta('keywords', page)
 160
 161         return self.playlist_result(entries, folder_id, folder_title)