youtube_dl/extractor/channel9.py

   1 from __future__ import unicode_literals
   2
   3 import re
   4
   5 from .common import InfoExtractor
   6 from ..utils import (
   7     ExtractorError,
   8     parse_filesize,
   9     qualities,
  10 )
  11
  12
  13 class Channel9IE(InfoExtractor):
  14     '''
  15     Common extractor for channel9.msdn.com.
  16
  17     The type of provided URL (video or playlist) is determined according to
  18     meta Search.PageType from web page HTML rather than URL itself, as it is
  19     not always possible to do.
  20     '''
  21     IE_DESC = 'Channel 9'
  22     IE_NAME = 'channel9'
  23     _VALID_URL = r'https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+?)(?P<rss>/RSS)?/?(?:[?#&]|$)'
  24
  25     _TESTS = [{
  26         'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002',
  27         'md5': 'bbd75296ba47916b754e73c3a4bbdf10',
  28         'info_dict': {
  29             'id': 'Events/TechEd/Australia/2013/KOS002',
  30             'ext': 'mp4',
  31             'title': 'Developer Kick-Off Session: Stuff We Love',
  32             'description': 'md5:c08d72240b7c87fcecafe2692f80e35f',
  33             'duration': 4576,
  34             'thumbnail': 're:http://.*\.jpg',
  35             'session_code': 'KOS002',
  36             'session_day': 'Day 1',
  37             'session_room': 'Arena 1A',
  38             'session_speakers': ['Ed Blankenship', 'Andrew Coates', 'Brady Gaster', 'Patrick Klug',
  39                                  'Mads Kristensen'],
  40         },
  41     }, {
  42         'url': 'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing',
  43         'md5': 'b43ee4529d111bc37ba7ee4f34813e68',
  44         'info_dict': {
  45             'id': 'posts/Self-service-BI-with-Power-BI-nuclear-testing',
  46             'ext': 'mp4',
  47             'title': 'Self-service BI with Power BI - nuclear testing',
  48             'description': 'md5:d1e6ecaafa7fb52a2cacdf9599829f5b',
  49             'duration': 1540,
  50             'thumbnail': 're:http://.*\.jpg',
  51             'authors': ['Mike Wilmot'],
  52         },
  53     }, {
  54         # low quality mp4 is best
  55         'url': 'https://channel9.msdn.com/Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library',
  56         'info_dict': {
  57             'id': 'Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library',
  58             'ext': 'mp4',
  59             'title': 'Ranges for the Standard Library',
  60             'description': 'md5:2e6b4917677af3728c5f6d63784c4c5d',
  61             'duration': 5646,
  62             'thumbnail': 're:http://.*\.jpg',
  63         },
  64         'params': {
  65             'skip_download': True,
  66         },
  67     }, {
  68         'url': 'https://channel9.msdn.com/Niners/Splendid22/Queue/76acff796e8f411184b008028e0d492b/RSS',
  69         'info_dict': {
  70             'id': 'Niners/Splendid22/Queue/76acff796e8f411184b008028e0d492b',
  71             'title': 'Channel 9',
  72         },
  73         'playlist_count': 2,
  74     }, {
  75         'url': 'https://channel9.msdn.com/Events/DEVintersection/DEVintersection-2016/RSS',
  76         'only_matching': True,
  77     }, {
  78         'url': 'https://channel9.msdn.com/Events/Speakers/scott-hanselman/RSS?UrlSafeName=scott-hanselman',
  79         'only_matching': True,
  80     }]
  81
  82     _RSS_URL = 'http://channel9.msdn.com/%s/RSS'
  83
  84     def _formats_from_html(self, html):
  85         FORMAT_REGEX = r'''
  86             (?x)
  87             <a\s+href="(?P<url>[^"]+)">(?P<quality>[^<]+)</a>\s*
  88             <span\s+class="usage">\((?P<note>[^\)]+)\)</span>\s*
  89             (?:<div\s+class="popup\s+rounded">\s*
  90             <h3>File\s+size</h3>\s*(?P<filesize>.*?)\s*
  91             </div>)?                                                # File size part may be missing
  92         '''
  93         quality = qualities((
  94             'MP3', 'MP4',
  95             'Low Quality WMV', 'Low Quality MP4',
  96             'Mid Quality WMV', 'Mid Quality MP4',
  97             'High Quality WMV', 'High Quality MP4'))
  98         formats = [{
  99             'url': x.group('url'),
 100             'format_id': x.group('quality'),
 101             'format_note': x.group('note'),
 102             'format': '%s (%s)' % (x.group('quality'), x.group('note')),
 103             'filesize_approx': parse_filesize(x.group('filesize')),
 104             'quality': quality(x.group('quality')),
 105             'vcodec': 'none' if x.group('note') == 'Audio only' else None,
 106         } for x in list(re.finditer(FORMAT_REGEX, html))]
 107
 108         self._sort_formats(formats)
 109
 110         return formats
 111
 112     def _extract_title(self, html):
 113         title = self._html_search_meta('title', html, 'title')
 114         if title is None:
 115             title = self._og_search_title(html)
 116             TITLE_SUFFIX = ' (Channel 9)'
 117             if title is not None and title.endswith(TITLE_SUFFIX):
 118                 title = title[:-len(TITLE_SUFFIX)]
 119         return title
 120
 121     def _extract_description(self, html):
 122         DESCRIPTION_REGEX = r'''(?sx)
 123             <div\s+class="entry-content">\s*
 124             <div\s+id="entry-body">\s*
 125             (?P<description>.+?)\s*
 126             </div>\s*
 127             </div>
 128         '''
 129         m = re.search(DESCRIPTION_REGEX, html)
 130         if m is not None:
 131             return m.group('description')
 132         return self._html_search_meta('description', html, 'description')
 133
 134     def _extract_duration(self, html):
 135         m = re.search(r'"length": *"(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})"', html)
 136         return ((int(m.group('hours')) * 60 * 60) + (int(m.group('minutes')) * 60) + int(m.group('seconds'))) if m else None
 137
 138     def _extract_slides(self, html):
 139         m = re.search(r'<a href="(?P<slidesurl>[^"]+)" class="slides">Slides</a>', html)
 140         return m.group('slidesurl') if m is not None else None
 141
 142     def _extract_zip(self, html):
 143         m = re.search(r'<a href="(?P<zipurl>[^"]+)" class="zip">Zip</a>', html)
 144         return m.group('zipurl') if m is not None else None
 145
 146     def _extract_avg_rating(self, html):
 147         m = re.search(r'<p class="avg-rating">Avg Rating: <span>(?P<avgrating>[^<]+)</span></p>', html)
 148         return float(m.group('avgrating')) if m is not None else 0
 149
 150     def _extract_rating_count(self, html):
 151         m = re.search(r'<div class="rating-count">\((?P<ratingcount>[^<]+)\)</div>', html)
 152         return int(self._fix_count(m.group('ratingcount'))) if m is not None else 0
 153
 154     def _extract_view_count(self, html):
 155         m = re.search(r'<li class="views">\s*<span class="count">(?P<viewcount>[^<]+)</span> Views\s*</li>', html)
 156         return int(self._fix_count(m.group('viewcount'))) if m is not None else 0
 157
 158     def _extract_comment_count(self, html):
 159         m = re.search(r'<li class="comments">\s*<a href="#comments">\s*<span class="count">(?P<commentcount>[^<]+)</span> Comments\s*</a>\s*</li>', html)
 160         return int(self._fix_count(m.group('commentcount'))) if m is not None else 0
 161
 162     def _fix_count(self, count):
 163         return int(str(count).replace(',', '')) if count is not None else None
 164
 165     def _extract_authors(self, html):
 166         m = re.search(r'(?s)<li class="author">(.*?)</li>', html)
 167         if m is None:
 168             return None
 169         return re.findall(r'<a href="/Niners/[^"]+">([^<]+)</a>', m.group(1))
 170
 171     def _extract_session_code(self, html):
 172         m = re.search(r'<li class="code">\s*(?P<code>.+?)\s*</li>', html)
 173         return m.group('code') if m is not None else None
 174
 175     def _extract_session_day(self, html):
 176         m = re.search(r'<li class="day">\s*<a href="/Events/[^"]+">(?P<day>[^<]+)</a>\s*</li>', html)
 177         return m.group('day').strip() if m is not None else None
 178
 179     def _extract_session_room(self, html):
 180         m = re.search(r'<li class="room">\s*(?P<room>.+?)\s*</li>', html)
 181         return m.group('room') if m is not None else None
 182
 183     def _extract_session_speakers(self, html):
 184         return re.findall(r'<a href="/Events/Speakers/[^"]+">([^<]+)</a>', html)
 185
 186     def _extract_content(self, html, content_path):
 187         # Look for downloadable content
 188         formats = self._formats_from_html(html)
 189         slides = self._extract_slides(html)
 190         zip_ = self._extract_zip(html)
 191
 192         # Nothing to download
 193         if len(formats) == 0 and slides is None and zip_ is None:
 194             self._downloader.report_warning('None of recording, slides or zip are available for %s' % content_path)
 195             return
 196
 197         # Extract meta
 198         title = self._extract_title(html)
 199         description = self._extract_description(html)
 200         thumbnail = self._og_search_thumbnail(html)
 201         duration = self._extract_duration(html)
 202         avg_rating = self._extract_avg_rating(html)
 203         rating_count = self._extract_rating_count(html)
 204         view_count = self._extract_view_count(html)
 205         comment_count = self._extract_comment_count(html)
 206
 207         common = {
 208             '_type': 'video',
 209             'id': content_path,
 210             'description': description,
 211             'thumbnail': thumbnail,
 212             'duration': duration,
 213             'avg_rating': avg_rating,
 214             'rating_count': rating_count,
 215             'view_count': view_count,
 216             'comment_count': comment_count,
 217         }
 218
 219         result = []
 220
 221         if slides is not None:
 222             d = common.copy()
 223             d.update({'title': title + '-Slides', 'url': slides})
 224             result.append(d)
 225
 226         if zip_ is not None:
 227             d = common.copy()
 228             d.update({'title': title + '-Zip', 'url': zip_})
 229             result.append(d)
 230
 231         if len(formats) > 0:
 232             d = common.copy()
 233             d.update({'title': title, 'formats': formats})
 234             result.append(d)
 235
 236         return result
 237
 238     def _extract_entry_item(self, html, content_path):
 239         contents = self._extract_content(html, content_path)
 240         if contents is None:
 241             return contents
 242
 243         if len(contents) > 1:
 244             raise ExtractorError('Got more than one entry')
 245         result = contents[0]
 246         result['authors'] = self._extract_authors(html)
 247
 248         return result
 249
 250     def _extract_session(self, html, content_path):
 251         contents = self._extract_content(html, content_path)
 252         if contents is None:
 253             return contents
 254
 255         session_meta = {
 256             'session_code': self._extract_session_code(html),
 257             'session_day': self._extract_session_day(html),
 258             'session_room': self._extract_session_room(html),
 259             'session_speakers': self._extract_session_speakers(html),
 260         }
 261
 262         for content in contents:
 263             content.update(session_meta)
 264
 265         return self.playlist_result(contents)
 266
 267     def _extract_list(self, video_id, rss_url=None):
 268         if not rss_url:
 269             rss_url = self._RSS_URL % video_id
 270         rss = self._download_xml(rss_url, video_id, 'Downloading RSS')
 271         entries = [self.url_result(session_url.text, 'Channel9')
 272                    for session_url in rss.findall('./channel/item/link')]
 273         title_text = rss.find('./channel/title').text
 274         return self.playlist_result(entries, video_id, title_text)
 275
 276     def _real_extract(self, url):
 277         mobj = re.match(self._VALID_URL, url)
 278         content_path = mobj.group('contentpath')
 279         rss = mobj.group('rss')
 280
 281         if rss:
 282             return self._extract_list(content_path, url)
 283
 284         webpage = self._download_webpage(
 285             url, content_path, 'Downloading web page')
 286
 287         page_type = self._search_regex(
 288             r'<meta[^>]+name=(["\'])WT\.entryid\1[^>]+content=(["\'])(?P<pagetype>[^:]+).+?\2',
 289             webpage, 'page type', default=None, group='pagetype')
 290         if page_type:
 291             if page_type == 'Entry':      # Any 'item'-like page, may contain downloadable content
 292                 return self._extract_entry_item(webpage, content_path)
 293             elif page_type == 'Session':  # Event session page, may contain downloadable content
 294                 return self._extract_session(webpage, content_path)
 295             elif page_type == 'Event':
 296                 return self._extract_list(content_path)
 297             else:
 298                 raise ExtractorError('Unexpected WT.entryid %s' % page_type, expected=True)
 299         else:  # Assuming list
 300             return self._extract_list(content_path)