from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import ExtractorError
class Channel9IE(InfoExtractor):
    """
    Common extractor for channel9.msdn.com.

    The type of provided URL (video or playlist) is determined according to
    meta Search.PageType from web page HTML rather than URL itself, as it is
    not always possible to do.
    """
    _VALID_URL = r'https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+)/?'

    _TESTS = [
        {
            'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002',
            'md5': 'bbd75296ba47916b754e73c3a4bbdf10',
            'info_dict': {
                'id': 'Events/TechEd/Australia/2013/KOS002',
                'title': 'Developer Kick-Off Session: Stuff We Love',
                'description': 'md5:c08d72240b7c87fcecafe2692f80e35f',
                'thumbnail': 'http://video.ch9.ms/ch9/9d51/03902f2d-fc97-4d3c-b195-0bfe15a19d51/KOS002_220.jpg',
                'session_code': 'KOS002',
                'session_day': 'Day 1',
                'session_room': 'Arena 1A',
                'session_speakers': ['Ed Blankenship', 'Andrew Coates', 'Brady Gaster', 'Patrick Klug', 'Mads Kristensen'],
            },
        },
        {
            'url': 'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing',
            'md5': 'b43ee4529d111bc37ba7ee4f34813e68',
            'info_dict': {
                'id': 'posts/Self-service-BI-with-Power-BI-nuclear-testing',
                'title': 'Self-service BI with Power BI - nuclear testing',
                'description': 'md5:d1e6ecaafa7fb52a2cacdf9599829f5b',
                'thumbnail': 'http://video.ch9.ms/ch9/87e1/0300391f-a455-4c72-bec3-4422f19287e1/selfservicenuk_512.jpg',
                'authors': ['Mike Wilmot'],
            },
        },
    ]

    _RSS_URL = 'http://channel9.msdn.com/%s/RSS'

    # Known download qualities, ordered worst-to-best; the index in this list
    # doubles as the format preference used for sorting.
    _known_formats = ['MP3', 'MP4', 'Mid Quality WMV', 'Mid Quality MP4', 'High Quality WMV', 'High Quality MP4']

    def _restore_bytes(self, formatted_size):
        """Convert a human-readable size such as '12.3 MB' back to an
        (approximate) byte count. Returns 0 when the input is empty or
        cannot be parsed."""
        if not formatted_size:
            return 0
        m = re.match(r'^(?P<size>\d+(?:\.\d+)?)\s+(?P<units>[a-zA-Z]+)', formatted_size)
        if not m:
            return 0
        units = m.group('units')
        try:
            exponent = ['B', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB'].index(units.upper())
        except ValueError:
            # Unrecognized unit suffix
            return 0
        size = float(m.group('size'))
        return int(size * (1024 ** exponent))

    def _formats_from_html(self, html):
        """Scrape downloadable format links from a content page and return
        them as a sorted youtube-dl formats list."""
        FORMAT_REGEX = r'''(?x)
            <a\s+href="(?P<url>[^"]+)">(?P<quality>[^<]+)</a>\s*
            <span\s+class="usage">\((?P<note>[^\)]+)\)</span>\s*
            (?:<div\s+class="popup\s+rounded">\s*
            <h3>File\s+size</h3>\s*(?P<filesize>.*?)\s*
            </div>)?  # File size part may be missing
        '''
        # Keep only the formats we know how to rank (see _known_formats)
        formats = [{
            'url': x.group('url'),
            'format_id': x.group('quality'),
            'format_note': x.group('note'),
            'format': '%s (%s)' % (x.group('quality'), x.group('note')),
            'filesize': self._restore_bytes(x.group('filesize')),  # File size is approximate
            'preference': self._known_formats.index(x.group('quality')),
            'vcodec': 'none' if x.group('note') == 'Audio only' else None,
        } for x in re.finditer(FORMAT_REGEX, html) if x.group('quality') in self._known_formats]

        self._sort_formats(formats)

        return formats

    def _extract_title(self, html):
        """Return the page title, preferring the <meta> title and falling
        back to the OpenGraph title with the ' (Channel 9)' suffix removed."""
        title = self._html_search_meta('title', html, 'title')
        if title is None:
            title = self._og_search_title(html)
            TITLE_SUFFIX = ' (Channel 9)'
            if title is not None and title.endswith(TITLE_SUFFIX):
                title = title[:-len(TITLE_SUFFIX)]
        return title

    def _extract_description(self, html):
        """Return the entry body text, falling back to the meta description."""
        DESCRIPTION_REGEX = r'''(?sx)
            <div\s+class="entry-content">\s*
            <div\s+id="entry-body">\s*
                (?P<description>.+?)\s*
            </div>\s*
            </div>'''
        m = re.search(DESCRIPTION_REGEX, html)
        if m is not None:
            return m.group('description')
        return self._html_search_meta('description', html, 'description')

    def _extract_duration(self, html):
        """Return the duration in seconds from the '"length": "HH:MM:SS"'
        JSON fragment, or None when absent."""
        m = re.search(r'"length": *"(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})"', html)
        return ((int(m.group('hours')) * 60 * 60) + (int(m.group('minutes')) * 60) + int(m.group('seconds'))) if m else None

    def _extract_slides(self, html):
        """Return the slides download URL, or None when absent."""
        m = re.search(r'<a href="(?P<slidesurl>[^"]+)" class="slides">Slides</a>', html)
        return m.group('slidesurl') if m is not None else None

    def _extract_zip(self, html):
        """Return the zip download URL, or None when absent."""
        m = re.search(r'<a href="(?P<zipurl>[^"]+)" class="zip">Zip</a>', html)
        return m.group('zipurl') if m is not None else None

    def _extract_avg_rating(self, html):
        """Return the average rating as a float, or 0 when absent."""
        m = re.search(r'<p class="avg-rating">Avg Rating: <span>(?P<avgrating>[^<]+)</span></p>', html)
        return float(m.group('avgrating')) if m is not None else 0

    def _extract_rating_count(self, html):
        """Return the number of ratings, or 0 when absent."""
        m = re.search(r'<div class="rating-count">\((?P<ratingcount>[^<]+)\)</div>', html)
        return self._fix_count(m.group('ratingcount')) if m is not None else 0

    def _extract_view_count(self, html):
        """Return the view count, or 0 when absent."""
        m = re.search(r'<li class="views">\s*<span class="count">(?P<viewcount>[^<]+)</span> Views\s*</li>', html)
        return self._fix_count(m.group('viewcount')) if m is not None else 0

    def _extract_comment_count(self, html):
        """Return the comment count, or 0 when absent."""
        m = re.search(r'<li class="comments">\s*<a href="#comments">\s*<span class="count">(?P<commentcount>[^<]+)</span> Comments\s*</a>\s*</li>', html)
        return self._fix_count(m.group('commentcount')) if m is not None else 0

    def _fix_count(self, count):
        """Parse a count like '1,234' into an int; None stays None."""
        return int(str(count).replace(',', '')) if count is not None else None

    def _extract_authors(self, html):
        """Return the list of author names from the author <li>, or None
        when the block is absent."""
        m = re.search(r'(?s)<li class="author">(.*?)</li>', html)
        if m is None:
            return None
        return re.findall(r'<a href="/Niners/[^"]+">([^<]+)</a>', m.group(1))

    def _extract_session_code(self, html):
        """Return the session code (e.g. 'KOS002'), or None when absent."""
        m = re.search(r'<li class="code">\s*(?P<code>.+?)\s*</li>', html)
        return m.group('code') if m is not None else None

    def _extract_session_day(self, html):
        """Return the session day (e.g. 'Day 1'), or None when absent."""
        m = re.search(r'<li class="day">\s*<a href="/Events/[^"]+">(?P<day>[^<]+)</a>\s*</li>', html)
        return m.group('day') if m is not None else None

    def _extract_session_room(self, html):
        """Return the session room, or None when absent."""
        m = re.search(r'<li class="room">\s*(?P<room>.+?)\s*</li>', html)
        return m.group('room') if m is not None else None

    def _extract_session_speakers(self, html):
        """Return the list of session speaker names (possibly empty)."""
        return re.findall(r'<a href="/Events/Speakers/[^"]+">([^<]+)</a>', html)

    def _extract_content(self, html, content_path):
        """Build the list of downloadable entries (slides, zip, recording)
        for a content page, sharing common metadata.

        Returns None (after reporting a warning) when nothing is downloadable.
        """
        # Look for downloadable content
        formats = self._formats_from_html(html)
        slides = self._extract_slides(html)
        zip_ = self._extract_zip(html)

        # Nothing to download
        if not formats and slides is None and zip_ is None:
            self._downloader.report_warning('None of recording, slides or zip are available for %s' % content_path)
            return None

        # Extract meta shared by all produced entries
        title = self._extract_title(html)
        description = self._extract_description(html)
        thumbnail = self._og_search_thumbnail(html)
        duration = self._extract_duration(html)
        avg_rating = self._extract_avg_rating(html)
        rating_count = self._extract_rating_count(html)
        view_count = self._extract_view_count(html)
        comment_count = self._extract_comment_count(html)

        common = {
            '_type': 'video',
            'id': content_path,
            'description': description,
            'thumbnail': thumbnail,
            'duration': duration,
            'avg_rating': avg_rating,
            'rating_count': rating_count,
            'view_count': view_count,
            'comment_count': comment_count,
        }

        result = []

        if slides is not None:
            d = common.copy()
            d.update({'title': title + '-Slides', 'url': slides})
            result.append(d)

        if zip_ is not None:
            d = common.copy()
            d.update({'title': title + '-Zip', 'url': zip_})
            result.append(d)

        if formats:
            d = common.copy()
            d.update({'title': title, 'formats': formats})
            result.append(d)

        return result

    def _extract_entry_item(self, html, content_path):
        """Extract an 'Entry' page, attaching author metadata to each entry."""
        contents = self._extract_content(html, content_path)
        if contents is None:
            return contents

        authors = self._extract_authors(html)

        for content in contents:
            content['authors'] = authors

        return contents

    def _extract_session(self, html, content_path):
        """Extract an event 'Session' page, attaching session metadata."""
        contents = self._extract_content(html, content_path)
        if contents is None:
            return contents

        session_meta = {
            'session_code': self._extract_session_code(html),
            'session_day': self._extract_session_day(html),
            'session_room': self._extract_session_room(html),
            'session_speakers': self._extract_session_speakers(html),
        }

        for content in contents:
            content.update(session_meta)

        return contents

    def _extract_list(self, content_path):
        """Extract a playlist of sessions from the RSS feed of content_path."""
        rss = self._download_xml(self._RSS_URL % content_path, content_path, 'Downloading RSS')
        entries = [self.url_result(session_url.text, 'Channel9')
                   for session_url in rss.findall('./channel/item/link')]
        title_text = rss.find('./channel/title').text
        return self.playlist_result(entries, content_path, title_text)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        content_path = mobj.group('contentpath')

        webpage = self._download_webpage(url, content_path, 'Downloading web page')

        # Page type is encoded in the WT.entryid meta; its absence means a list page
        page_type_m = re.search(r'<meta name="WT.entryid" content="(?P<pagetype>[^:]+)[^"]+"/>', webpage)
        if page_type_m is not None:
            page_type = page_type_m.group('pagetype')
            if page_type == 'Entry':  # Any 'item'-like page, may contain downloadable content
                return self._extract_entry_item(webpage, content_path)
            elif page_type == 'Session':  # Event session page, may contain downloadable content
                return self._extract_session(webpage, content_path)
            elif page_type == 'Event':
                return self._extract_list(content_path)
            else:
                raise ExtractorError('Unexpected WT.entryid %s' % page_type, expected=True)
        else:  # Assuming list
            return self._extract_list(content_path)