youtube_dl/extractor/channel9.py

   1 from __future__ import unicode_literals
   2
   3 import re
   4
   5 from .common import InfoExtractor
   6 from ..utils import ExtractorError
   7
   8 class Channel9IE(InfoExtractor):
   9     '''
  10     Common extractor for channel9.msdn.com.
  11
    The type of the provided URL (video or playlist) is determined from the
    Search.PageType meta tag in the web page HTML rather than from the URL
    itself, since the URL alone does not always reveal the page type.
  15     '''
  16     IE_DESC = 'Channel 9'
  17     IE_NAME = 'channel9'
  18     _VALID_URL = r'https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+)/?'
  19
  20     _TESTS = [
  21         {
  22             'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002',
  23             'md5': 'bbd75296ba47916b754e73c3a4bbdf10',
  24             'info_dict': {
  25                 'id': 'Events/TechEd/Australia/2013/KOS002',
  26                 'ext': 'mp4',
  27                 'title': 'Developer Kick-Off Session: Stuff We Love',
  28                 'description': 'md5:c08d72240b7c87fcecafe2692f80e35f',
  29                 'duration': 4576,
  30                 'thumbnail': 'http://media.ch9.ms/ch9/9d51/03902f2d-fc97-4d3c-b195-0bfe15a19d51/KOS002_220.jpg',
  31                 'session_code': 'KOS002',
  32                 'session_day': 'Day 1',
  33                 'session_room': 'Arena 1A',
  34                 'session_speakers': [ 'Ed Blankenship', 'Andrew Coates', 'Brady Gaster', 'Patrick Klug', 'Mads Kristensen' ],
  35             },
  36         },
  37         {
  38             'url': 'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing',
  39             'md5': 'b43ee4529d111bc37ba7ee4f34813e68',
  40             'info_dict': {
  41                 'id': 'posts/Self-service-BI-with-Power-BI-nuclear-testing',
  42                 'ext': 'mp4',
  43                 'title': 'Self-service BI with Power BI - nuclear testing',
  44                 'description': 'md5:d1e6ecaafa7fb52a2cacdf9599829f5b',
  45                 'duration': 1540,
  46                 'thumbnail': 'http://media.ch9.ms/ch9/87e1/0300391f-a455-4c72-bec3-4422f19287e1/selfservicenuk_512.jpg',
  47                 'authors': [ 'Mike Wilmot' ],
  48             },
  49         }
  50     ]
  51
  52     _RSS_URL = 'http://channel9.msdn.com/%s/RSS'
  53
  54     # Sorted by quality
  55     _known_formats = ['MP3', 'MP4', 'Mid Quality WMV', 'Mid Quality MP4', 'High Quality WMV', 'High Quality MP4']
  56
  57     def _restore_bytes(self, formatted_size):
  58         if not formatted_size:
  59             return 0
  60         m = re.match(r'^(?P<size>\d+(?:\.\d+)?)\s+(?P<units>[a-zA-Z]+)', formatted_size)
  61         if not m:
  62             return 0
  63         units = m.group('units')
  64         try:
  65             exponent = ['B', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB'].index(units.upper())
  66         except ValueError:
  67             return 0
  68         size = float(m.group('size'))
  69         return int(size * (1024 ** exponent))
  70
  71     def _formats_from_html(self, html):
  72         FORMAT_REGEX = r'''
  73             (?x)
  74             <a\s+href="(?P<url>[^"]+)">(?P<quality>[^<]+)</a>\s*
  75             <span\s+class="usage">\((?P<note>[^\)]+)\)</span>\s*
  76             (?:<div\s+class="popup\s+rounded">\s*
  77             <h3>File\s+size</h3>\s*(?P<filesize>.*?)\s*
  78             </div>)?                                                # File size part may be missing
  79         '''
  80         # Extract known formats
  81         formats = [{
  82             'url': x.group('url'),
  83             'format_id': x.group('quality'),
  84             'format_note': x.group('note'),
  85             'format': '%s (%s)' % (x.group('quality'), x.group('note')),
  86             'filesize': self._restore_bytes(x.group('filesize')), # File size is approximate
  87             'preference': self._known_formats.index(x.group('quality')),
  88             'vcodec': 'none' if x.group('note') == 'Audio only' else None,
  89         } for x in list(re.finditer(FORMAT_REGEX, html)) if x.group('quality') in self._known_formats]
  90
  91         self._sort_formats(formats)
  92
  93         return formats
  94
  95     def _extract_title(self, html):
  96         title = self._html_search_meta('title', html, 'title')
  97         if title is None:
  98             title = self._og_search_title(html)
  99             TITLE_SUFFIX = ' (Channel 9)'
 100             if title is not None and title.endswith(TITLE_SUFFIX):
 101                 title = title[:-len(TITLE_SUFFIX)]
 102         return title
 103
 104     def _extract_description(self, html):
 105         DESCRIPTION_REGEX = r'''(?sx)
 106             <div\s+class="entry-content">\s*
 107             <div\s+id="entry-body">\s*
 108             (?P<description>.+?)\s*
 109             </div>\s*
 110             </div>
 111         '''
 112         m = re.search(DESCRIPTION_REGEX, html)
 113         if m is not None:
 114             return m.group('description')
 115         return self._html_search_meta('description', html, 'description')
 116
 117     def _extract_duration(self, html):
 118         m = re.search(r'data-video_duration="(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})"', html)
 119         return ((int(m.group('hours')) * 60 * 60) + (int(m.group('minutes')) * 60) + int(m.group('seconds'))) if m else None
 120
 121     def _extract_slides(self, html):
 122         m = re.search(r'<a href="(?P<slidesurl>[^"]+)" class="slides">Slides</a>', html)
 123         return m.group('slidesurl') if m is not None else None
 124
 125     def _extract_zip(self, html):
 126         m = re.search(r'<a href="(?P<zipurl>[^"]+)" class="zip">Zip</a>', html)
 127         return m.group('zipurl') if m is not None else None
 128
 129     def _extract_avg_rating(self, html):
 130         m = re.search(r'<p class="avg-rating">Avg Rating: <span>(?P<avgrating>[^<]+)</span></p>', html)
 131         return float(m.group('avgrating')) if m is not None else 0
 132
 133     def _extract_rating_count(self, html):
 134         m = re.search(r'<div class="rating-count">\((?P<ratingcount>[^<]+)\)</div>', html)
 135         return int(self._fix_count(m.group('ratingcount'))) if m is not None else 0
 136
 137     def _extract_view_count(self, html):
 138         m = re.search(r'<li class="views">\s*<span class="count">(?P<viewcount>[^<]+)</span> Views\s*</li>', html)
 139         return int(self._fix_count(m.group('viewcount'))) if m is not None else 0
 140
 141     def _extract_comment_count(self, html):
 142         m = re.search(r'<li class="comments">\s*<a href="#comments">\s*<span class="count">(?P<commentcount>[^<]+)</span> Comments\s*</a>\s*</li>', html)
 143         return int(self._fix_count(m.group('commentcount'))) if m is not None else 0
 144
 145     def _fix_count(self, count):
 146         return int(str(count).replace(',', '')) if count is not None else None
 147
 148     def _extract_authors(self, html):
 149         m = re.search(r'(?s)<li class="author">(.*?)</li>', html)
 150         if m is None:
 151             return None
 152         return re.findall(r'<a href="/Niners/[^"]+">([^<]+)</a>', m.group(1))
 153
 154     def _extract_session_code(self, html):
 155         m = re.search(r'<li class="code">\s*(?P<code>.+?)\s*</li>', html)
 156         return m.group('code') if m is not None else None
 157
 158     def _extract_session_day(self, html):
 159         m = re.search(r'<li class="day">\s*<a href="/Events/[^"]+">(?P<day>[^<]+)</a>\s*</li>', html)
 160         return m.group('day') if m is not None else None
 161
 162     def _extract_session_room(self, html):
 163         m = re.search(r'<li class="room">\s*(?P<room>.+?)\s*</li>', html)
 164         return m.group('room') if m is not None else None
 165
 166     def _extract_session_speakers(self, html):
 167         return re.findall(r'<a href="/Events/Speakers/[^"]+">([^<]+)</a>', html)
 168
 169     def _extract_content(self, html, content_path):
 170         # Look for downloadable content
 171         formats = self._formats_from_html(html)
 172         slides = self._extract_slides(html)
 173         zip_ = self._extract_zip(html)
 174
 175         # Nothing to download
 176         if len(formats) == 0 and slides is None and zip_ is None:
 177             self._downloader.report_warning('None of recording, slides or zip are available for %s' % content_path)
 178             return
 179
 180         # Extract meta
 181         title = self._extract_title(html)
 182         description = self._extract_description(html)
 183         thumbnail = self._og_search_thumbnail(html)
 184         duration = self._extract_duration(html)
 185         avg_rating = self._extract_avg_rating(html)
 186         rating_count = self._extract_rating_count(html)
 187         view_count = self._extract_view_count(html)
 188         comment_count = self._extract_comment_count(html)
 189
 190         common = {'_type': 'video',
 191                   'id': content_path,
 192                   'description': description,
 193                   'thumbnail': thumbnail,
 194                   'duration': duration,
 195                   'avg_rating': avg_rating,
 196                   'rating_count': rating_count,
 197                   'view_count': view_count,
 198                   'comment_count': comment_count,
 199                 }
 200
 201         result = []
 202
 203         if slides is not None:
 204             d = common.copy()
 205             d.update({ 'title': title + '-Slides', 'url': slides })
 206             result.append(d)
 207
 208         if zip_ is not None:
 209             d = common.copy()
 210             d.update({ 'title': title + '-Zip', 'url': zip_ })
 211             result.append(d)
 212
 213         if len(formats) > 0:
 214             d = common.copy()
 215             d.update({ 'title': title, 'formats': formats })
 216             result.append(d)
 217
 218         return result
 219
 220     def _extract_entry_item(self, html, content_path):
 221         contents = self._extract_content(html, content_path)
 222         if contents is None:
 223             return contents
 224
 225         authors = self._extract_authors(html)
 226
 227         for content in contents:
 228             content['authors'] = authors
 229
 230         return contents
 231
 232     def _extract_session(self, html, content_path):
 233         contents = self._extract_content(html, content_path)
 234         if contents is None:
 235             return contents
 236
 237         session_meta = {'session_code': self._extract_session_code(html),
 238                         'session_day': self._extract_session_day(html),
 239                         'session_room': self._extract_session_room(html),
 240                         'session_speakers': self._extract_session_speakers(html),
 241                         }
 242
 243         for content in contents:
 244             content.update(session_meta)
 245
 246         return contents
 247
 248     def _extract_list(self, content_path):
 249         rss = self._download_xml(self._RSS_URL % content_path, content_path, 'Downloading RSS')
 250         entries = [self.url_result(session_url.text, 'Channel9')
 251                    for session_url in rss.findall('./channel/item/link')]
 252         title_text = rss.find('./channel/title').text
 253         return self.playlist_result(entries, content_path, title_text)
 254
 255     def _real_extract(self, url):
 256         mobj = re.match(self._VALID_URL, url)
 257         content_path = mobj.group('contentpath')
 258
 259         webpage = self._download_webpage(url, content_path, 'Downloading web page')
 260
 261         page_type_m = re.search(r'<meta name="Search.PageType" content="(?P<pagetype>[^"]+)"/>', webpage)
 262         if page_type_m is None:
 263             raise ExtractorError('Search.PageType not found, don\'t know how to process this page', expected=True)
 264
 265         page_type = page_type_m.group('pagetype')
 266         if page_type == 'List':         # List page, may contain list of 'item'-like objects
 267             return self._extract_list(content_path)
 268         elif page_type == 'Entry.Item': # Any 'item'-like page, may contain downloadable content
 269             return self._extract_entry_item(webpage, content_path)
 270         elif page_type == 'Session':    # Event session page, may contain downloadable content
 271             return self._extract_session(webpage, content_path)
 272         else:
 273             raise ExtractorError('Unexpected Search.PageType %s' % page_type, expected=True)