import re

from .common import InfoExtractor
from ..utils import ExtractorError
class Channel9IE(InfoExtractor):
    '''
    Common extractor for channel9.msdn.com.

    The type of provided URL (video or playlist) is determined according to
    meta Search.PageType from web page HTML rather than URL itself, as it is
    not always possible to do.
    '''
    IE_DESC = u'Channel 9'
    IE_NAME = u'channel9'
    _VALID_URL = r'^https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+)/?'

    _TESTS = [
        {
            u'url': u'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002',
            u'file': u'Events_TechEd_Australia_2013_KOS002.mp4',
            u'md5': u'bbd75296ba47916b754e73c3a4bbdf10',
            u'info_dict': {
                u'title': u'Developer Kick-Off Session: Stuff We Love',
                u'description': u'md5:c08d72240b7c87fcecafe2692f80e35f',
                u'duration': 4576,
                u'thumbnail': u'http://media.ch9.ms/ch9/9d51/03902f2d-fc97-4d3c-b195-0bfe15a19d51/KOS002_220.jpg',
                u'session_code': u'KOS002',
                u'session_day': u'Day 1',
                u'session_room': u'Arena 1A',
                u'session_speakers': [ u'Ed Blankenship', u'Andrew Coates', u'Brady Gaster', u'Patrick Klug', u'Mads Kristensen' ],
            },
        },
        {
            u'url': u'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing',
            u'file': u'posts_Self-service-BI-with-Power-BI-nuclear-testing.mp4',
            u'md5': u'b43ee4529d111bc37ba7ee4f34813e68',
            u'info_dict': {
                u'title': u'Self-service BI with Power BI - nuclear testing',
                u'description': u'md5:d1e6ecaafa7fb52a2cacdf9599829f5b',
                u'duration': 1540,
                u'thumbnail': u'http://media.ch9.ms/ch9/87e1/0300391f-a455-4c72-bec3-4422f19287e1/selfservicenuk_512.jpg',
                u'authors': [ u'Mike Wilmot' ],
            },
        }
    ]

    _RSS_URL = 'http://channel9.msdn.com/%s/RSS'

    # Sorted by quality (worst first); also serves as the whitelist of
    # download links recognized in the page HTML.
    _known_formats = ['MP3', 'MP4', 'Mid Quality WMV', 'Mid Quality MP4', 'High Quality WMV', 'High Quality MP4']

    def _restore_bytes(self, formatted_size):
        """Convert a human-readable size string (e.g. u'42.5 MB') back to an
        approximate byte count. Returns 0 when the input is empty or cannot
        be parsed."""
        if not formatted_size:
            return 0
        m = re.match(r'^(?P<size>\d+(?:\.\d+)?)\s+(?P<units>[a-zA-Z]+)', formatted_size)
        if not m:
            return 0
        units = m.group('units')
        try:
            exponent = [u'B', u'KB', u'MB', u'GB', u'TB', u'PB', u'EB', u'ZB', u'YB'].index(units.upper())
        except ValueError:
            return 0
        size = float(m.group('size'))
        return int(size * (1024 ** exponent))

    def _formats_from_html(self, html):
        """Scrape the download links from the page HTML and return a list of
        format dicts sorted worst-quality-first (youtube-dl convention)."""
        FORMAT_REGEX = r'''
            (?x)
            <a\s+href="(?P<url>[^"]+)">(?P<quality>[^<]+)</a>\s*
            <span\s+class="usage">\((?P<note>[^\)]+)\)</span>\s*
            (?:<div\s+class="popup\s+rounded">\s*
            <h3>File\s+size</h3>\s*(?P<filesize>.*?)\s*
            </div>)?                                                # File size part may be missing
        '''
        # Extract known formats
        formats = [{'url': x.group('url'),
                    'format_id': x.group('quality'),
                    'format_note': x.group('note'),
                    'format': '%s (%s)' % (x.group('quality'), x.group('note')),
                    'filesize': self._restore_bytes(x.group('filesize')), # File size is approximate
                    } for x in list(re.finditer(FORMAT_REGEX, html)) if x.group('quality') in self._known_formats]
        # Sort according to known formats list
        formats.sort(key=lambda fmt: self._known_formats.index(fmt['format_id']))
        return formats

    def _extract_title(self, html):
        """Extract the clean title: meta title first, OpenGraph as fallback,
        with the site-appended u' (Channel 9)' suffix stripped."""
        title = self._html_search_meta(u'title', html, u'title')
        if title is None:
            title = self._og_search_title(html)
        TITLE_SUFFIX = u' (Channel 9)'
        if title is not None and title.endswith(TITLE_SUFFIX):
            title = title[:-len(TITLE_SUFFIX)]
        return title

    def _extract_description(self, html):
        """Extract the description from the entry body, falling back to the
        meta description tag."""
        DESCRIPTION_REGEX = r'''(?sx)
            <div\s+class="entry-content">\s*
            <div\s+id="entry-body">\s*
            (?P<description>.+?)\s*
            </div>\s*
            </div>
        '''
        m = re.search(DESCRIPTION_REGEX, html)
        if m is not None:
            return m.group('description')
        return self._html_search_meta(u'description', html, u'description')

    def _extract_duration(self, html):
        """Return the duration in seconds from the HH:MM:SS data attribute,
        or None when absent."""
        m = re.search(r'data-video_duration="(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})"', html)
        return ((int(m.group('hours')) * 60 * 60) + (int(m.group('minutes')) * 60) + int(m.group('seconds'))) if m else None

    def _extract_slides(self, html):
        """Return the slides download URL, or None when absent."""
        m = re.search(r'<a href="(?P<slidesurl>[^"]+)" class="slides">Slides</a>', html)
        return m.group('slidesurl') if m is not None else None

    def _extract_zip(self, html):
        """Return the zip download URL, or None when absent."""
        m = re.search(r'<a href="(?P<zipurl>[^"]+)" class="zip">Zip</a>', html)
        return m.group('zipurl') if m is not None else None

    def _extract_avg_rating(self, html):
        """Return the average rating as a float, or 0 when absent."""
        m = re.search(r'<p class="avg-rating">Avg Rating: <span>(?P<avgrating>[^<]+)</span></p>', html)
        return float(m.group('avgrating')) if m is not None else 0

    def _extract_rating_count(self, html):
        """Return the number of ratings, or 0 when absent."""
        m = re.search(r'<div class="rating-count">\((?P<ratingcount>[^<]+)\)</div>', html)
        return int(self._fix_count(m.group('ratingcount'))) if m is not None else 0

    def _extract_view_count(self, html):
        """Return the view count, or 0 when absent."""
        m = re.search(r'<li class="views">\s*<span class="count">(?P<viewcount>[^<]+)</span> Views\s*</li>', html)
        return int(self._fix_count(m.group('viewcount'))) if m is not None else 0

    def _extract_comment_count(self, html):
        """Return the comment count, or 0 when absent."""
        m = re.search(r'<li class="comments">\s*<a href="#comments">\s*<span class="count">(?P<commentcount>[^<]+)</span> Comments\s*</a>\s*</li>', html)
        return int(self._fix_count(m.group('commentcount'))) if m is not None else 0

    def _fix_count(self, count):
        """Strip thousands separators (u'1,234' -> 1234); None passes through."""
        return int(str(count).replace(',', '')) if count is not None else None

    def _extract_authors(self, html):
        """Return the list of author names, or None when the author section
        is missing entirely."""
        m = re.search(r'(?s)<li class="author">(.*?)</li>', html)
        if m is None:
            return None
        return re.findall(r'<a href="/Niners/[^"]+">([^<]+)</a>', m.group(1))

    def _extract_session_code(self, html):
        """Return the session code (e.g. u'KOS002'), or None when absent."""
        m = re.search(r'<li class="code">\s*(?P<code>.+?)\s*</li>', html)
        return m.group('code') if m is not None else None

    def _extract_session_day(self, html):
        """Return the session day (e.g. u'Day 1'), or None when absent."""
        m = re.search(r'<li class="day">\s*<a href="/Events/[^"]+">(?P<day>[^<]+)</a>\s*</li>', html)
        return m.group('day') if m is not None else None

    def _extract_session_room(self, html):
        """Return the session room, or None when absent."""
        m = re.search(r'<li class="room">\s*(?P<room>.+?)\s*</li>', html)
        return m.group('room') if m is not None else None

    def _extract_session_speakers(self, html):
        """Return the (possibly empty) list of session speaker names."""
        return re.findall(r'<a href="/Events/Speakers/[^"]+">([^<]+)</a>', html)

    def _extract_content(self, html, content_path):
        """Build the list of info dicts (recording / slides / zip) shared by
        entry-item and session pages. Returns None when nothing is
        downloadable (after emitting a warning)."""
        # Look for downloadable content
        formats = self._formats_from_html(html)
        slides = self._extract_slides(html)
        zip_ = self._extract_zip(html)

        # Nothing to download
        if len(formats) == 0 and slides is None and zip_ is None:
            self._downloader.report_warning(u'None of recording, slides or zip are available for %s' % content_path)
            return

        # Extract meta
        title = self._extract_title(html)
        description = self._extract_description(html)
        thumbnail = self._og_search_thumbnail(html)
        duration = self._extract_duration(html)
        avg_rating = self._extract_avg_rating(html)
        rating_count = self._extract_rating_count(html)
        view_count = self._extract_view_count(html)
        comment_count = self._extract_comment_count(html)

        common = {'_type': 'video',
                  'id': content_path,
                  'description': description,
                  'thumbnail': thumbnail,
                  'duration': duration,
                  'avg_rating': avg_rating,
                  'rating_count': rating_count,
                  'view_count': view_count,
                  'comment_count': comment_count,
                  }

        result = []

        if slides is not None:
            d = common.copy()
            d.update({ 'title': title + '-Slides', 'url': slides })
            result.append(d)

        if zip_ is not None:
            d = common.copy()
            d.update({ 'title': title + '-Zip', 'url': zip_ })
            result.append(d)

        if len(formats) > 0:
            d = common.copy()
            d.update({ 'title': title, 'formats': formats })
            result.append(d)

        return result

    def _extract_entry_item(self, html, content_path):
        """Extract an 'Entry.Item' page: common content plus author list."""
        contents = self._extract_content(html, content_path)
        if contents is None:
            return contents

        authors = self._extract_authors(html)

        for content in contents:
            content['authors'] = authors

        return contents

    def _extract_session(self, html, content_path):
        """Extract a 'Session' page: common content plus session metadata."""
        contents = self._extract_content(html, content_path)
        if contents is None:
            return contents

        session_meta = {'session_code': self._extract_session_code(html),
                        'session_day': self._extract_session_day(html),
                        'session_room': self._extract_session_room(html),
                        'session_speakers': self._extract_session_speakers(html),
                        }

        for content in contents:
            content.update(session_meta)

        return contents

    def _extract_list(self, content_path):
        """Extract a 'List' page via its RSS feed as a playlist of
        Channel9 entries."""
        rss = self._download_xml(self._RSS_URL % content_path, content_path, u'Downloading RSS')
        entries = [self.url_result(session_url.text, 'Channel9')
                   for session_url in rss.findall('./channel/item/link')]
        title_text = rss.find('./channel/title').text
        return self.playlist_result(entries, content_path, title_text)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        content_path = mobj.group('contentpath')

        webpage = self._download_webpage(url, content_path, u'Downloading web page')

        # Page kind is announced by the site itself; the URL alone is not
        # enough to distinguish a single video from a playlist.
        page_type_m = re.search(r'<meta name="Search.PageType" content="(?P<pagetype>[^"]+)"/>', webpage)
        if page_type_m is None:
            raise ExtractorError(u'Search.PageType not found, don\'t know how to process this page', expected=True)

        page_type = page_type_m.group('pagetype')

        if page_type == 'List':         # List page, may contain list of 'item'-like objects
            return self._extract_list(content_path)
        elif page_type == 'Entry.Item': # Any 'item'-like page, may contain downloadable content
            return self._extract_entry_item(webpage, content_path)
        elif page_type == 'Session':    # Event session page, may contain downloadable content
            return self._extract_session(webpage, content_path)
        else:
            raise ExtractorError(u'Unexpected Search.PageType %s' % page_type, expected=True)