import re

from .common import InfoExtractor
from ..utils import ExtractorError
class Channel9IE(InfoExtractor):
    '''
    Common extractor for channel9.msdn.com.

    The type of provided URL (video or playlist) is determined according to
    meta Search.PageType from web page HTML rather than URL itself, as it is
    not always possible to do.
    '''
    IE_DESC = u'Channel 9'
    IE_NAME = u'channel9'
    _VALID_URL = r'^https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+)/?'

    _TESTS = [
        {
            u'url': u'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002',
            u'file': u'Events_TechEd_Australia_2013_KOS002.mp4',
            u'md5': u'bbd75296ba47916b754e73c3a4bbdf10',
            u'info_dict': {
                u'title': u'Developer Kick-Off Session: Stuff We Love',
                u'description': u'md5:c08d72240b7c87fcecafe2692f80e35f',
                u'duration': 4576,
                u'thumbnail': u'http://media.ch9.ms/ch9/9d51/03902f2d-fc97-4d3c-b195-0bfe15a19d51/KOS002_220.jpg',
                u'session_code': u'KOS002',
                u'session_day': u'Day 1',
                u'session_room': u'Arena 1A',
                u'session_speakers': [ u'Ed Blankenship', u'Andrew Coates', u'Brady Gaster', u'Patrick Klug', u'Mads Kristensen' ],
            },
        },
        {
            u'url': u'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing',
            u'file': u'posts_Self-service-BI-with-Power-BI-nuclear-testing.mp4',
            u'md5': u'b43ee4529d111bc37ba7ee4f34813e68',
            u'info_dict': {
                u'title': u'Self-service BI with Power BI - nuclear testing',
                u'description': u'md5:d1e6ecaafa7fb52a2cacdf9599829f5b',
                u'duration': 1540,
                u'thumbnail': u'http://media.ch9.ms/ch9/87e1/0300391f-a455-4c72-bec3-4422f19287e1/selfservicenuk_512.jpg',
                u'authors': [ u'Mike Wilmot' ],
            },
        }
    ]

    _RSS_URL = 'http://channel9.msdn.com/%s/RSS'

    # Sorted by quality (worst first); also serves as the whitelist of
    # download links recognized in the page HTML.
    _known_formats = ['MP3', 'MP4', 'Mid Quality WMV', 'Mid Quality MP4', 'High Quality WMV', 'High Quality MP4']

    def _restore_bytes(self, formatted_size):
        """Convert a human-readable size string (e.g. u'42.5 MB') back to an
        approximate byte count. Returns 0 when the input is empty or cannot
        be parsed."""
        if not formatted_size:
            return 0
        m = re.match(r'^(?P<size>\d+(?:\.\d+)?)\s+(?P<units>[a-zA-Z]+)', formatted_size)
        if not m:
            return 0
        units = m.group('units')
        try:
            exponent = [u'B', u'KB', u'MB', u'GB', u'TB', u'PB', u'EB', u'ZB', u'YB'].index(units.upper())
        except ValueError:
            return 0
        size = float(m.group('size'))
        return int(size * (1024 ** exponent))

    def _formats_from_html(self, html):
        """Scrape the download links from the page HTML and return a list of
        format dicts sorted worst-quality-first (youtube-dl convention)."""
        FORMAT_REGEX = r'''
            (?x)
            <a\s+href="(?P<url>[^"]+)">(?P<quality>[^<]+)</a>\s*
            <span\s+class="usage">\((?P<note>[^\)]+)\)</span>\s*
            (?:<div\s+class="popup\s+rounded">\s*
            <h3>File\s+size</h3>\s*(?P<filesize>.*?)\s*
            </div>)?                                                # File size part may be missing
        '''
        # Extract known formats
        formats = [{'url': x.group('url'),
                    'format_id': x.group('quality'),
                    'format_note': x.group('note'),
                    'format': '%s (%s)' % (x.group('quality'), x.group('note')),
                    'filesize': self._restore_bytes(x.group('filesize')), # File size is approximate
                    } for x in list(re.finditer(FORMAT_REGEX, html)) if x.group('quality') in self._known_formats]
        # Sort according to known formats list
        formats.sort(key=lambda fmt: self._known_formats.index(fmt['format_id']))
        return formats

    def _extract_title(self, html):
        """Extract the clean title: meta title first, OpenGraph as fallback,
        with the site-appended u' (Channel 9)' suffix stripped."""
        title = self._html_search_meta(u'title', html, u'title')
        if title is None:
            title = self._og_search_title(html)
        TITLE_SUFFIX = u' (Channel 9)'
        if title is not None and title.endswith(TITLE_SUFFIX):
            title = title[:-len(TITLE_SUFFIX)]
        return title

    def _extract_description(self, html):
        """Extract the description from the entry body, falling back to the
        meta description tag."""
        DESCRIPTION_REGEX = r'''(?sx)
            <div\s+class="entry-content">\s*
            <div\s+id="entry-body">\s*
            (?P<description>.+?)\s*
            </div>\s*
            </div>
        '''
        m = re.search(DESCRIPTION_REGEX, html)
        if m is not None:
            return m.group('description')
        return self._html_search_meta(u'description', html, u'description')

    def _extract_duration(self, html):
        """Return the duration in seconds from the HH:MM:SS data attribute,
        or None when absent."""
        m = re.search(r'data-video_duration="(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})"', html)
        return ((int(m.group('hours')) * 60 * 60) + (int(m.group('minutes')) * 60) + int(m.group('seconds'))) if m else None

    def _extract_slides(self, html):
        """Return the slides download URL, or None when absent."""
        m = re.search(r'<a href="(?P<slidesurl>[^"]+)" class="slides">Slides</a>', html)
        return m.group('slidesurl') if m is not None else None

    def _extract_zip(self, html):
        """Return the zip download URL, or None when absent."""
        m = re.search(r'<a href="(?P<zipurl>[^"]+)" class="zip">Zip</a>', html)
        return m.group('zipurl') if m is not None else None

    def _extract_avg_rating(self, html):
        """Return the average rating as a float, or 0 when absent."""
        m = re.search(r'<p class="avg-rating">Avg Rating: <span>(?P<avgrating>[^<]+)</span></p>', html)
        return float(m.group('avgrating')) if m is not None else 0

    def _extract_rating_count(self, html):
        """Return the number of ratings, or 0 when absent."""
        m = re.search(r'<div class="rating-count">\((?P<ratingcount>[^<]+)\)</div>', html)
        return int(self._fix_count(m.group('ratingcount'))) if m is not None else 0

    def _extract_view_count(self, html):
        """Return the view count, or 0 when absent."""
        m = re.search(r'<li class="views">\s*<span class="count">(?P<viewcount>[^<]+)</span> Views\s*</li>', html)
        return int(self._fix_count(m.group('viewcount'))) if m is not None else 0

    def _extract_comment_count(self, html):
        """Return the comment count, or 0 when absent."""
        m = re.search(r'<li class="comments">\s*<a href="#comments">\s*<span class="count">(?P<commentcount>[^<]+)</span> Comments\s*</a>\s*</li>', html)
        return int(self._fix_count(m.group('commentcount'))) if m is not None else 0

    def _fix_count(self, count):
        """Strip thousands separators (u'1,234' -> 1234); None passes through."""
        return int(str(count).replace(',', '')) if count is not None else None

    def _extract_authors(self, html):
        """Return the list of author names, or None when the author section
        is missing entirely."""
        m = re.search(r'(?s)<li class="author">(.*?)</li>', html)
        if m is None:
            return None
        return re.findall(r'<a href="/Niners/[^"]+">([^<]+)</a>', m.group(1))

    def _extract_session_code(self, html):
        """Return the session code (e.g. u'KOS002'), or None when absent."""
        m = re.search(r'<li class="code">\s*(?P<code>.+?)\s*</li>', html)
        return m.group('code') if m is not None else None

    def _extract_session_day(self, html):
        """Return the session day (e.g. u'Day 1'), or None when absent."""
        m = re.search(r'<li class="day">\s*<a href="/Events/[^"]+">(?P<day>[^<]+)</a>\s*</li>', html)
        return m.group('day') if m is not None else None

    def _extract_session_room(self, html):
        """Return the session room, or None when absent."""
        m = re.search(r'<li class="room">\s*(?P<room>.+?)\s*</li>', html)
        return m.group('room') if m is not None else None

    def _extract_session_speakers(self, html):
        """Return the (possibly empty) list of session speaker names."""
        return re.findall(r'<a href="/Events/Speakers/[^"]+">([^<]+)</a>', html)

    def _extract_content(self, html, content_path):
        """Build the list of info dicts (recording / slides / zip) shared by
        entry-item and session pages. Returns None when nothing is
        downloadable (after emitting a warning)."""
        # Look for downloadable content
        formats = self._formats_from_html(html)
        slides = self._extract_slides(html)
        zip_ = self._extract_zip(html)

        # Nothing to download
        if len(formats) == 0 and slides is None and zip_ is None:
            self._downloader.report_warning(u'None of recording, slides or zip are available for %s' % content_path)
            return

        # Extract meta
        title = self._extract_title(html)
        description = self._extract_description(html)
        thumbnail = self._og_search_thumbnail(html)
        duration = self._extract_duration(html)
        avg_rating = self._extract_avg_rating(html)
        rating_count = self._extract_rating_count(html)
        view_count = self._extract_view_count(html)
        comment_count = self._extract_comment_count(html)

        common = {'_type': 'video',
                  'id': content_path,
                  'description': description,
                  'thumbnail': thumbnail,
                  'duration': duration,
                  'avg_rating': avg_rating,
                  'rating_count': rating_count,
                  'view_count': view_count,
                  'comment_count': comment_count,
                  }

        result = []

        if slides is not None:
            d = common.copy()
            d.update({ 'title': title + '-Slides', 'url': slides })
            result.append(d)

        if zip_ is not None:
            d = common.copy()
            d.update({ 'title': title + '-Zip', 'url': zip_ })
            result.append(d)

        if len(formats) > 0:
            d = common.copy()
            d.update({ 'title': title, 'formats': formats })
            result.append(d)

        return result

    def _extract_entry_item(self, html, content_path):
        """Extract an 'Entry.Item' page: common content plus author list."""
        contents = self._extract_content(html, content_path)
        if contents is None:
            return contents

        authors = self._extract_authors(html)

        for content in contents:
            content['authors'] = authors

        return contents

    def _extract_session(self, html, content_path):
        """Extract a 'Session' page: common content plus session metadata."""
        contents = self._extract_content(html, content_path)
        if contents is None:
            return contents

        session_meta = {'session_code': self._extract_session_code(html),
                        'session_day': self._extract_session_day(html),
                        'session_room': self._extract_session_room(html),
                        'session_speakers': self._extract_session_speakers(html),
                        }

        for content in contents:
            content.update(session_meta)

        return contents

    def _extract_list(self, content_path):
        """Extract a 'List' page via its RSS feed as a playlist of
        Channel9 entries."""
        rss = self._download_xml(self._RSS_URL % content_path, content_path, u'Downloading RSS')
        entries = [self.url_result(session_url.text, 'Channel9')
                   for session_url in rss.findall('./channel/item/link')]
        title_text = rss.find('./channel/title').text
        return self.playlist_result(entries, content_path, title_text)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        content_path = mobj.group('contentpath')

        webpage = self._download_webpage(url, content_path, u'Downloading web page')

        # Page kind is announced by the site itself; the URL alone is not
        # enough to distinguish a single video from a playlist.
        page_type_m = re.search(r'<meta name="Search.PageType" content="(?P<pagetype>[^"]+)"/>', webpage)
        if page_type_m is None:
            raise ExtractorError(u'Search.PageType not found, don\'t know how to process this page', expected=True)

        page_type = page_type_m.group('pagetype')

        if page_type == 'List':         # List page, may contain list of 'item'-like objects
            return self._extract_list(content_path)
        elif page_type == 'Entry.Item': # Any 'item'-like page, may contain downloadable content
            return self._extract_entry_item(webpage, content_path)
        elif page_type == 'Session':    # Event session page, may contain downloadable content
            return self._extract_session(webpage, content_path)
        else:
            raise ExtractorError(u'Unexpected Search.PageType %s' % page_type, expected=True)