1 from __future__ import unicode_literals
5 from .common import InfoExtractor
13 class Channel9IE(InfoExtractor):
15 Common extractor for channel9.msdn.com.
The type of the provided URL (video or playlist) is determined from the
WT.entryid meta tag of the web page HTML rather than from the URL itself,
because the URL alone does not always reveal it.
23 _VALID_URL = r'https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+?)(?P<rss>/RSS)?/?(?:[?#&]|$)'
26 'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002',
27 'md5': 'bbd75296ba47916b754e73c3a4bbdf10',
29 'id': 'Events/TechEd/Australia/2013/KOS002',
31 'title': 'Developer Kick-Off Session: Stuff We Love',
32 'description': 'md5:c08d72240b7c87fcecafe2692f80e35f',
34 'thumbnail': 're:http://.*\.jpg',
35 'session_code': 'KOS002',
36 'session_day': 'Day 1',
37 'session_room': 'Arena 1A',
38 'session_speakers': ['Ed Blankenship', 'Andrew Coates', 'Brady Gaster', 'Patrick Klug',
42 'url': 'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing',
43 'md5': 'b43ee4529d111bc37ba7ee4f34813e68',
45 'id': 'posts/Self-service-BI-with-Power-BI-nuclear-testing',
47 'title': 'Self-service BI with Power BI - nuclear testing',
48 'description': 'md5:d1e6ecaafa7fb52a2cacdf9599829f5b',
50 'thumbnail': 're:http://.*\.jpg',
51 'authors': ['Mike Wilmot'],
54 # low quality mp4 is best
55 'url': 'https://channel9.msdn.com/Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library',
57 'id': 'Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library',
59 'title': 'Ranges for the Standard Library',
60 'description': 'md5:2e6b4917677af3728c5f6d63784c4c5d',
62 'thumbnail': 're:http://.*\.jpg',
65 'skip_download': True,
68 'url': 'https://channel9.msdn.com/Niners/Splendid22/Queue/76acff796e8f411184b008028e0d492b/RSS',
70 'id': 'Niners/Splendid22/Queue/76acff796e8f411184b008028e0d492b',
75 'url': 'https://channel9.msdn.com/Events/DEVintersection/DEVintersection-2016/RSS',
76 'only_matching': True,
78 'url': 'https://channel9.msdn.com/Events/Speakers/scott-hanselman/RSS?UrlSafeName=scott-hanselman',
79 'only_matching': True,
82 _RSS_URL = 'http://channel9.msdn.com/%s/RSS'
def _formats_from_html(self, html):
    """Collect the downloadable media formats advertised in the page HTML.

    Every occurrence of the download-link markup (an anchor whose text is
    the quality name, a "usage" span holding a note and an optional
    "File size" popup) becomes one format dict.

    :param html: web page source to scan.
    :return: list of format dicts after ``self._sort_formats`` has
        processed it in place; empty when no download link matches.
    """
    # The inline (?x) flag must be the very first thing in the pattern:
    # Python 3.11+ rejects global flags that follow any other character,
    # including the leading newline of a triple-quoted string.
    FORMAT_REGEX = r'''(?x)
        <a\s+href="(?P<url>[^"]+)">(?P<quality>[^<]+)</a>\s*
        <span\s+class="usage">\((?P<note>[^\)]+)\)</span>\s*
        (?:<div\s+class="popup\s+rounded">\s*
        <h3>File\s+size</h3>\s*(?P<filesize>.*?)\s*
        </div>)?                                                # File size part may be missing
    '''
    # NOTE(review): presumably ``qualities`` ranks names by their position
    # in this tuple (later = preferred) -- confirm against ..utils.
    quality = qualities((
        'MP3', 'MP4',
        'Low Quality WMV', 'Low Quality MP4',
        'Mid Quality WMV', 'Mid Quality MP4',
        'High Quality WMV', 'High Quality MP4'))

    formats = []
    for found in re.finditer(FORMAT_REGEX, html):
        name = found.group('quality')
        note = found.group('note')
        formats.append({
            'url': found.group('url'),
            'format_id': name,
            'format_note': note,
            'format': '%s (%s)' % (name, note),
            # The filesize group is None when the popup is absent.
            'filesize_approx': parse_filesize(found.group('filesize')),
            'quality': quality(name),
            'vcodec': 'none' if note == 'Audio only' else None,
        })

    self._sort_formats(formats)

    return formats
def _extract_title(self, html):
    """Return the page title.

    The ``title`` meta tag is preferred. Only when it yields nothing is
    the Open Graph title used, with a trailing ' (Channel 9)' removed.

    :param html: web page source.
    :return: the title string, or None when neither source provides one.
    """
    title = self._html_search_meta('title', html, 'title')
    if title is not None:
        return title

    # Fallback: the Open Graph title carries a site suffix to strip.
    title = self._og_search_title(html)
    TITLE_SUFFIX = ' (Channel 9)'
    if title is not None and title.endswith(TITLE_SUFFIX):
        title = title[:-len(TITLE_SUFFIX)]
    return title
121 def _extract_description(self, html):
122 DESCRIPTION_REGEX = r'''(?sx)
123 <div\s+class="entry-content">\s*
124 <div\s+id="entry-body">\s*
125 (?P<description>.+?)\s*
129 m = re.search(DESCRIPTION_REGEX, html)
131 return m.group('description')
132 return self._html_search_meta('description', html, 'description')
134 def _extract_duration(self, html):
135 m = re.search(r'"length": *"(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})"', html)
136 return ((int(m.group('hours')) * 60 * 60) + (int(m.group('minutes')) * 60) + int(m.group('seconds'))) if m else None
138 def _extract_slides(self, html):
139 m = re.search(r'<a href="(?P<slidesurl>[^"]+)" class="slides">Slides</a>', html)
140 return m.group('slidesurl') if m is not None else None
142 def _extract_zip(self, html):
143 m = re.search(r'<a href="(?P<zipurl>[^"]+)" class="zip">Zip</a>', html)
144 return m.group('zipurl') if m is not None else None
146 def _extract_avg_rating(self, html):
147 m = re.search(r'<p class="avg-rating">Avg Rating: <span>(?P<avgrating>[^<]+)</span></p>', html)
148 return float(m.group('avgrating')) if m is not None else 0
150 def _extract_rating_count(self, html):
151 m = re.search(r'<div class="rating-count">\((?P<ratingcount>[^<]+)\)</div>', html)
152 return int(self._fix_count(m.group('ratingcount'))) if m is not None else 0
154 def _extract_view_count(self, html):
155 m = re.search(r'<li class="views">\s*<span class="count">(?P<viewcount>[^<]+)</span> Views\s*</li>', html)
156 return int(self._fix_count(m.group('viewcount'))) if m is not None else 0
158 def _extract_comment_count(self, html):
159 m = re.search(r'<li class="comments">\s*<a href="#comments">\s*<span class="count">(?P<commentcount>[^<]+)</span> Comments\s*</a>\s*</li>', html)
160 return int(self._fix_count(m.group('commentcount'))) if m is not None else 0
162 def _fix_count(self, count):
163 return int(str(count).replace(',', '')) if count is not None else None
165 def _extract_authors(self, html):
166 m = re.search(r'(?s)<li class="author">(.*?)</li>', html)
169 return re.findall(r'<a href="/Niners/[^"]+">([^<]+)</a>', m.group(1))
171 def _extract_session_code(self, html):
172 m = re.search(r'<li class="code">\s*(?P<code>.+?)\s*</li>', html)
173 return m.group('code') if m is not None else None
175 def _extract_session_day(self, html):
176 m = re.search(r'<li class="day">\s*<a href="/Events/[^"]+">(?P<day>[^<]+)</a>\s*</li>', html)
177 return m.group('day').strip() if m is not None else None
179 def _extract_session_room(self, html):
180 m = re.search(r'<li class="room">\s*(?P<room>.+?)\s*</li>', html)
181 return m.group('room') if m is not None else None
183 def _extract_session_speakers(self, html):
184 return re.findall(r'<a href="/Events/Speakers/[^"]+">([^<]+)</a>', html)
def _extract_content(self, html, content_path):
    """Build one info dict per downloadable item found on the page.

    A page may offer slides, a zip archive and/or a recording. Each one
    present yields its own entry sharing the same metadata; slides and
    zip entries get a '-Slides' / '-Zip' title suffix and a direct
    ``url``, the recording entry carries the ``formats`` list.

    :param html: web page source.
    :param content_path: content path of the page; used as the entry id
        and in the warning message.
    :return: list of info dicts, or None (after reporting a warning) when
        the page has nothing to download.
    """
    # Look for downloadable content
    formats = self._formats_from_html(html)
    slides = self._extract_slides(html)
    zip_ = self._extract_zip(html)

    # Nothing to download
    if not formats and slides is None and zip_ is None:
        self._downloader.report_warning(
            'None of recording, slides or zip are available for %s' % content_path)
        return None

    # Extract meta shared by every entry of this page
    title = self._extract_title(html)
    common = {
        '_type': 'video',
        'id': content_path,
        'description': self._extract_description(html),
        'thumbnail': self._og_search_thumbnail(html),
        'duration': self._extract_duration(html),
        'avg_rating': self._extract_avg_rating(html),
        'rating_count': self._extract_rating_count(html),
        'view_count': self._extract_view_count(html),
        'comment_count': self._extract_comment_count(html),
    }

    result = []

    # Auxiliary downloads are plain URLs with a suffixed title.
    for suffix, aux_url in (('-Slides', slides), ('-Zip', zip_)):
        if aux_url is not None:
            entry = common.copy()
            entry.update({'title': title + suffix, 'url': aux_url})
            result.append(entry)

    if formats:
        entry = common.copy()
        entry.update({'title': title, 'formats': formats})
        result.append(entry)

    return result
def _extract_entry_item(self, html, content_path):
    """Return the single info dict of an 'Entry' page, with its authors.

    :param html: web page source.
    :param content_path: content path of the page.
    :return: the info dict with an added ``authors`` key, or None when
        ``_extract_content`` found nothing to download.
    :raises ExtractorError: when the page yields more than one entry.
    """
    contents = self._extract_content(html, content_path)
    # _extract_content returns None when the page offers no downloads.
    if contents is None:
        return contents

    if len(contents) > 1:
        raise ExtractorError('Got more than one entry')
    result = contents[0]
    result['authors'] = self._extract_authors(html)

    return result
def _extract_session(self, html, content_path):
    """Return a playlist of the downloads of an event 'Session' page.

    Every entry produced by ``_extract_content`` is annotated with the
    session code, day, room and speakers scraped from the page.

    :param html: web page source.
    :param content_path: content path of the page.
    :return: result of ``self.playlist_result`` over the entries, or None
        when ``_extract_content`` found nothing to download.
    """
    contents = self._extract_content(html, content_path)
    # _extract_content returns None when the page offers no downloads.
    if contents is None:
        return contents

    session_meta = {
        'session_code': self._extract_session_code(html),
        'session_day': self._extract_session_day(html),
        'session_room': self._extract_session_room(html),
        'session_speakers': self._extract_session_speakers(html),
    }

    for content in contents:
        content.update(session_meta)

    return self.playlist_result(contents)
def _extract_list(self, video_id, rss_url=None):
    """Return a playlist built from an RSS feed.

    :param video_id: content path; used as the playlist id and, when no
        feed URL is given, to build one from ``_RSS_URL``.
    :param rss_url: explicit feed URL to download; optional.
    :return: result of ``self.playlist_result`` whose entries are
        ``url_result`` references to each ``channel/item/link`` of the
        feed, titled with the feed's channel title when it has one.
    """
    # Only fall back to the canonical feed URL when the caller gave none,
    # so an explicit URL (possibly with a query string) is honoured.
    if not rss_url:
        rss_url = self._RSS_URL % video_id
    rss = self._download_xml(rss_url, video_id, 'Downloading RSS')
    entries = [self.url_result(session_url.text, 'Channel9')
               for session_url in rss.findall('./channel/item/link')]
    # find() returns None for a feed without a channel title.
    title_el = rss.find('./channel/title')
    title_text = title_el.text if title_el is not None else None
    return self.playlist_result(entries, video_id, title_text)
def _real_extract(self, url):
    """Extract a video, a session playlist or a list from a Channel 9 URL.

    URLs ending in ``/RSS`` are treated as feeds directly. Otherwise the
    page is downloaded and dispatched on the first part of its
    ``WT.entryid`` meta tag: 'Entry', 'Session' or 'Event'. A page
    without that tag is assumed to be a list.

    :param url: URL matching ``_VALID_URL``.
    :raises ExtractorError: (expected) when ``WT.entryid`` names an
        unknown page type.
    """
    mobj = re.match(self._VALID_URL, url)
    content_path = mobj.group('contentpath')

    # Explicit feed URL: pass it on unchanged so its query string is kept.
    if mobj.group('rss'):
        return self._extract_list(content_path, url)

    webpage = self._download_webpage(
        url, content_path, 'Downloading web page')

    page_type = self._search_regex(
        r'<meta[^>]+name=(["\'])WT\.entryid\1[^>]+content=(["\'])(?P<pagetype>[^:]+).+?\2',
        webpage, 'page type', default=None, group='pagetype')

    if not page_type:  # Assuming list
        return self._extract_list(content_path)
    if page_type == 'Entry':  # Any 'item'-like page, may contain downloadable content
        return self._extract_entry_item(webpage, content_path)
    if page_type == 'Session':  # Event session page, may contain downloadable content
        return self._extract_session(webpage, content_path)
    if page_type == 'Event':
        return self._extract_list(content_path)
    raise ExtractorError('Unexpected WT.entryid %s' % page_type, expected=True)