+class StanfordOpenClassroomIE(InfoExtractor):
+ """Information extractor for Stanford's Open ClassRoom"""
+
+ _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
+ IE_NAME = u'stanfordoc'
+
+ def report_download_webpage(self, objid):
+ """Report information extraction."""
+ self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
+
+ def report_extraction(self, video_id):
+ """Report information extraction."""
+ self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ if mobj is None:
+ self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
+ return
+
+ if mobj.group('course') and mobj.group('video'): # A specific video
+ course = mobj.group('course')
+ video = mobj.group('video')
+ info = {
+ 'id': _simplify_title(course + '_' + video),
+ }
+
+ self.report_extraction(info['id'])
+ baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
+ xmlUrl = baseUrl + video + '.xml'
+ try:
+ metaXml = urllib2.urlopen(xmlUrl).read()
+ except (urllib2.URLError, httplib.HTTPException, socket.error), err:
+ self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
+ return
+ mdoc = xml.etree.ElementTree.fromstring(metaXml)
+ try:
+ info['title'] = mdoc.findall('./title')[0].text
+ info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
+ except IndexError:
+ self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
+ return
+ info['stitle'] = _simplify_title(info['title'])
+ info['ext'] = info['url'].rpartition('.')[2]
+ info['format'] = info['ext']
+ self._downloader.increment_downloads()
+ try:
+ self._downloader.process_info(info)
+ except UnavailableVideoError, err:
+ self._downloader.trouble(u'\nERROR: unable to download video')
+ elif mobj.group('course'): # A course page
+ unescapeHTML = HTMLParser.HTMLParser().unescape
+
+ course = mobj.group('course')
+ info = {
+ 'id': _simplify_title(course),
+ 'type': 'playlist',
+ }
+
+ self.report_download_webpage(info['id'])
+ try:
+ coursepage = urllib2.urlopen(url).read()
+ except (urllib2.URLError, httplib.HTTPException, socket.error), err:
+ self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
+ return
+
+ m = re.search('<h1>([^<]+)</h1>', coursepage)
+ if m:
+ info['title'] = unescapeHTML(m.group(1))
+ else:
+ info['title'] = info['id']
+ info['stitle'] = _simplify_title(info['title'])
+
+ m = re.search('<description>([^<]+)</description>', coursepage)
+ if m:
+ info['description'] = unescapeHTML(m.group(1))
+
+ links = _orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
+ info['list'] = [
+ {
+ 'type': 'reference',
+ 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
+ }
+ for vpage in links]
+
+ for entry in info['list']:
+ assert entry['type'] == 'reference'
+ self.extract(entry['url'])
+ else: # Root page
+ unescapeHTML = HTMLParser.HTMLParser().unescape
+
+ info = {
+ 'id': 'Stanford OpenClassroom',
+ 'type': 'playlist',
+ }
+
+ self.report_download_webpage(info['id'])
+ rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
+ try:
+ rootpage = urllib2.urlopen(rootURL).read()
+ except (urllib2.URLError, httplib.HTTPException, socket.error), err:
+ self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
+ return
+
+ info['title'] = info['id']
+ info['stitle'] = _simplify_title(info['title'])
+
+ links = _orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
+ info['list'] = [
+ {
+ 'type': 'reference',
+ 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
+ }
+ for cpage in links]
+
+ for entry in info['list']:
+ assert entry['type'] == 'reference'
+ self.extract(entry['url'])