youtube_dl/extractor/stanfordoc.py

   1 import re
   2
   3 from .common import InfoExtractor
   4 from ..utils import (
   5     ExtractorError,
   6     orderedSet,
   7     unescapeHTML,
   8 )
   9
  10
  11 class StanfordOpenClassroomIE(InfoExtractor):
  12     IE_NAME = u'stanfordoc'
  13     IE_DESC = u'Stanford Open ClassRoom'
  14     _VALID_URL = r'^(?:https?://)?openclassroom\.stanford\.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
  15     _TEST = {
  16         u'url': u'http://openclassroom.stanford.edu/MainFolder/VideoPage.php?course=PracticalUnix&video=intro-environment&speed=100',
  17         u'file': u'PracticalUnix_intro-environment.mp4',
  18         u'md5': u'544a9468546059d4e80d76265b0443b8',
  19         u'info_dict': {
  20             u"title": u"Intro Environment"
  21         }
  22     }
  23
  24     def _real_extract(self, url):
  25         mobj = re.match(self._VALID_URL, url)
  26         if mobj is None:
  27             raise ExtractorError(u'Invalid URL: %s' % url)
  28
  29         if mobj.group('course') and mobj.group('video'): # A specific video
  30             course = mobj.group('course')
  31             video = mobj.group('video')
  32             info = {
  33                 'id': course + '_' + video,
  34                 'uploader': None,
  35                 'upload_date': None,
  36             }
  37
  38             self.report_extraction(info['id'])
  39             baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
  40             xmlUrl = baseUrl + video + '.xml'
  41             mdoc = self._download_xml(xmlUrl, info['id'])
  42             try:
  43                 info['title'] = mdoc.findall('./title')[0].text
  44                 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
  45             except IndexError:
  46                 raise ExtractorError(u'Invalid metadata XML file')
  47             info['ext'] = info['url'].rpartition('.')[2]
  48             return [info]
  49         elif mobj.group('course'): # A course page
  50             course = mobj.group('course')
  51             info = {
  52                 'id': course,
  53                 'type': 'playlist',
  54                 'uploader': None,
  55                 'upload_date': None,
  56             }
  57
  58             coursepage = self._download_webpage(url, info['id'],
  59                                         note='Downloading course info page',
  60                                         errnote='Unable to download course info page')
  61
  62             info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])
  63
  64             info['description'] = self._html_search_regex('<description>([^<]+)</description>',
  65                 coursepage, u'description', fatal=False)
  66
  67             links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
  68             info['list'] = [
  69                 {
  70                     'type': 'reference',
  71                     'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
  72                 }
  73                     for vpage in links]
  74             results = []
  75             for entry in info['list']:
  76                 assert entry['type'] == 'reference'
  77                 results += self.extract(entry['url'])
  78             return results
  79         else: # Root page
  80             info = {
  81                 'id': 'Stanford OpenClassroom',
  82                 'type': 'playlist',
  83                 'uploader': None,
  84                 'upload_date': None,
  85             }
  86
  87             rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
  88             rootpage = self._download_webpage(rootURL, info['id'],
  89                 errnote=u'Unable to download course info page')
  90
  91             info['title'] = info['id']
  92
  93             links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
  94             info['list'] = [
  95                 {
  96                     'type': 'reference',
  97                     'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
  98                 }
  99                     for cpage in links]
 100
 101             results = []
 102             for entry in info['list']:
 103                 assert entry['type'] == 'reference'
 104                 results += self.extract(entry['url'])
 105             return results