youtube_dl/extractor/stanfordoc.py

   1 import re
   2 import socket
   3 import xml.etree.ElementTree
   4
   5 from .common import InfoExtractor
   6 from ..utils import (
   7     compat_http_client,
   8     compat_str,
   9     compat_urllib_error,
  10     compat_urllib_request,
  11
  12     ExtractorError,
  13     orderedSet,
  14     unescapeHTML,
  15 )
  16
  17
  18 class StanfordOpenClassroomIE(InfoExtractor):
  19     """Information extractor for Stanford's Open ClassRoom"""
  20
  21     _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
  22     IE_NAME = u'stanfordoc'
  23
  24     def _real_extract(self, url):
  25         mobj = re.match(self._VALID_URL, url)
  26         if mobj is None:
  27             raise ExtractorError(u'Invalid URL: %s' % url)
  28
  29         if mobj.group('course') and mobj.group('video'): # A specific video
  30             course = mobj.group('course')
  31             video = mobj.group('video')
  32             info = {
  33                 'id': course + '_' + video,
  34                 'uploader': None,
  35                 'upload_date': None,
  36             }
  37
  38             self.report_extraction(info['id'])
  39             baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
  40             xmlUrl = baseUrl + video + '.xml'
  41             try:
  42                 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
  43             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
  44                 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
  45             mdoc = xml.etree.ElementTree.fromstring(metaXml)
  46             try:
  47                 info['title'] = mdoc.findall('./title')[0].text
  48                 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
  49             except IndexError:
  50                 raise ExtractorError(u'Invalid metadata XML file')
  51             info['ext'] = info['url'].rpartition('.')[2]
  52             return [info]
  53         elif mobj.group('course'): # A course page
  54             course = mobj.group('course')
  55             info = {
  56                 'id': course,
  57                 'type': 'playlist',
  58                 'uploader': None,
  59                 'upload_date': None,
  60             }
  61
  62             coursepage = self._download_webpage(url, info['id'],
  63                                         note='Downloading course info page',
  64                                         errnote='Unable to download course info page')
  65
  66             info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])
  67
  68             info['description'] = self._html_search_regex('<description>([^<]+)</description>',
  69                 coursepage, u'description', fatal=False)
  70
  71             links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
  72             info['list'] = [
  73                 {
  74                     'type': 'reference',
  75                     'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
  76                 }
  77                     for vpage in links]
  78             results = []
  79             for entry in info['list']:
  80                 assert entry['type'] == 'reference'
  81                 results += self.extract(entry['url'])
  82             return results
  83         else: # Root page
  84             info = {
  85                 'id': 'Stanford OpenClassroom',
  86                 'type': 'playlist',
  87                 'uploader': None,
  88                 'upload_date': None,
  89             }
  90
  91             self.report_download_webpage(info['id'])
  92             rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
  93             try:
  94                 rootpage = compat_urllib_request.urlopen(rootURL).read()
  95             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
  96                 raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))
  97
  98             info['title'] = info['id']
  99
 100             links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
 101             info['list'] = [
 102                 {
 103                     'type': 'reference',
 104                     'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
 105                 }
 106                     for cpage in links]
 107
 108             results = []
 109             for entry in info['list']:
 110                 assert entry['type'] == 'reference'
 111                 results += self.extract(entry['url'])
 112             return results