Mark MTV as broken for now (#913)

[youtube-dl.git] / youtube_dl / InfoExtractors.py
diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py

index 789fd147c5be52db272ed5626a69cac45f27b7b8..9d885d238882c99752dc8b49fc5c2f81ce3d9f20 100755 (executable)
--- a/youtube_dl/InfoExtractors.py
+++ b/youtube_dl/InfoExtractors.py
@@ -31,103 +31,22 @@ from .extractor.gametrailers import GametrailersIE
  from .extractor.generic import GenericIE
  from .extractor.googleplus import GooglePlusIE
  from .extractor.googlesearch import GoogleSearchIE
+from .extractor.infoq import InfoQIE
  from .extractor.metacafe import MetacafeIE
+from .extractor.mtv import MTVIE
  from .extractor.myvideo import MyVideoIE
+from .extractor.nba import NBAIE
  from .extractor.statigram import StatigramIE
  from .extractor.photobucket import PhotobucketIE
  from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
+from .extractor.stanfordoc import StanfordOpenClassroomIE
  from .extractor.vimeo import VimeoIE
+from .extractor.xvideos import XVideosIE
  from .extractor.yahoo import YahooIE, YahooSearchIE
  from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
  from .extractor.zdf import ZDFIE
  
  
-class XVideosIE(InfoExtractor):
-    """Information extractor for xvideos.com"""
-
-    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
-    IE_NAME = u'xvideos'
-
-    def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        if mobj is None:
-            raise ExtractorError(u'Invalid URL: %s' % url)
-        video_id = mobj.group(1)
-
-        webpage = self._download_webpage(url, video_id)
-
-        self.report_extraction(video_id)
-
-        # Extract video URL
-        video_url = compat_urllib_parse.unquote(self._search_regex(r'flv_url=(.+?)&',
-            webpage, u'video URL'))
-
-        # Extract title
-        video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XVID',
-            webpage, u'title')
-
-        # Extract video thumbnail
-        video_thumbnail = self._search_regex(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
-            webpage, u'thumbnail', fatal=False)
-
-        info = {
-            'id': video_id,
-            'url': video_url,
-            'uploader': None,
-            'upload_date': None,
-            'title': video_title,
-            'ext': 'flv',
-            'thumbnail': video_thumbnail,
-            'description': None,
-        }
-
-        return [info]
-
-
-
-
-class InfoQIE(InfoExtractor):
-    """Information extractor for infoq.com"""
-    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
-
-    def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        if mobj is None:
-            raise ExtractorError(u'Invalid URL: %s' % url)
-
-        webpage = self._download_webpage(url, video_id=url)
-        self.report_extraction(url)
-
-        # Extract video URL
-        mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract video url')
-        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
-        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
-
-        # Extract title
-        video_title = self._search_regex(r'contentTitle = "(.*?)";',
-            webpage, u'title')
-
-        # Extract description
-        video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
-            webpage, u'description', fatal=False)
-
-        video_filename = video_url.split('/')[-1]
-        video_id, extension = video_filename.split('.')
-
-        info = {
-            'id': video_id,
-            'url': video_url,
-            'uploader': None,
-            'upload_date': None,
-            'title': video_title,
-            'ext': extension, # Extension is always(?) mp4, but seems to be flv
-            'thumbnail': None,
-            'description': video_description,
-        }
-
-        return [info]
  
  class MixcloudIE(InfoExtractor):
      """Information extractor for www.mixcloud.com"""
@@ -234,162 +153,7 @@ class MixcloudIE(InfoExtractor):
              'player_url': player_url.decode('utf-8'),
          }]
  
-class StanfordOpenClassroomIE(InfoExtractor):
-    """Information extractor for Stanford's Open ClassRoom"""
  
-    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
-    IE_NAME = u'stanfordoc'
-
-    def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        if mobj is None:
-            raise ExtractorError(u'Invalid URL: %s' % url)
-
-        if mobj.group('course') and mobj.group('video'): # A specific video
-            course = mobj.group('course')
-            video = mobj.group('video')
-            info = {
-                'id': course + '_' + video,
-                'uploader': None,
-                'upload_date': None,
-            }
-
-            self.report_extraction(info['id'])
-            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
-            xmlUrl = baseUrl + video + '.xml'
-            try:
-                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
-            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
-                raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
-            mdoc = xml.etree.ElementTree.fromstring(metaXml)
-            try:
-                info['title'] = mdoc.findall('./title')[0].text
-                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
-            except IndexError:
-                raise ExtractorError(u'Invalid metadata XML file')
-            info['ext'] = info['url'].rpartition('.')[2]
-            return [info]
-        elif mobj.group('course'): # A course page
-            course = mobj.group('course')
-            info = {
-                'id': course,
-                'type': 'playlist',
-                'uploader': None,
-                'upload_date': None,
-            }
-
-            coursepage = self._download_webpage(url, info['id'],
-                                        note='Downloading course info page',
-                                        errnote='Unable to download course info page')
-
-            info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])
-
-            info['description'] = self._html_search_regex('<description>([^<]+)</description>',
-                coursepage, u'description', fatal=False)
-
-            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
-            info['list'] = [
-                {
-                    'type': 'reference',
-                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
-                }
-                    for vpage in links]
-            results = []
-            for entry in info['list']:
-                assert entry['type'] == 'reference'
-                results += self.extract(entry['url'])
-            return results
-        else: # Root page
-            info = {
-                'id': 'Stanford OpenClassroom',
-                'type': 'playlist',
-                'uploader': None,
-                'upload_date': None,
-            }
-
-            self.report_download_webpage(info['id'])
-            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
-            try:
-                rootpage = compat_urllib_request.urlopen(rootURL).read()
-            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
-                raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))
-
-            info['title'] = info['id']
-
-            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
-            info['list'] = [
-                {
-                    'type': 'reference',
-                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
-                }
-                    for cpage in links]
-
-            results = []
-            for entry in info['list']:
-                assert entry['type'] == 'reference'
-                results += self.extract(entry['url'])
-            return results
-
-class MTVIE(InfoExtractor):
-    """Information extractor for MTV.com"""
-
-    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
-    IE_NAME = u'mtv'
-
-    def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        if mobj is None:
-            raise ExtractorError(u'Invalid URL: %s' % url)
-        if not mobj.group('proto'):
-            url = 'http://' + url
-        video_id = mobj.group('videoid')
-
-        webpage = self._download_webpage(url, video_id)
-
-        song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
-            webpage, u'song name', fatal=False)
-
-        video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
-            webpage, u'title')
-
-        mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
-            webpage, u'mtvn_uri', fatal=False)
-
-        content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
-            webpage, u'content id', fatal=False)
-
-        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
-        self.report_extraction(video_id)
-        request = compat_urllib_request.Request(videogen_url)
-        try:
-            metadataXml = compat_urllib_request.urlopen(request).read()
-        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
-            raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))
-
-        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
-        renditions = mdoc.findall('.//rendition')
-
-        # For now, always pick the highest quality.
-        rendition = renditions[-1]
-
-        try:
-            _,_,ext = rendition.attrib['type'].partition('/')
-            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
-            video_url = rendition.find('./src').text
-        except KeyError:
-            raise ExtractorError('Invalid rendition field.')
-
-        info = {
-            'id': video_id,
-            'url': video_url,
-            'uploader': performer,
-            'upload_date': None,
-            'title': video_title,
-            'ext': ext,
-            'format': format,
-        }
-
-        return [info]
  
  
  class YoukuIE(InfoExtractor):
@@ -527,39 +291,6 @@ class XNXXIE(InfoExtractor):
  
  
  
-class NBAIE(InfoExtractor):
-    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
-    IE_NAME = u'nba'
-
-    def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        if mobj is None:
-            raise ExtractorError(u'Invalid URL: %s' % url)
-
-        video_id = mobj.group(1)
-
-        webpage = self._download_webpage(url, video_id)
-
-        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
-
-        shortened_video_id = video_id.rpartition('/')[2]
-        title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
-            webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')
-
-        # It isn't there in the HTML it returns to us
-        # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)
-
-        description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)
-
-        info = {
-            'id': shortened_video_id,
-            'url': video_url,
-            'ext': 'mp4',
-            'title': title,
-            # 'uploader_date': uploader_date,
-            'description': description,
-        }
-        return [info]
  
  class JustinTVIE(InfoExtractor):
      """Information extractor for justin.tv and twitch.tv"""