from .extractor.generic import GenericIE
from .extractor.googleplus import GooglePlusIE
from .extractor.googlesearch import GoogleSearchIE
+from .extractor.infoq import InfoQIE
from .extractor.metacafe import MetacafeIE
+from .extractor.mtv import MTVIE
from .extractor.myvideo import MyVideoIE
+from .extractor.nba import NBAIE
from .extractor.statigram import StatigramIE
from .extractor.photobucket import PhotobucketIE
from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
+from .extractor.stanfordoc import StanfordOpenClassroomIE
from .extractor.vimeo import VimeoIE
+from .extractor.xvideos import XVideosIE
from .extractor.yahoo import YahooIE, YahooSearchIE
from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
from .extractor.zdf import ZDFIE
class XVideosIE(InfoExtractor):
    """Extractor for single video pages on xvideos.com."""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def _real_extract(self, url):
        """Build the info dictionary for the video page at *url*.

        The numeric id comes from the URL itself; the media URL, title and
        thumbnail are scraped from the downloaded page.  Returns a list
        holding a single info dict.  Raises ExtractorError when *url* does
        not match _VALID_URL.
        """
        url_match = re.match(self._VALID_URL, url)
        if not url_match:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = url_match.group(1)

        page = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # The media location sits percent-quoted in a "flv_url=" parameter.
        quoted_media_url = self._search_regex(
            r'flv_url=(.+?)&', page, u'video URL')
        media_url = compat_urllib_parse.unquote(quoted_media_url)

        title = self._html_search_regex(
            r'<title>(.*?)\s+-\s+XVID', page, u'title')

        # Optional: a missing thumbnail must not abort the extraction.
        # NOTE(review): the pattern's only capture group is the file name,
        # not the whole URL -- confirm which part _search_regex returns.
        thumbnail = self._search_regex(
            r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
            page, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': media_url,
            'uploader': None,
            'upload_date': None,
            'title': title,
            'ext': 'flv',
            'thumbnail': thumbnail,
            'description': None,
        }]
-
-
-
-
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        """Extract the RTMPE stream URL and metadata for an InfoQ page.

        Returns a list holding a single info dict.  Raises ExtractorError
        when *url* does not match _VALID_URL, when the page carries no
        "jsclassref" value, or when the stream file name has no extension.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # No id is known before the page is parsed, so the URL stands in.
        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # The stream path is embedded as a base64-encoded, percent-quoted
        # string assigned to "jsclassref".
        mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video url')
        encoded_id = mobj.group(1).encode('ascii')
        real_id = compat_urllib_parse.unquote(
            base64.b64decode(encoded_id).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        video_title = self._search_regex(r'contentTitle = "(.*?)";',
            webpage, u'title')

        # Extract description (optional)
        video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
            webpage, u'description', fatal=False)

        # Split on the LAST dot only: a plain split('.') unpacked into two
        # names blows up with ValueError as soon as the file name contains
        # more than one dot (or none at all).
        video_filename = video_url.split('/')[-1]
        video_id, dot, extension = video_filename.rpartition('.')
        if not dot:
            raise ExtractorError(
                u'Unable to extract video id and extension from %s' % video_filename)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
class MixcloudIE(InfoExtractor):
"""Information extractor for www.mixcloud.com"""
'player_url': player_url.decode('utf-8'),
}]
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""
    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    # Prefix shared by every page and media URL this extractor builds.
    _MAIN_FOLDER_URL = 'http://openclassroom.stanford.edu/MainFolder/'

    def _extract_linked_pages(self, page, link_pattern):
        """Run the extractor on every distinct link found in *page*.

        *link_pattern* must capture the HTML-escaped link, relative to the
        MainFolder directory, in group 1.  Returns the concatenation of the
        result lists produced by self.extract() for each link.
        """
        results = []
        for link in orderedSet(re.findall(link_pattern, page)):
            results += self.extract(self._MAIN_FOLDER_URL + unescapeHTML(link))
        return results

    def _real_extract(self, url):
        """Extract a single video, a whole course, or every course.

        Which one depends on the "course" and "video" query parameters
        matched by _VALID_URL.  The result is always a flat list of video
        info dicts; course and root pages are expanded by following their
        links.  Raises ExtractorError on an invalid URL, a failed metadata
        download, or metadata lacking a title or video file entry.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        course = mobj.group('course')
        video = mobj.group('video')

        if course and video: # A specific video
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = self._MAIN_FOLDER_URL + 'courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                raise ExtractorError(u'Invalid metadata XML file')
            # The extension is whatever follows the last dot of the media URL.
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]

        if course: # A course page
            coursepage = self._download_webpage(url, course,
                note='Downloading course info page',
                errnote='Unable to download course info page')
            # Raw string: '\?' inside a plain literal is an invalid escape.
            return self._extract_linked_pages(
                coursepage, r'<a href="(VideoPage.php\?[^"]+)">')

        # Root page.  Go through the same download helper as the course
        # branch: the raw urlopen().read() result used here before is bytes
        # on Python 3, and matching a str pattern against it raises TypeError.
        rootpage = self._download_webpage(
            self._MAIN_FOLDER_URL + 'HomePage.php', 'Stanford OpenClassroom',
            errnote=u'Unable to download course info page')
        return self._extract_linked_pages(
            rootpage, r'<a href="(CoursePage.php\?[^"]+)">')
-
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def _real_extract(self, url):
        """Extract the media URL and metadata for an mtv.com video page.

        Scrapes the page for its title and player parameters, downloads
        the media generator XML built from them and uses the last
        <rendition> element listed there.  Returns a list holding a single
        info dict.  Raises ExtractorError on an invalid URL, missing player
        parameters, a failed metadata download, or an unusable rendition.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
            webpage, u'title')

        mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
            webpage, u'mtvn_uri', fatal=False)

        content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
            webpage, u'content id', fatal=False)

        # Both values are concatenated into the request URL below, so a
        # missing one has to stop the extraction here instead of surfacing
        # as a TypeError from the string concatenation.
        if not mtvn_uri or not content_id:
            raise ExtractorError(u'Unable to extract media generator parameters')

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')
        if not renditions:
            raise ExtractorError(u'Unable to find any rendition in video metadata')

        # For now, always pick the last rendition, which the original
        # author treats as the highest quality one.
        rendition = renditions[-1]

        try:
            _, _, ext = rendition.attrib['type'].partition('/')
            video_format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
        except KeyError:
            raise ExtractorError('Invalid rendition field.')

        # find() yields None when the element is absent; check explicitly
        # rather than letting '.text' raise AttributeError.
        src = rendition.find('./src')
        if src is None or not src.text:
            raise ExtractorError('Invalid rendition field.')
        video_url = src.text

        info = {
            'id': video_id,
            'url': video_url,
            # NOTE(review): this used to reference an undefined name
            # ("performer"), which raised NameError on every extraction.
            # Presumably the artist was meant -- confirm and restore if so.
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': video_format,
        }

        return [info]
class YoukuIE(InfoExtractor):
class NBAIE(InfoExtractor):
    """Extractor for video pages on nba.com."""

    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        """Build the info dictionary for the NBA video page at *url*.

        The media URL is derived from the path captured by _VALID_URL; the
        title and description are scraped from the page.  Returns a list
        holding a single info dict.  Raises ExtractorError when *url* does
        not match _VALID_URL.
        """
        url_match = re.match(self._VALID_URL, url)
        if not url_match:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Path below ".../video", leading slash included.
        video_path = url_match.group(1)
        page = self._download_webpage(url, video_path)

        media_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_path + '_nba_1280x720.mp4'

        # The last path component serves as the id and as the title fallback.
        short_id = video_path.rpartition('/')[2]
        raw_title = self._html_search_regex(
            r'<meta property="og:title" content="(.*?)"',
            page, 'title', default=short_id)
        title = raw_title.replace('NBA.com: ', '')

        # No upload date is looked up: per the original author it is not
        # present in the HTML that gets returned.
        description = self._html_search_regex(
            r'<meta name="description" (?:content|value)="(.*?)" />',
            page, 'description', fatal=False)

        return [{
            'id': short_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
            'description': description,
        }]
class JustinTVIE(InfoExtractor):
"""Information extractor for justin.tv and twitch.tv"""