X-Git-Url: http://git.cielonegro.org/gitweb.cgi?a=blobdiff_plain;f=youtube_dl%2FInfoExtractors.py;h=9d885d238882c99752dc8b49fc5c2f81ce3d9f20;hb=1736dec629bd3da25bea21145794adfa7a835ea3;hp=789fd147c5be52db272ed5626a69cac45f27b7b8;hpb=7beb36a52974a2cf7c8b00ba1da339f4f3de3abc;p=youtube-dl.git diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index 789fd147c..9d885d238 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -31,103 +31,22 @@ from .extractor.gametrailers import GametrailersIE from .extractor.generic import GenericIE from .extractor.googleplus import GooglePlusIE from .extractor.googlesearch import GoogleSearchIE +from .extractor.infoq import InfoQIE from .extractor.metacafe import MetacafeIE +from .extractor.mtv import MTVIE from .extractor.myvideo import MyVideoIE +from .extractor.nba import NBAIE from .extractor.statigram import StatigramIE from .extractor.photobucket import PhotobucketIE from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE +from .extractor.stanfordoc import StanfordOpenClassroomIE from .extractor.vimeo import VimeoIE +from .extractor.xvideos import XVideosIE from .extractor.yahoo import YahooIE, YahooSearchIE from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE from .extractor.zdf import ZDFIE -class XVideosIE(InfoExtractor): - """Information extractor for xvideos.com""" - - _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)' - IE_NAME = u'xvideos' - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) - video_id = mobj.group(1) - - webpage = self._download_webpage(url, video_id) - - self.report_extraction(video_id) - - # Extract video URL - video_url = compat_urllib_parse.unquote(self._search_regex(r'flv_url=(.+?)&', - webpage, u'video URL')) - - # Extract title - video_title = self._html_search_regex(r'(.*?)\s+-\s+XVID', - webpage, u'title') - - # Extract video thumbnail - video_thumbnail = self._search_regex(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', - webpage, u'thumbnail', fatal=False) - - info = { - 'id': video_id, - 'url': video_url, - 'uploader': None, - 'upload_date': None, - 'title': video_title, - 'ext': 'flv', - 'thumbnail': video_thumbnail, - 'description': None, - } - - return [info] - - - - -class InfoQIE(InfoExtractor): - """Information extractor for infoq.com""" - _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$' - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) - - webpage = self._download_webpage(url, video_id=url) - self.report_extraction(url) - - # Extract video URL - mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract video url') - real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8')) - video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id - - # Extract title - video_title = self._search_regex(r'contentTitle = "(.*?)";', - webpage, u'title') - - # Extract description - video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>', - webpage, u'description', fatal=False) - - video_filename = video_url.split('/')[-1] - video_id, extension = video_filename.split('.') - - info = { - 'id': video_id, - 'url': video_url, - 'uploader': None, - 'upload_date': None, - 'title': video_title, - 'ext': extension, # Extension is always(?) mp4, but seems to be flv - 'thumbnail': None, - 'description': video_description, - } - - return [info] class MixcloudIE(InfoExtractor): """Information extractor for www.mixcloud.com""" @@ -234,162 +153,7 @@ class MixcloudIE(InfoExtractor): 'player_url': player_url.decode('utf-8'), }] -class StanfordOpenClassroomIE(InfoExtractor): - """Information extractor for Stanford's Open ClassRoom""" - _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$' - IE_NAME = u'stanfordoc' - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) - - if mobj.group('course') and mobj.group('video'): # A specific video - course = mobj.group('course') - video = mobj.group('video') - info = { - 'id': course + '_' + video, - 'uploader': None, - 'upload_date': None, - } - - self.report_extraction(info['id']) - baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/' - xmlUrl = baseUrl + video + '.xml' - try: - metaXml = compat_urllib_request.urlopen(xmlUrl).read() - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err)) - mdoc = xml.etree.ElementTree.fromstring(metaXml) - try: - info['title'] = mdoc.findall('./title')[0].text - info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text - except IndexError: - raise ExtractorError(u'Invalid metadata XML file') - info['ext'] = info['url'].rpartition('.')[2] - return [info] - elif mobj.group('course'): # A course page - course = mobj.group('course') - info = { - 'id': course, - 'type': 'playlist', - 'uploader': None, - 'upload_date': None, - } - - coursepage = self._download_webpage(url, info['id'], - note='Downloading course info page', - errnote='Unable to download course info page') - - info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id']) - - info['description'] = self._html_search_regex('<description>([^<]+)</description>', - coursepage, u'description', fatal=False) - - links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage)) - info['list'] = [ - { - 'type': 'reference', - 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage), - } - for vpage in links] - results = [] - for entry in info['list']: - assert entry['type'] == 'reference' - results += self.extract(entry['url']) - return results - else: # Root page - info = { - 'id': 'Stanford OpenClassroom', - 'type': 'playlist', - 'uploader': None, - 'upload_date': None, - } - - self.report_download_webpage(info['id']) - rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php' - try: - rootpage = compat_urllib_request.urlopen(rootURL).read() - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - raise ExtractorError(u'Unable to download course info page: ' + compat_str(err)) - - info['title'] = info['id'] - - links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage)) - info['list'] = [ - { - 'type': 'reference', - 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage), - } - for cpage in links] - - results = [] - for entry in info['list']: - assert entry['type'] == 'reference' - results += self.extract(entry['url']) - return results - -class MTVIE(InfoExtractor): - """Information extractor for MTV.com""" - - _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$' - IE_NAME = u'mtv' - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) - if not mobj.group('proto'): - url = 'http://' + url - video_id = mobj.group('videoid') - - webpage = self._download_webpage(url, video_id) - - song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>', - webpage, u'song name', fatal=False) - - video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>', - webpage, u'title') - - mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>', - webpage, u'mtvn_uri', fatal=False) - - content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', - webpage, u'content id', fatal=False) - - videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri - self.report_extraction(video_id) - request = compat_urllib_request.Request(videogen_url) - try: - metadataXml = compat_urllib_request.urlopen(request).read() - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err)) - - mdoc = xml.etree.ElementTree.fromstring(metadataXml) - renditions = mdoc.findall('.//rendition') - - # For now, always pick the highest quality. - rendition = renditions[-1] - - try: - _,_,ext = rendition.attrib['type'].partition('/') - format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate'] - video_url = rendition.find('./src').text - except KeyError: - raise ExtractorError('Invalid rendition field.') - - info = { - 'id': video_id, - 'url': video_url, - 'uploader': performer, - 'upload_date': None, - 'title': video_title, - 'ext': ext, - 'format': format, - } - - return [info] class YoukuIE(InfoExtractor): @@ -527,39 +291,6 @@ class XNXXIE(InfoExtractor): -class NBAIE(InfoExtractor): - _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$' - IE_NAME = u'nba' - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) - - video_id = mobj.group(1) - - webpage = self._download_webpage(url, video_id) - - video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4' - - shortened_video_id = video_id.rpartition('/')[2] - title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"', - webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '') - - # It isn't there in the HTML it returns to us - # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False) - - description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False) - - info = { - 'id': shortened_video_id, - 'url': video_url, - 'ext': 'mp4', - 'title': title, - # 'uploader_date': uploader_date, - 'description': description, - } - return [info] class JustinTVIE(InfoExtractor): """Information extractor for justin.tv and twitch.tv"""