X-Git-Url: http://git.cielonegro.org/gitweb.cgi?a=blobdiff_plain;f=youtube_dl%2FInfoExtractors.py;h=fb10c2ec4aa04c0b14848ca75a028f7fb2d5b7ed;hb=934858ad86f5b628978d3bcdd7edd765d4590840;hp=acf11a96075ff875484c163eb20689e7ae32d535;hpb=b05654f0e36278c8e109e3ab591609275093d04e;p=youtube-dl.git diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index acf11a960..fb10c2ec4 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -1,8 +1,3 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -from __future__ import absolute_import - import base64 import datetime import itertools @@ -26,11 +21,13 @@ from .extractor.common import InfoExtractor, SearchInfoExtractor from .extractor.ard import ARDIE from .extractor.arte import ArteTvIE from .extractor.dailymotion import DailymotionIE +from .extractor.gametrailers import GametrailersIE +from .extractor.generic import GenericIE from .extractor.metacafe import MetacafeIE from .extractor.statigram import StatigramIE from .extractor.photobucket import PhotobucketIE from .extractor.vimeo import VimeoIE -from .extractor.yahoo import YahooIE +from .extractor.yahoo import YahooIE, YahooSearchIE from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE from .extractor.zdf import ZDFIE @@ -44,212 +41,10 @@ from .extractor.zdf import ZDFIE -class GenericIE(InfoExtractor): - """Generic last-resort information extractor.""" - - _VALID_URL = r'.*' - IE_NAME = u'generic' - - def report_download_webpage(self, video_id): - """Report webpage download.""" - if not self._downloader.params.get('test', False): - self._downloader.report_warning(u'Falling back on generic information extractor.') - super(GenericIE, self).report_download_webpage(video_id) - - def report_following_redirect(self, new_url): - """Report information extraction.""" - self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url) - - def _test_redirect(self, url): - """Check if it is a redirect, like url shorteners, in case return the new url.""" - class HeadRequest(compat_urllib_request.Request): - def get_method(self): - return "HEAD" - - class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler): - """ - Subclass the HTTPRedirectHandler to make it use our - HeadRequest also on the redirected URL - """ - def redirect_request(self, req, fp, code, msg, headers, newurl): - if code in (301, 302, 303, 307): - newurl = newurl.replace(' ', '%20') - newheaders = dict((k,v) for k,v in req.headers.items() - if k.lower() not in ("content-length", "content-type")) - return HeadRequest(newurl, - headers=newheaders, - origin_req_host=req.get_origin_req_host(), - unverifiable=True) - else: - raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp) - - class HTTPMethodFallback(compat_urllib_request.BaseHandler): - """ - Fallback to GET if HEAD is not allowed (405 HTTP error) - """ - def http_error_405(self, req, fp, code, msg, headers): - fp.read() - fp.close() - - newheaders = dict((k,v) for k,v in req.headers.items() - if k.lower() not in ("content-length", "content-type")) - return self.parent.open(compat_urllib_request.Request(req.get_full_url(), - headers=newheaders, - origin_req_host=req.get_origin_req_host(), - unverifiable=True)) - - # Build our opener - opener = compat_urllib_request.OpenerDirector() - for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler, - HTTPMethodFallback, HEADRedirectHandler, - compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]: - opener.add_handler(handler()) - - response = opener.open(HeadRequest(url)) - if response is None: - raise ExtractorError(u'Invalid URL protocol') - new_url = response.geturl() - - if url == new_url: - return False - - self.report_following_redirect(new_url) - return new_url - - def _real_extract(self, url): - new_url = self._test_redirect(url) - if new_url: return [self.url_result(new_url)] - - video_id = url.split('/')[-1] - try: - webpage = self._download_webpage(url, video_id) - except ValueError as err: - # since this is the last-resort InfoExtractor, if - # this error is thrown, it'll be thrown here - raise ExtractorError(u'Invalid URL: %s' % url) - - self.report_extraction(video_id) - # Start with something easy: JW Player in SWFObject - mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage) - if mobj is None: - # Broaden the search a little bit - mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage) - if mobj is None: - # Broaden the search a little bit: JWPlayer JS loader - mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage) - if mobj is None: - # Try to find twitter cards info - mobj = re.search(r'(.*)', - webpage, u'video title') - - # video uploader is domain name - video_uploader = self._search_regex(r'(?:https?://)?([^/]*)/.*', - url, u'video uploader') - - return [{ - 'id': video_id, - 'url': video_url, - 'uploader': video_uploader, - 'upload_date': None, - 'title': video_title, - 'ext': video_extension, - }] - - - -class GoogleSearchIE(SearchInfoExtractor): - """Information Extractor for Google Video search queries.""" - _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"' - _MAX_RESULTS = 1000 - IE_NAME = u'video.google:search' - _SEARCH_KEY = 'gvsearch' - - def _get_n_results(self, query, n): - """Get a specified number of results for a query""" - - res = { - '_type': 'playlist', - 'id': query, - 'entries': [] - } - - for pagenum in itertools.count(1): - result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10) - webpage = self._download_webpage(result_url, u'gvsearch:' + query, - note='Downloading result page ' + str(pagenum)) - - for mobj in re.finditer(r'

n) or not re.search(self._MORE_PAGES_INDICATOR, webpage): - return res -class YahooSearchIE(SearchInfoExtractor): - """Information Extractor for Yahoo! Video search queries.""" - _MAX_RESULTS = 1000 - IE_NAME = u'screen.yahoo:search' - _SEARCH_KEY = 'yvsearch' - def _get_n_results(self, query, n): - """Get a specified number of results for a query""" - - res = { - '_type': 'playlist', - 'id': query, - 'entries': [] - } - for pagenum in itertools.count(0): - result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30) - webpage = self._download_webpage(result_url, query, - note='Downloading results page '+str(pagenum+1)) - info = json.loads(webpage) - m = info[u'm'] - results = info[u'results'] - - for (i, r) in enumerate(results): - if (pagenum * 30) +i >= n: - break - mobj = re.search(r'(?Pscreen\.yahoo\.com/.*?-\d*?\.html)"', r) - e = self.url_result('http://' + mobj.group('url'), 'Yahoo') - res['entries'].append(e) - if (pagenum * 30 +i >= n) or (m[u'last'] >= (m[u'total'] -1 )): - break - - return res class BlipTVUserIE(InfoExtractor): @@ -2910,56 +2705,6 @@ class Vbox7IE(InfoExtractor): 'thumbnail': thumbnail_url, }] -class GametrailersIE(InfoExtractor): - _VALID_URL = r'http://www.gametrailers.com/(?Pvideos|reviews|full-episodes)/(?P.*?)/(?P.*)' - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) - video_id = mobj.group('id') - video_type = mobj.group('type') - webpage = self._download_webpage(url, video_id) - if video_type == 'full-episodes': - mgid_re = r'data-video="(?P<mgid>mgid:.*?)"' - else: - mgid_re = r'data-contentId=\'(?P<mgid>mgid:.*?)\'' - mgid = self._search_regex(mgid_re, webpage, u'mgid') - data = compat_urllib_parse.urlencode({'uri': mgid, 'acceptMethods': 'fms'}) - - info_page = self._download_webpage('http://www.gametrailers.com/feeds/mrss?' + data, - video_id, u'Downloading video info') - links_webpage = self._download_webpage('http://www.gametrailers.com/feeds/mediagen/?' + data, - video_id, u'Downloading video urls info') - - self.report_extraction(video_id) - info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]>.* - .*?)\]\]>.* - .* - (?P.*?).* - ''' - - m_info = re.search(info_re, info_page, re.VERBOSE|re.DOTALL) - if m_info is None: - raise ExtractorError(u'Unable to extract video info') - video_title = m_info.group('title') - video_description = m_info.group('description') - video_thumb = m_info.group('thumb') - - m_urls = list(re.finditer(r'(?P.*)', links_webpage)) - if m_urls is None or len(m_urls) == 0: - raise ExtractError(u'Unable to extrat video url') - # They are sorted from worst to best quality - video_url = m_urls[-1].group('url') - - return {'url': video_url, - 'id': video_id, - 'title': video_title, - # Videos are actually flv not mp4 - 'ext': 'flv', - 'thumbnail': video_thumb, - 'description': video_description, - } def gen_extractors(): """ Return a list of an instance of every supported extractor.