X-Git-Url: http://git.cielonegro.org/gitweb.cgi?a=blobdiff_plain;f=youtube_dl%2FInfoExtractors.py;h=0fc39163ee2a74131ed84442c68f388a72fad0f1;hb=f36cd076850faf4b2859a168fcb740dfccb9eed6;hp=c9c563599ea1782b313fbb69dcd60a998fbe4583;hpb=d77c3dfd027e9af4d44fc7109fac0012451268c2;p=youtube-dl.git
diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py
index c9c563599..0fc39163e 100644
--- a/youtube_dl/InfoExtractors.py
+++ b/youtube_dl/InfoExtractors.py
@@ -12,29 +12,15 @@ import time
import urllib
import urllib2
import email.utils
+import xml.etree.ElementTree
+from urlparse import parse_qs
try:
import cStringIO as StringIO
except ImportError:
import StringIO
-# parse_qs was moved from the cgi module to the urlparse module recently.
-try:
- from urlparse import parse_qs
-except ImportError:
- from cgi import parse_qs
-
-try:
- import lxml.etree
-except ImportError:
- pass # Handled below
-
-try:
- import xml.etree.ElementTree
-except ImportError: # Python<2.5: Not officially supported, but let it slip
- warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
-
-from Utils import *
+from utils import *
class InfoExtractor(object):
@@ -53,7 +39,6 @@ class InfoExtractor(object):
url: Final video URL.
uploader: Nickname of the video uploader.
title: Literal title.
- stitle: Simplified title.
ext: Video filename extension.
format: Video format.
player_url: SWF Player URL (may be None).
@@ -117,8 +102,8 @@ class YoutubeIE(InfoExtractor):
_NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
_NETRC_MACHINE = 'youtube'
# Listed in order of quality
- _available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
- _available_formats_prefer_free = ['38', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
+ _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
+ _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
_video_extensions = {
'13': '3gp',
'17': 'mp4',
@@ -129,6 +114,7 @@ class YoutubeIE(InfoExtractor):
'43': 'webm',
'44': 'webm',
'45': 'webm',
+ '46': 'webm',
}
_video_dimensions = {
'5': '240x400',
@@ -144,6 +130,7 @@ class YoutubeIE(InfoExtractor):
'43': '360x640',
'44': '480x854',
'45': '720x1280',
+ '46': '1080x1920',
}
IE_NAME = u'youtube'
@@ -193,8 +180,8 @@ class YoutubeIE(InfoExtractor):
end = start + float(dur)
start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
- caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption)
- caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption) # double cycle, inentional
+ caption = unescapeHTML(caption)
+ caption = unescapeHTML(caption) # double cycle, intentional
srt += str(n) + '\n'
srt += start + ' --> ' + end + '\n'
srt += caption + '\n\n'
@@ -339,10 +326,6 @@ class YoutubeIE(InfoExtractor):
return
video_title = urllib.unquote_plus(video_info['title'][0])
video_title = video_title.decode('utf-8')
- video_title = sanitize_title(video_title)
-
- # simplified title
- simple_title = simplify_title(video_title)
# thumbnail image
if 'thumbnail_url' not in video_info:
@@ -364,49 +347,39 @@ class YoutubeIE(InfoExtractor):
pass
# description
- try:
- lxml.etree
- except NameError:
- video_description = u'No description available.'
- mobj = re.search(r'', video_webpage)
- if mobj is not None:
- video_description = mobj.group(1).decode('utf-8')
- else:
- html_parser = lxml.etree.HTMLParser(encoding='utf-8')
- vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
- video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
- # TODO use another parser
+ video_description = get_element_by_id("eow-description", video_webpage.decode('utf8'))
+ if video_description: video_description = clean_html(video_description)
+ else: video_description = ''
# closed captions
video_subtitles = None
if self._downloader.params.get('writesubtitles', False):
- self.report_video_subtitles_download(video_id)
- request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
try:
- srt_list = urllib2.urlopen(request).read()
- except (urllib2.URLError, httplib.HTTPException, socket.error), err:
- self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
- else:
+ self.report_video_subtitles_download(video_id)
+ request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
+ try:
+ srt_list = urllib2.urlopen(request).read()
+ except (urllib2.URLError, httplib.HTTPException, socket.error), err:
+ raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
srt_lang_list = re.findall(r'lang_code="([\w\-]+)"', srt_list)
- if srt_lang_list:
- if self._downloader.params.get('subtitleslang', False):
- srt_lang = self._downloader.params.get('subtitleslang')
- elif 'en' in srt_lang_list:
- srt_lang = 'en'
- else:
- srt_lang = srt_lang_list[0]
- if not srt_lang in srt_lang_list:
- self._downloader.trouble(u'WARNING: no closed captions found in the specified language')
- else:
- request = urllib2.Request('http://video.google.com/timedtext?hl=en&lang=%s&v=%s' % (srt_lang, video_id))
- try:
- srt_xml = urllib2.urlopen(request).read()
- except (urllib2.URLError, httplib.HTTPException, socket.error), err:
- self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
- else:
- video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
+ if not srt_lang_list:
+ raise Trouble(u'WARNING: video has no closed captions')
+ if self._downloader.params.get('subtitleslang', False):
+ srt_lang = self._downloader.params.get('subtitleslang')
+ elif 'en' in srt_lang_list:
+ srt_lang = 'en'
else:
- self._downloader.trouble(u'WARNING: video has no closed captions')
+ srt_lang = srt_lang_list[0]
+ if not srt_lang in srt_lang_list:
+ raise Trouble(u'WARNING: no closed captions found in the specified language')
+ request = urllib2.Request('http://video.google.com/timedtext?hl=en&lang=%s&v=%s' % (srt_lang, video_id))
+ try:
+ srt_xml = urllib2.urlopen(request).read()
+ except (urllib2.URLError, httplib.HTTPException, socket.error), err:
+ raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
+ video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
+ except Trouble as trouble:
+ self._downloader.trouble(trouble[0])
# token
video_token = urllib.unquote_plus(video_info['token'][0])
@@ -458,31 +431,25 @@ class YoutubeIE(InfoExtractor):
self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
return
+ results = []
for format_param, video_real_url in video_url_list:
- # At this point we have a new video
- self._downloader.increment_downloads()
-
# Extension
video_extension = self._video_extensions.get(format_param, 'flv')
- try:
- # Process video information
- self._downloader.process_info({
- 'id': video_id.decode('utf-8'),
- 'url': video_real_url.decode('utf-8'),
- 'uploader': video_uploader.decode('utf-8'),
- 'upload_date': upload_date,
- 'title': video_title,
- 'stitle': simple_title,
- 'ext': video_extension.decode('utf-8'),
- 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
- 'thumbnail': video_thumbnail.decode('utf-8'),
- 'description': video_description,
- 'player_url': player_url,
- 'subtitles': video_subtitles
- })
- except UnavailableVideoError, err:
- self._downloader.trouble(u'\nERROR: unable to download video')
+ results.append({
+ 'id': video_id.decode('utf-8'),
+ 'url': video_real_url.decode('utf-8'),
+ 'uploader': video_uploader.decode('utf-8'),
+ 'upload_date': upload_date,
+ 'title': video_title,
+ 'ext': video_extension.decode('utf-8'),
+ 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
+ 'thumbnail': video_thumbnail.decode('utf-8'),
+ 'description': video_description,
+ 'player_url': player_url,
+ 'subtitles': video_subtitles
+ })
+ return results
class MetacafeIE(InfoExtractor):
@@ -491,12 +458,10 @@ class MetacafeIE(InfoExtractor):
_VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
_DISCLAIMER = 'http://www.metacafe.com/family_filter/'
_FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
- _youtube_ie = None
IE_NAME = u'metacafe'
- def __init__(self, youtube_ie, downloader=None):
+ def __init__(self, downloader=None):
InfoExtractor.__init__(self, downloader)
- self._youtube_ie = youtube_ie
def report_disclaimer(self):
"""Report disclaimer retrieval."""
@@ -549,14 +514,9 @@ class MetacafeIE(InfoExtractor):
# Check if video comes from YouTube
mobj2 = re.match(r'^yt-(.*)$', video_id)
if mobj2 is not None:
- self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
+ self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
return
- # At this point we have a new video
- self._downloader.increment_downloads()
-
- simple_title = mobj.group(2).decode('utf-8')
-
# Retrieve video webpage to extract further information
request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
try:
@@ -602,7 +562,6 @@ class MetacafeIE(InfoExtractor):
self._downloader.trouble(u'ERROR: unable to extract title')
return
video_title = mobj.group(1).decode('utf-8')
- video_title = sanitize_title(video_title)
mobj = re.search(r'(?ms)By:\s*(.+?)<', webpage)
if mobj is None:
@@ -610,21 +569,16 @@ class MetacafeIE(InfoExtractor):
return
video_uploader = mobj.group(1)
- try:
- # Process video information
- self._downloader.process_info({
- 'id': video_id.decode('utf-8'),
- 'url': video_url.decode('utf-8'),
- 'uploader': video_uploader.decode('utf-8'),
- 'upload_date': u'NA',
- 'title': video_title,
- 'stitle': simple_title,
- 'ext': video_extension.decode('utf-8'),
- 'format': u'NA',
- 'player_url': None,
- })
- except UnavailableVideoError:
- self._downloader.trouble(u'\nERROR: unable to download video')
+ return [{
+ 'id': video_id.decode('utf-8'),
+ 'url': video_url.decode('utf-8'),
+ 'uploader': video_uploader.decode('utf-8'),
+ 'upload_date': u'NA',
+ 'title': video_title,
+ 'ext': video_extension.decode('utf-8'),
+ 'format': u'NA',
+ 'player_url': None,
+ }]
class DailymotionIE(InfoExtractor):
@@ -651,8 +605,6 @@ class DailymotionIE(InfoExtractor):
self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
return
- # At this point we have a new video
- self._downloader.increment_downloads()
video_id = mobj.group(1)
video_extension = 'flv'
@@ -689,8 +641,6 @@ class DailymotionIE(InfoExtractor):
self._downloader.trouble(u'ERROR: unable to extract title')
return
video_title = unescapeHTML(mobj.group('title').decode('utf-8'))
- video_title = sanitize_title(video_title)
- simple_title = simplify_title(video_title)
mobj = re.search(r'(?im)[^<]+?]+?>([^<]+?)', webpage)
if mobj is None:
@@ -698,21 +648,16 @@ class DailymotionIE(InfoExtractor):
return
video_uploader = mobj.group(1)
- try:
- # Process video information
- self._downloader.process_info({
- 'id': video_id.decode('utf-8'),
- 'url': video_url.decode('utf-8'),
- 'uploader': video_uploader.decode('utf-8'),
- 'upload_date': u'NA',
- 'title': video_title,
- 'stitle': simple_title,
- 'ext': video_extension.decode('utf-8'),
- 'format': u'NA',
- 'player_url': None,
- })
- except UnavailableVideoError:
- self._downloader.trouble(u'\nERROR: unable to download video')
+ return [{
+ 'id': video_id.decode('utf-8'),
+ 'url': video_url.decode('utf-8'),
+ 'uploader': video_uploader.decode('utf-8'),
+ 'upload_date': u'NA',
+ 'title': video_title,
+ 'ext': video_extension.decode('utf-8'),
+ 'format': u'NA',
+ 'player_url': None,
+ }]
class GoogleIE(InfoExtractor):
@@ -739,8 +684,6 @@ class GoogleIE(InfoExtractor):
self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
return
- # At this point we have a new video
- self._downloader.increment_downloads()
video_id = mobj.group(1)
video_extension = 'mp4'
@@ -774,8 +717,6 @@ class GoogleIE(InfoExtractor):
self._downloader.trouble(u'ERROR: unable to extract title')
return
video_title = mobj.group(1).decode('utf-8')
- video_title = sanitize_title(video_title)
- simple_title = simplify_title(video_title)
# Extract video description
mobj = re.search(r'([^<]*)', webpage)
@@ -802,21 +743,16 @@ class GoogleIE(InfoExtractor):
else: # we need something to pass to process_info
video_thumbnail = ''
- try:
- # Process video information
- self._downloader.process_info({
- 'id': video_id.decode('utf-8'),
- 'url': video_url.decode('utf-8'),
- 'uploader': u'NA',
- 'upload_date': u'NA',
- 'title': video_title,
- 'stitle': simple_title,
- 'ext': video_extension.decode('utf-8'),
- 'format': u'NA',
- 'player_url': None,
- })
- except UnavailableVideoError:
- self._downloader.trouble(u'\nERROR: unable to download video')
+ return [{
+ 'id': video_id.decode('utf-8'),
+ 'url': video_url.decode('utf-8'),
+ 'uploader': u'NA',
+ 'upload_date': u'NA',
+ 'title': video_title,
+ 'ext': video_extension.decode('utf-8'),
+ 'format': u'NA',
+ 'player_url': None,
+ }]
class PhotobucketIE(InfoExtractor):
@@ -843,8 +779,6 @@ class PhotobucketIE(InfoExtractor):
self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
return
- # At this point we have a new video
- self._downloader.increment_downloads()
video_id = mobj.group(1)
video_extension = 'flv'
@@ -873,26 +807,19 @@ class PhotobucketIE(InfoExtractor):
self._downloader.trouble(u'ERROR: unable to extract title')
return
video_title = mobj.group(1).decode('utf-8')
- video_title = sanitize_title(video_title)
- simple_title = simplify_title(video_title)
video_uploader = mobj.group(2).decode('utf-8')
- try:
- # Process video information
- self._downloader.process_info({
- 'id': video_id.decode('utf-8'),
- 'url': video_url.decode('utf-8'),
- 'uploader': video_uploader,
- 'upload_date': u'NA',
- 'title': video_title,
- 'stitle': simple_title,
- 'ext': video_extension.decode('utf-8'),
- 'format': u'NA',
- 'player_url': None,
- })
- except UnavailableVideoError:
- self._downloader.trouble(u'\nERROR: unable to download video')
+ return [{
+ 'id': video_id.decode('utf-8'),
+ 'url': video_url.decode('utf-8'),
+ 'uploader': video_uploader,
+ 'upload_date': u'NA',
+ 'title': video_title,
+ 'ext': video_extension.decode('utf-8'),
+ 'format': u'NA',
+ 'player_url': None,
+ }]
class YahooIE(InfoExtractor):
@@ -922,8 +849,6 @@ class YahooIE(InfoExtractor):
self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
return
- # At this point we have a new video
- self._downloader.increment_downloads()
video_id = mobj.group(2)
video_extension = 'flv'
@@ -968,7 +893,6 @@ class YahooIE(InfoExtractor):
self._downloader.trouble(u'ERROR: unable to extract video title')
return
video_title = mobj.group(1).decode('utf-8')
- simple_title = simplify_title(video_title)
mobj = re.search(r'', webpage)
if mobj is None:
@@ -1026,25 +950,20 @@ class YahooIE(InfoExtractor):
self._downloader.trouble(u'ERROR: Unable to extract media URL')
return
video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
- video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
-
- try:
- # Process video information
- self._downloader.process_info({
- 'id': video_id.decode('utf-8'),
- 'url': video_url,
- 'uploader': video_uploader,
- 'upload_date': u'NA',
- 'title': video_title,
- 'stitle': simple_title,
- 'ext': video_extension.decode('utf-8'),
- 'thumbnail': video_thumbnail.decode('utf-8'),
- 'description': video_description,
- 'thumbnail': video_thumbnail,
- 'player_url': None,
- })
- except UnavailableVideoError:
- self._downloader.trouble(u'\nERROR: unable to download video')
+ video_url = unescapeHTML(video_url)
+
+ return [{
+ 'id': video_id.decode('utf-8'),
+ 'url': video_url,
+ 'uploader': video_uploader,
+ 'upload_date': u'NA',
+ 'title': video_title,
+ 'ext': video_extension.decode('utf-8'),
+ 'thumbnail': video_thumbnail.decode('utf-8'),
+ 'description': video_description,
+ 'thumbnail': video_thumbnail,
+ 'player_url': None,
+ }]
class VimeoIE(InfoExtractor):
@@ -1072,8 +991,6 @@ class VimeoIE(InfoExtractor):
self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
return
- # At this point we have a new video
- self._downloader.increment_downloads()
video_id = mobj.group(1)
# Retrieve video webpage to extract further information
@@ -1100,7 +1017,6 @@ class VimeoIE(InfoExtractor):
# Extract title
video_title = config["video"]["title"]
- simple_title = simplify_title(video_title)
# Extract uploader
video_uploader = config["video"]["owner"]["name"]
@@ -1109,18 +1025,9 @@ class VimeoIE(InfoExtractor):
video_thumbnail = config["video"]["thumbnail"]
# Extract video description
- try:
- lxml.etree
- except NameError:
- video_description = u'No description available.'
- mobj = re.search(r'', webpage, re.MULTILINE)
- if mobj is not None:
- video_description = mobj.group(1)
- else:
- html_parser = lxml.etree.HTMLParser()
- vwebpage_doc = lxml.etree.parse(StringIO.StringIO(webpage), html_parser)
- video_description = u''.join(vwebpage_doc.xpath('id("description")//text()')).strip()
- # TODO use another parser
+ video_description = get_element_by_id("description", webpage.decode('utf8'))
+ if video_description: video_description = clean_html(video_description)
+ else: video_description = ''
# Extract upload date
video_upload_date = u'NA'
@@ -1149,22 +1056,17 @@ class VimeoIE(InfoExtractor):
video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
%(video_id, sig, timestamp, quality, video_codec.upper())
- try:
- # Process video information
- self._downloader.process_info({
- 'id': video_id,
- 'url': video_url,
- 'uploader': video_uploader,
- 'upload_date': video_upload_date,
- 'title': video_title,
- 'stitle': simple_title,
- 'ext': video_extension,
- 'thumbnail': video_thumbnail,
- 'description': video_description,
- 'player_url': None,
- })
- except UnavailableVideoError:
- self._downloader.trouble(u'ERROR: unable to download video')
+ return [{
+ 'id': video_id,
+ 'url': video_url,
+ 'uploader': video_uploader,
+ 'upload_date': video_upload_date,
+ 'title': video_title,
+ 'ext': video_extension,
+ 'thumbnail': video_thumbnail,
+ 'description': video_description,
+ 'player_url': None,
+ }]
class GenericIE(InfoExtractor):
@@ -1202,16 +1104,16 @@ class GenericIE(InfoExtractor):
"""
def redirect_request(self, req, fp, code, msg, headers, newurl):
if code in (301, 302, 303, 307):
- newurl = newurl.replace(' ', '%20')
- newheaders = dict((k,v) for k,v in req.headers.items()
- if k.lower() not in ("content-length", "content-type"))
- return HeadRequest(newurl,
- headers=newheaders,
- origin_req_host=req.get_origin_req_host(),
- unverifiable=True)
+ newurl = newurl.replace(' ', '%20')
+ newheaders = dict((k,v) for k,v in req.headers.items()
+ if k.lower() not in ("content-length", "content-type"))
+ return HeadRequest(newurl,
+ headers=newheaders,
+ origin_req_host=req.get_origin_req_host(),
+ unverifiable=True)
else:
- raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
-
+ raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
+
class HTTPMethodFallback(urllib2.BaseHandler):
"""
Fallback to GET if HEAD is not allowed (405 HTTP error)
@@ -1221,17 +1123,17 @@ class GenericIE(InfoExtractor):
fp.close()
newheaders = dict((k,v) for k,v in req.headers.items()
- if k.lower() not in ("content-length", "content-type"))
+ if k.lower() not in ("content-length", "content-type"))
return self.parent.open(urllib2.Request(req.get_full_url(),
- headers=newheaders,
- origin_req_host=req.get_origin_req_host(),
- unverifiable=True))
+ headers=newheaders,
+ origin_req_host=req.get_origin_req_host(),
+ unverifiable=True))
# Build our opener
opener = urllib2.OpenerDirector()
for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler,
- HTTPMethodFallback, HEADRedirectHandler,
- urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
+ HTTPMethodFallback, HEADRedirectHandler,
+ urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
opener.add_handler(handler())
response = opener.open(HeadRequest(url))
@@ -1245,9 +1147,6 @@ class GenericIE(InfoExtractor):
def _real_extract(self, url):
if self._test_redirect(url): return
-
- # At this point we have a new video
- self._downloader.increment_downloads()
video_id = url.split('/')[-1]
request = urllib2.Request(url)
@@ -1297,8 +1196,6 @@ class GenericIE(InfoExtractor):
self._downloader.trouble(u'ERROR: unable to extract title')
return
video_title = mobj.group(1).decode('utf-8')
- video_title = sanitize_title(video_title)
- simple_title = simplify_title(video_title)
# video uploader is domain name
mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
@@ -1307,43 +1204,33 @@ class GenericIE(InfoExtractor):
return
video_uploader = mobj.group(1).decode('utf-8')
- try:
- # Process video information
- self._downloader.process_info({
- 'id': video_id.decode('utf-8'),
- 'url': video_url.decode('utf-8'),
- 'uploader': video_uploader,
- 'upload_date': u'NA',
- 'title': video_title,
- 'stitle': simple_title,
- 'ext': video_extension.decode('utf-8'),
- 'format': u'NA',
- 'player_url': None,
- })
- except UnavailableVideoError, err:
- self._downloader.trouble(u'\nERROR: unable to download video')
+ return [{
+ 'id': video_id.decode('utf-8'),
+ 'url': video_url.decode('utf-8'),
+ 'uploader': video_uploader,
+ 'upload_date': u'NA',
+ 'title': video_title,
+ 'ext': video_extension.decode('utf-8'),
+ 'format': u'NA',
+ 'player_url': None,
+ }]
class YoutubeSearchIE(InfoExtractor):
"""Information Extractor for YouTube search queries."""
_VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
_API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
- _youtube_ie = None
_max_youtube_results = 1000
IE_NAME = u'youtube:search'
- def __init__(self, youtube_ie, downloader=None):
+ def __init__(self, downloader=None):
InfoExtractor.__init__(self, downloader)
- self._youtube_ie = youtube_ie
def report_download_page(self, query, pagenum):
"""Report attempt to download playlist page with given number."""
query = query.decode(preferredencoding())
self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
- def _real_initialize(self):
- self._youtube_ie.initialize()
-
def _real_extract(self, query):
mobj = re.match(self._VALID_URL, query)
if mobj is None:
@@ -1401,7 +1288,7 @@ class YoutubeSearchIE(InfoExtractor):
if len(video_ids) > n:
video_ids = video_ids[:n]
for id in video_ids:
- self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
+ self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
return
@@ -1411,22 +1298,17 @@ class GoogleSearchIE(InfoExtractor):
_TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
_VIDEO_INDICATOR = r'\s*Next\s*'
- _youtube_ie = None
IE_NAME = u'youtube:playlist'
- def __init__(self, youtube_ie, downloader=None):
+ def __init__(self, downloader=None):
InfoExtractor.__init__(self, downloader)
- self._youtube_ie = youtube_ie
def report_download_page(self, playlist_id, pagenum):
"""Report attempt to download playlist page with given number."""
self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
- def _real_initialize(self):
- self._youtube_ie.initialize()
-
def _real_extract(self, url):
# Extract playlist id
mobj = re.match(self._VALID_URL, url)
@@ -1611,7 +1483,7 @@ class YoutubePlaylistIE(InfoExtractor):
# Single video case
if mobj.group(3) is not None:
- self._youtube_ie.extract(mobj.group(3))
+ self._downloader.download([mobj.group(3)])
return
# Download playlist pages
@@ -1655,7 +1527,7 @@ class YoutubePlaylistIE(InfoExtractor):
video_ids = video_ids[playliststart:playlistend]
for id in video_ids:
- self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
+ self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
return
@@ -1667,21 +1539,16 @@ class YoutubeUserIE(InfoExtractor):
_GDATA_PAGE_SIZE = 50
_GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
_VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
- _youtube_ie = None
IE_NAME = u'youtube:user'
- def __init__(self, youtube_ie, downloader=None):
+ def __init__(self, downloader=None):
InfoExtractor.__init__(self, downloader)
- self._youtube_ie = youtube_ie
def report_download_page(self, username, start_index):
"""Report attempt to download user page."""
self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
(username, start_index, start_index + self._GDATA_PAGE_SIZE))
- def _real_initialize(self):
- self._youtube_ie.initialize()
-
def _real_extract(self, url):
# Extract username
mobj = re.match(self._VALID_URL, url)
@@ -1744,7 +1611,7 @@ class YoutubeUserIE(InfoExtractor):
(username, all_ids_count, len(video_ids)))
for video_id in video_ids:
- self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
+ self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
class DepositFilesIE(InfoExtractor):
@@ -1765,9 +1632,6 @@ class DepositFilesIE(InfoExtractor):
self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
def _real_extract(self, url):
- # At this point we have a new file
- self._downloader.increment_downloads()
-
file_id = url.split('/')[-1]
# Rebuild url in english locale
url = 'http://depositfiles.com/en/files/' + file_id
@@ -1804,21 +1668,16 @@ class DepositFilesIE(InfoExtractor):
return
file_title = mobj.group(1).decode('utf-8')
- try:
- # Process file information
- self._downloader.process_info({
- 'id': file_id.decode('utf-8'),
- 'url': file_url.decode('utf-8'),
- 'uploader': u'NA',
- 'upload_date': u'NA',
- 'title': file_title,
- 'stitle': file_title,
- 'ext': file_extension.decode('utf-8'),
- 'format': u'NA',
- 'player_url': None,
- })
- except UnavailableVideoError, err:
- self._downloader.trouble(u'ERROR: unable to download file')
+ return [{
+ 'id': file_id.decode('utf-8'),
+ 'url': file_url.decode('utf-8'),
+ 'uploader': u'NA',
+ 'upload_date': u'NA',
+ 'title': file_title,
+ 'ext': file_extension.decode('utf-8'),
+ 'format': u'NA',
+ 'player_url': None,
+ }]
class FacebookIE(InfoExtractor):
@@ -1959,9 +1818,6 @@ class FacebookIE(InfoExtractor):
return
video_title = video_info['title']
video_title = video_title.decode('utf-8')
- video_title = sanitize_title(video_title)
-
- simple_title = simplify_title(video_title)
# thumbnail image
if 'thumbnail' not in video_info:
@@ -2011,31 +1867,24 @@ class FacebookIE(InfoExtractor):
return
video_url_list = [(req_format, url_map[req_format])] # Specific format
+ results = []
for format_param, video_real_url in video_url_list:
-
- # At this point we have a new video
- self._downloader.increment_downloads()
-
# Extension
video_extension = self._video_extensions.get(format_param, 'mp4')
- try:
- # Process video information
- self._downloader.process_info({
- 'id': video_id.decode('utf-8'),
- 'url': video_real_url.decode('utf-8'),
- 'uploader': video_uploader.decode('utf-8'),
- 'upload_date': upload_date,
- 'title': video_title,
- 'stitle': simple_title,
- 'ext': video_extension.decode('utf-8'),
- 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
- 'thumbnail': video_thumbnail.decode('utf-8'),
- 'description': video_description.decode('utf-8'),
- 'player_url': None,
- })
- except UnavailableVideoError, err:
- self._downloader.trouble(u'\nERROR: unable to download video')
+ results.append({
+ 'id': video_id.decode('utf-8'),
+ 'url': video_real_url.decode('utf-8'),
+ 'uploader': video_uploader.decode('utf-8'),
+ 'upload_date': upload_date,
+ 'title': video_title,
+ 'ext': video_extension.decode('utf-8'),
+ 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
+ 'thumbnail': video_thumbnail.decode('utf-8'),
+ 'description': video_description.decode('utf-8'),
+ 'player_url': None,
+ })
+ return results
class BlipTVIE(InfoExtractor):
"""Information extractor for blip.tv"""
@@ -2078,7 +1927,6 @@ class BlipTVIE(InfoExtractor):
'id': title,
'url': url,
'title': title,
- 'stitle': simplify_title(title),
'ext': ext,
'urlhandle': urlh
}
@@ -2098,21 +1946,20 @@ class BlipTVIE(InfoExtractor):
data = json_data['Post']
else:
data = json_data
-
+
upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
video_url = data['media']['url']
umobj = re.match(self._URL_EXT, video_url)
if umobj is None:
raise ValueError('Can not determine filename extension')
ext = umobj.group(1)
-
+
info = {
'id': data['item_id'],
'url': video_url,
'uploader': data['display_name'],
'upload_date': upload_date,
'title': data['title'],
- 'stitle': simplify_title(data['title']),
'ext': ext,
'format': data['media']['mimeType'],
'thumbnail': data['thumbnailUrl'],
@@ -2123,12 +1970,7 @@ class BlipTVIE(InfoExtractor):
self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
return
- self._downloader.increment_downloads()
-
- try:
- self._downloader.process_info(info)
- except UnavailableVideoError, err:
- self._downloader.trouble(u'\nERROR: unable to download video')
+ return [info]
class MyVideoIE(InfoExtractor):
@@ -2179,24 +2021,17 @@ class MyVideoIE(InfoExtractor):
return
video_title = mobj.group(1)
- video_title = sanitize_title(video_title)
- simple_title = simplify_title(video_title)
-
- try:
- self._downloader.process_info({
- 'id': video_id,
- 'url': video_url,
- 'uploader': u'NA',
- 'upload_date': u'NA',
- 'title': video_title,
- 'stitle': simple_title,
- 'ext': u'flv',
- 'format': u'NA',
- 'player_url': None,
- })
- except UnavailableVideoError:
- self._downloader.trouble(u'\nERROR: Unable to download video')
+ return [{
+ 'id': video_id,
+ 'url': video_url,
+ 'uploader': u'NA',
+ 'upload_date': u'NA',
+ 'title': video_title,
+ 'ext': u'flv',
+ 'format': u'NA',
+ 'player_url': None,
+ }]
class ComedyCentralIE(InfoExtractor):
"""Information extractor for The Daily Show and Colbert Report """
@@ -2206,7 +2041,7 @@ class ComedyCentralIE(InfoExtractor):
def report_extraction(self, episode_id):
self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
-
+
def report_config_download(self, episode_id):
self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
@@ -2278,6 +2113,8 @@ class ComedyCentralIE(InfoExtractor):
self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
return
+ results = []
+
idoc = xml.etree.ElementTree.fromstring(indexXml)
itemEls = idoc.findall('.//item')
for itemEl in itemEls:
@@ -2310,8 +2147,6 @@ class ComedyCentralIE(InfoExtractor):
# For now, just pick the highest bitrate
format,video_url = turls[-1]
- self._downloader.increment_downloads()
-
effTitle = showId + u'-' + epTitle
info = {
'id': shortMediaId,
@@ -2319,7 +2154,6 @@ class ComedyCentralIE(InfoExtractor):
'uploader': showId,
'upload_date': officialDate,
'title': effTitle,
- 'stitle': simplify_title(effTitle),
'ext': 'mp4',
'format': format,
'thumbnail': None,
@@ -2327,11 +2161,9 @@ class ComedyCentralIE(InfoExtractor):
'player_url': playerUrl
}
- try:
- self._downloader.process_info(info)
- except UnavailableVideoError, err:
- self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
- continue
+ results.append(info)
+
+ return results
class EscapistIE(InfoExtractor):
@@ -2347,8 +2179,6 @@ class EscapistIE(InfoExtractor):
self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
def _real_extract(self, url):
- htmlParser = HTMLParser.HTMLParser()
-
mobj = re.match(self._VALID_URL, url)
if mobj is None:
self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
@@ -2358,17 +2188,18 @@ class EscapistIE(InfoExtractor):
self.report_extraction(showName)
try:
- webPage = urllib2.urlopen(url).read()
+ webPageBytes = urllib2.urlopen(url).read()
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
return
+ webPage = webPageBytes.decode('utf-8')
descMatch = re.search('([^<]+)', coursepage)
if m:
@@ -2946,13 +2728,13 @@ class StanfordOpenClassroomIE(InfoExtractor):
'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
}
for vpage in links]
-
+ results = []
for entry in info['list']:
assert entry['type'] == 'reference'
- self.extract(entry['url'])
+ results += self.extract(entry['url'])
+ return results
+
else: # Root page
- unescapeHTML = HTMLParser.HTMLParser().unescape
-
info = {
'id': 'Stanford OpenClassroom',
'type': 'playlist',
@@ -2967,7 +2749,6 @@ class StanfordOpenClassroomIE(InfoExtractor):
return
info['title'] = info['id']
- info['stitle'] = simplify_title(info['title'])
links = orderedSet(re.findall('', rootpage))
info['list'] = [
@@ -2977,9 +2758,11 @@ class StanfordOpenClassroomIE(InfoExtractor):
}
for cpage in links]
+ results = []
for entry in info['list']:
assert entry['type'] == 'reference'
- self.extract(entry['url'])
+ results += self.extract(entry['url'])
+ return results
class MTVIE(InfoExtractor):
"""Information extractor for MTV.com"""
@@ -3059,18 +2842,13 @@ class MTVIE(InfoExtractor):
self._downloader.trouble('Invalid rendition field.')
return
- self._downloader.increment_downloads()
info = {
'id': video_id,
'url': video_url,
'uploader': performer,
'title': video_title,
- 'stitle': simplify_title(video_title),
'ext': ext,
'format': format,
}
- try:
- self._downloader.process_info(info)
- except UnavailableVideoError, err:
- self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
+ return [info]