X-Git-Url: http://git.cielonegro.org/gitweb.cgi?a=blobdiff_plain;f=youtube-dl;h=1d7087a34f7ad81f3363659893d022d62c39d3be;hb=67035ede49520a3cab088315976d20d5dcbf2b26;hp=a1245a8b42cb5d14818e99fbb6e1264543590394;hpb=fedf9f390210d0a06f323f0476681b607ee57b0f;p=youtube-dl.git diff --git a/youtube-dl b/youtube-dl index a1245a8b4..1d7087a34 100755 --- a/youtube-dl +++ b/youtube-dl @@ -15,14 +15,15 @@ __author__ = ( ) __license__ = 'Public Domain' -__version__ = '2011.09.06-phihag' +__version__ = '2011.09.18' -UPDATE_URL = 'https://raw.github.com/phihag/youtube-dl/master/youtube-dl' +UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl' import cookielib import datetime import gzip import htmlentitydefs +import HTMLParser import httplib import locale import math @@ -65,8 +66,8 @@ except ImportError: try: import xml.etree.ElementTree -except ImportError: # Python<2.5 - pass # Not officially supported, but let it slip +except ImportError: # Python<2.5: Not officially supported, but let it slip + warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.') std_headers = { 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1', @@ -437,6 +438,8 @@ class FileDownloader(object): noprogress: Do not print the progress bar. playliststart: Playlist item to start at. playlistend: Playlist item to end at. + matchtitle: Download only matching titles. + rejecttitle: Reject downloads for matching titles. logtostderr: Log messages to stderr instead of stdout. consoletitle: Display progress in console window's titlebar. nopart: Do not use temporary .part files. @@ -622,11 +625,12 @@ class FileDownloader(object): return filetime = timeconvert(timestr) if filetime is None: - return + return filetime try: os.utime(filename, (time.time(), filetime)) except: pass + return filetime def report_writedescription(self, descfn): """ Report that the description file is being written """ @@ -694,24 +698,38 @@ class FileDownloader(object): def process_info(self, info_dict): """Process a single dictionary returned by an InfoExtractor.""" filename = self.prepare_filename(info_dict) + + # Forced printings + if self.params.get('forcetitle', False): + print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace') + if self.params.get('forceurl', False): + print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace') + if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict: + print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace') + if self.params.get('forcedescription', False) and 'description' in info_dict: + print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace') + if self.params.get('forcefilename', False) and filename is not None: + print filename.encode(preferredencoding(), 'xmlcharrefreplace') + if self.params.get('forceformat', False): + print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace') + # Do nothing else if in simulate mode if self.params.get('simulate', False): - # Forced printings - if self.params.get('forcetitle', False): - print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace') - if self.params.get('forceurl', False): - print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace') - if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict: - print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace') - if self.params.get('forcedescription', False) and 'description' in info_dict: - print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace') - if self.params.get('forcefilename', False) and filename is not None: - print filename.encode(preferredencoding(), 'xmlcharrefreplace') - return if filename is None: return + + matchtitle=self.params.get('matchtitle',False) + rejecttitle=self.params.get('rejecttitle',False) + title=info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace') + if matchtitle and not re.search(matchtitle, title, re.IGNORECASE): + self.to_screen(u'[download] "%s" title did not match pattern "%s"' % (title, matchtitle)) + return + if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE): + self.to_screen(u'[download] "%s" title matched reject pattern "%s"' % (title, rejecttitle)) + return + if self.params.get('nooverwrites', False) and os.path.exists(filename): self.to_stderr(u'WARNING: file exists and will be skipped') return @@ -755,23 +773,24 @@ class FileDownloader(object): self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn) return - try: - success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None)) - except (OSError, IOError), err: - raise UnavailableVideoError - except (urllib2.URLError, httplib.HTTPException, socket.error), err: - self.trouble(u'ERROR: unable to download video data: %s' % str(err)) - return - except (ContentTooShortError, ), err: - self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded)) - return - - if success: + if not self.params.get('skip_download', False): try: - self.post_process(filename, info_dict) - except (PostProcessingError), err: - self.trouble(u'ERROR: postprocessing: %s' % str(err)) + success = self._do_download(filename, info_dict) + except (OSError, IOError), err: + raise UnavailableVideoError + except (urllib2.URLError, httplib.HTTPException, socket.error), err: + self.trouble(u'ERROR: unable to download video data: %s' % str(err)) + return + except (ContentTooShortError, ), err: + self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded)) return + + if success: + try: + self.post_process(filename, info_dict) + except (PostProcessingError), err: + self.trouble(u'ERROR: postprocessing: %s' % str(err)) + return def download(self, url_list): """Download a given list of URLs.""" @@ -822,7 +841,7 @@ class FileDownloader(object): # Download using rtmpdump. rtmpdump returns exit code 2 when # the connection was interrumpted and resuming appears to be # possible. This is part of rtmpdump's normal usage, AFAIK. - basic_args = ['rtmpdump'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename] + basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename] retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)]) while retval == 2 or retval == 1: prevsize = os.path.getsize(tmpfilename) @@ -832,6 +851,11 @@ class FileDownloader(object): cursize = os.path.getsize(tmpfilename) if prevsize == cursize and retval == 1: break + # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those + if prevsize == cursize and retval == 2 and cursize > 1024: + self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.') + retval = 0 + break if retval == 0: self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename)) self.try_rename(tmpfilename, filename) @@ -840,7 +864,10 @@ class FileDownloader(object): self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval) return False - def _do_download(self, filename, url, player_url): + def _do_download(self, filename, info_dict): + url = info_dict['url'] + player_url = info_dict.get('player_url', None) + # Check file already present if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False): self.report_file_already_downloaded(filename) @@ -852,7 +879,6 @@ class FileDownloader(object): tmpfilename = self.temp_name(filename) stream = None - open_mode = 'wb' # Do not include the Accept-Encoding header headers = {'Youtubedl-no-compression': 'True'} @@ -865,11 +891,14 @@ class FileDownloader(object): else: resume_len = 0 - # Request parameters in case of being able to resume - if self.params.get('continuedl', False) and resume_len != 0: - self.report_resuming_byte(resume_len) - request.add_header('Range', 'bytes=%d-' % resume_len) - open_mode = 'ab' + open_mode = 'wb' + if resume_len != 0: + if self.params.get('continuedl', False): + self.report_resuming_byte(resume_len) + request.add_header('Range','bytes=%d-' % resume_len) + open_mode = 'ab' + else: + resume_len = 0 count = 0 retries = self.params.get('retries', 0) @@ -953,10 +982,13 @@ class FileDownloader(object): block_size = self.best_block_size(after - before, len(data_block)) # Progress message - percent_str = self.calc_percent(byte_counter, data_len) - eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len) speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len) - self.report_progress(percent_str, data_len_str, speed_str, eta_str) + if data_len is None: + self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA') + else: + percent_str = self.calc_percent(byte_counter, data_len) + eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len) + self.report_progress(percent_str, data_len_str, speed_str, eta_str) # Apply rate limit self.slow_down(start, byte_counter - resume_len) @@ -972,7 +1004,7 @@ class FileDownloader(object): # Update file modification time if self.params.get('updatetime', True): - self.try_utime(filename, data.info().get('last-modified', None)) + info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None)) return True @@ -1007,9 +1039,8 @@ class InfoExtractor(object): description: One-line video description. Subclasses of this one should re-define the _real_initialize() and - _real_extract() methods, as well as the suitable() static method. - Probably, they should also be instantiated and added to the main - downloader. + _real_extract() methods and define a _VALID_URL regexp. + Probably, they should also be added to the list of extractors. """ _ready = False @@ -1020,10 +1051,9 @@ class InfoExtractor(object): self._ready = False self.set_downloader(downloader) - @staticmethod - def suitable(url): + def suitable(self, url): """Receives a URL and returns True if suitable for this IE.""" - return False + return re.match(self._VALID_URL, url) is not None def initialize(self): """Initializes an instance (authentication, etc).""" @@ -1069,10 +1099,7 @@ class YoutubeIE(InfoExtractor): '43': 'webm', '45': 'webm', } - - @staticmethod - def suitable(url): - return (re.match(YoutubeIE._VALID_URL, url) is not None) + IE_NAME = u'youtube' def report_lang(self): """Report attempt to set language.""" @@ -1299,16 +1326,24 @@ class YoutubeIE(InfoExtractor): if len(existing_formats) == 0: self._downloader.trouble(u'ERROR: no known formats available for video') return - if req_format is None: + if req_format is None or req_format == 'best': video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality - elif req_format == '-1': + elif req_format == 'worst': + video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality + elif req_format in ('-1', 'all'): video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats else: - # Specific format - if req_format not in url_map: + # Specific formats. We pick the first in a slash-delimeted sequence. + # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'. + req_formats = req_format.split('/') + video_url_list = None + for rf in req_formats: + if rf in url_map: + video_url_list = [(rf, url_map[rf])] + break + if video_url_list is None: self._downloader.trouble(u'ERROR: requested format not available') return - video_url_list = [(req_format, url_map[req_format])] # Specific format else: self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info') return @@ -1346,15 +1381,12 @@ class MetacafeIE(InfoExtractor): _DISCLAIMER = 'http://www.metacafe.com/family_filter/' _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user' _youtube_ie = None + IE_NAME = u'metacafe' def __init__(self, youtube_ie, downloader=None): InfoExtractor.__init__(self, downloader) self._youtube_ie = youtube_ie - @staticmethod - def suitable(url): - return (re.match(MetacafeIE._VALID_URL, url) is not None) - def report_disclaimer(self): """Report disclaimer retrieval.""" self._downloader.to_screen(u'[metacafe] Retrieving disclaimer') @@ -1488,14 +1520,11 @@ class DailymotionIE(InfoExtractor): """Information Extractor for Dailymotion""" _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)' + IE_NAME = u'dailymotion' def __init__(self, downloader=None): InfoExtractor.__init__(self, downloader) - @staticmethod - def suitable(url): - return (re.match(DailymotionIE._VALID_URL, url) is not None) - def report_download_webpage(self, video_id): """Report webpage download.""" self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id) @@ -1523,6 +1552,7 @@ class DailymotionIE(InfoExtractor): # Retrieve video webpage to extract further information request = urllib2.Request(url) + request.add_header('Cookie', 'family_filter=off') try: self.report_download_webpage(video_id) webpage = urllib2.urlopen(request).read() @@ -1532,25 +1562,29 @@ class DailymotionIE(InfoExtractor): # Extract URL, uploader and title from webpage self.report_extraction(video_id) - mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage) + mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage) if mobj is None: self._downloader.trouble(u'ERROR: unable to extract media URL') return - mediaURL = urllib.unquote(mobj.group(1)) + sequence = urllib.unquote(mobj.group(1)) + mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence) + if mobj is None: + self._downloader.trouble(u'ERROR: unable to extract media URL') + return + mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '') # if needed add http://www.dailymotion.com/ if relative URL video_url = mediaURL - # '' - mobj = re.search(r'(?im)Dailymotion\s*[\-:]\s*(.+?)', webpage) + mobj = re.search(r'(?im)Dailymotion\s*-\s*(.+)\s*-\s*[^<]+?', webpage) if mobj is None: self._downloader.trouble(u'ERROR: unable to extract title') return video_title = mobj.group(1).decode('utf-8') video_title = sanitize_title(video_title) - mobj = re.search(r'(?im)(.+?)', webpage) + mobj = re.search(r'(?im)[^<]+?]+?>([^<]+?)', webpage) if mobj is None: self._downloader.trouble(u'ERROR: unable to extract uploader nickname') return @@ -1577,14 +1611,11 @@ class GoogleIE(InfoExtractor): """Information extractor for video.google.com.""" _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*' + IE_NAME = u'video.google' def __init__(self, downloader=None): InfoExtractor.__init__(self, downloader) - @staticmethod - def suitable(url): - return (re.match(GoogleIE._VALID_URL, url) is not None) - def report_download_webpage(self, video_id): """Report webpage download.""" self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id) @@ -1687,14 +1718,11 @@ class PhotobucketIE(InfoExtractor): """Information extractor for photobucket.com.""" _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)' + IE_NAME = u'photobucket' def __init__(self, downloader=None): InfoExtractor.__init__(self, downloader) - @staticmethod - def suitable(url): - return (re.match(PhotobucketIE._VALID_URL, url) is not None) - def report_download_webpage(self, video_id): """Report webpage download.""" self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id) @@ -1772,14 +1800,11 @@ class YahooIE(InfoExtractor): # _VPAGE_URL matches only the extractable '/watch/' URLs _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?' _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?' + IE_NAME = u'video.yahoo' def __init__(self, downloader=None): InfoExtractor.__init__(self, downloader) - @staticmethod - def suitable(url): - return (re.match(YahooIE._VALID_URL, url) is not None) - def report_download_webpage(self, video_id): """Report webpage download.""" self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id) @@ -1917,7 +1942,6 @@ class YahooIE(InfoExtractor): 'thumbnail': video_thumbnail.decode('utf-8'), 'description': video_description, 'thumbnail': video_thumbnail, - 'description': video_description, 'player_url': None, }) except UnavailableVideoError: @@ -1929,14 +1953,11 @@ class VimeoIE(InfoExtractor): # _VALID_URL matches Vimeo URLs _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)' + IE_NAME = u'vimeo' def __init__(self, downloader=None): InfoExtractor.__init__(self, downloader) - @staticmethod - def suitable(url): - return (re.match(VimeoIE._VALID_URL, url) is not None) - def report_download_webpage(self, video_id): """Report webpage download.""" self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id) @@ -2043,13 +2064,12 @@ class VimeoIE(InfoExtractor): class GenericIE(InfoExtractor): """Generic last-resort information extractor.""" + _VALID_URL = r'.*' + IE_NAME = u'generic' + def __init__(self, downloader=None): InfoExtractor.__init__(self, downloader) - @staticmethod - def suitable(url): - return True - def report_download_webpage(self, video_id): """Report webpage download.""" self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.') @@ -2143,21 +2163,18 @@ class GenericIE(InfoExtractor): class YoutubeSearchIE(InfoExtractor): """Information Extractor for YouTube search queries.""" - _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+' + _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+' _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en' _VIDEO_INDICATOR = r'href="/watch\?v=.+?"' _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*' _youtube_ie = None _max_youtube_results = 1000 + IE_NAME = u'youtube:search' def __init__(self, youtube_ie, downloader=None): InfoExtractor.__init__(self, downloader) self._youtube_ie = youtube_ie - @staticmethod - def suitable(url): - return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None) - def report_download_page(self, query, pagenum): """Report attempt to download playlist page with given number.""" query = query.decode(preferredencoding()) @@ -2167,7 +2184,7 @@ class YoutubeSearchIE(InfoExtractor): self._youtube_ie.initialize() def _real_extract(self, query): - mobj = re.match(self._VALID_QUERY, query) + mobj = re.match(self._VALID_URL, query) if mobj is None: self._downloader.trouble(u'ERROR: invalid search query "%s"' % query) return @@ -2235,21 +2252,18 @@ class YoutubeSearchIE(InfoExtractor): class GoogleSearchIE(InfoExtractor): """Information Extractor for Google Video search queries.""" - _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+' + _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+' _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en' _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&' _MORE_PAGES_INDICATOR = r'Next' _google_ie = None _max_google_results = 1000 + IE_NAME = u'video.google:search' def __init__(self, google_ie, downloader=None): InfoExtractor.__init__(self, downloader) self._google_ie = google_ie - @staticmethod - def suitable(url): - return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None) - def report_download_page(self, query, pagenum): """Report attempt to download playlist page with given number.""" query = query.decode(preferredencoding()) @@ -2259,7 +2273,7 @@ class GoogleSearchIE(InfoExtractor): self._google_ie.initialize() def _real_extract(self, query): - mobj = re.match(self._VALID_QUERY, query) + mobj = re.match(self._VALID_URL, query) if mobj is None: self._downloader.trouble(u'ERROR: invalid search query "%s"' % query) return @@ -2327,21 +2341,18 @@ class GoogleSearchIE(InfoExtractor): class YahooSearchIE(InfoExtractor): """Information Extractor for Yahoo! Video search queries.""" - _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+' + _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+' _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s' _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"' _MORE_PAGES_INDICATOR = r'\s*Next' _yahoo_ie = None _max_yahoo_results = 1000 + IE_NAME = u'video.yahoo:search' def __init__(self, yahoo_ie, downloader=None): InfoExtractor.__init__(self, downloader) self._yahoo_ie = yahoo_ie - @staticmethod - def suitable(url): - return (re.match(YahooSearchIE._VALID_QUERY, url) is not None) - def report_download_page(self, query, pagenum): """Report attempt to download playlist page with given number.""" query = query.decode(preferredencoding()) @@ -2351,7 +2362,7 @@ class YahooSearchIE(InfoExtractor): self._yahoo_ie.initialize() def _real_extract(self, query): - mobj = re.match(self._VALID_QUERY, query) + mobj = re.match(self._VALID_URL, query) if mobj is None: self._downloader.trouble(u'ERROR: invalid search query "%s"' % query) return @@ -2425,15 +2436,12 @@ class YoutubePlaylistIE(InfoExtractor): _VIDEO_INDICATOR = r'/watch\?v=(.+?)&' _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*' _youtube_ie = None + IE_NAME = u'youtube:playlist' def __init__(self, youtube_ie, downloader=None): InfoExtractor.__init__(self, downloader) self._youtube_ie = youtube_ie - @staticmethod - def suitable(url): - return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None) - def report_download_page(self, playlist_id, pagenum): """Report attempt to download playlist page with given number.""" self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum)) @@ -2503,15 +2511,12 @@ class YoutubeUserIE(InfoExtractor): _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d' _VIDEO_INDICATOR = r'/watch\?v=(.+?)&' _youtube_ie = None + IE_NAME = u'youtube:user' def __init__(self, youtube_ie, downloader=None): InfoExtractor.__init__(self, downloader) self._youtube_ie = youtube_ie - @staticmethod - def suitable(url): - return (re.match(YoutubeUserIE._VALID_URL, url) is not None) - def report_download_page(self, username, start_index): """Report attempt to download user page.""" self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' % @@ -2589,14 +2594,11 @@ class DepositFilesIE(InfoExtractor): """Information extractor for depositfiles.com""" _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)' + IE_NAME = u'DepositFiles' def __init__(self, downloader=None): InfoExtractor.__init__(self, downloader) - @staticmethod - def suitable(url): - return (re.match(DepositFilesIE._VALID_URL, url) is not None) - def report_download_webpage(self, file_id): """Report webpage download.""" self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id) @@ -2676,14 +2678,11 @@ class FacebookIE(InfoExtractor): 'highqual': 'mp4', 'lowqual': 'mp4', } + IE_NAME = u'facebook' def __init__(self, downloader=None): InfoExtractor.__init__(self, downloader) - @staticmethod - def suitable(url): - return (re.match(FacebookIE._VALID_URL, url) is not None) - def _reporter(self, message): """Add header and report message.""" self._downloader.to_screen(u'[facebook] %s' % message) @@ -2849,6 +2848,8 @@ class FacebookIE(InfoExtractor): return if req_format is None: video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality + elif req_format == 'worst': + video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality elif req_format == '-1': video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats else: @@ -2889,10 +2890,7 @@ class BlipTVIE(InfoExtractor): _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$' _URL_EXT = r'^.*\.([a-z0-9]+)$' - - @staticmethod - def suitable(url): - return (re.match(BlipTVIE._VALID_URL, url) is not None) + IE_NAME = u'blip.tv' def report_extraction(self, file_id): """Report information extraction.""" @@ -2964,14 +2962,11 @@ class MyVideoIE(InfoExtractor): """Information Extractor for myvideo.de.""" _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*' + IE_NAME = u'myvideo' def __init__(self, downloader=None): InfoExtractor.__init__(self, downloader) - @staticmethod - def suitable(url): - return (re.match(MyVideoIE._VALID_URL, url) is not None) - def report_download_webpage(self, video_id): """Report webpage download.""" self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id) @@ -3037,13 +3032,10 @@ class MyVideoIE(InfoExtractor): self._downloader.trouble(u'\nERROR: Unable to download video') class ComedyCentralIE(InfoExtractor): - """Information extractor for blip.tv""" - - _VALID_URL = r'^(?:https?://)?(www\.)?(thedailyshow|colbertnation)\.com/full-episodes/(.*)$' + """Information extractor for The Daily Show and Colbert Report """ - @staticmethod - def suitable(url): - return (re.match(ComedyCentralIE._VALID_URL, url) is not None) + _VALID_URL = r'^(:(?Ptds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?Pthedailyshow|colbertnation)\.com/full-episodes/(?P.*)$' + IE_NAME = u'comedycentral' def report_extraction(self, episode_id): self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id) @@ -3051,6 +3043,9 @@ class ComedyCentralIE(InfoExtractor): def report_config_download(self, episode_id): self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id) + def report_index_download(self, episode_id): + self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id) + def report_player_url(self, episode_id): self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id) @@ -3064,36 +3059,72 @@ class ComedyCentralIE(InfoExtractor): if mobj is None: self._downloader.trouble(u'ERROR: invalid URL: %s' % url) return - epTitle = mobj.group(3) + + if mobj.group('shortname'): + if mobj.group('shortname') in ('tds', 'thedailyshow'): + url = 'http://www.thedailyshow.com/full-episodes/' + else: + url = 'http://www.colbertnation.com/full-episodes/' + mobj = re.match(self._VALID_URL, url) + assert mobj is not None + + dlNewest = not mobj.group('episode') + if dlNewest: + epTitle = mobj.group('showname') + else: + epTitle = mobj.group('episode') req = urllib2.Request(url) self.report_extraction(epTitle) try: - html = urllib2.urlopen(req).read() + htmlHandle = urllib2.urlopen(req) + html = htmlHandle.read() except (urllib2.URLError, httplib.HTTPException, socket.error), err: self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err)) return + if dlNewest: + url = htmlHandle.geturl() + mobj = re.match(self._VALID_URL, url) + if mobj is None: + self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url) + return + if mobj.group('episode') == '': + self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url) + return + epTitle = mobj.group('episode') - mMovieParams = re.findall('', html) + mMovieParams = re.findall('', html) if len(mMovieParams) == 0: self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url) return - ACT_COUNT = 4 - first_player_url = mMovieParams[0][0] - mediaNum = int(mMovieParams[0][2]) - ACT_COUNT - movieId = mMovieParams[0][1] - playerReq = urllib2.Request(first_player_url) + playerUrl_raw = mMovieParams[0][0] self.report_player_url(epTitle) try: - playerResponse = urllib2.urlopen(playerReq) + urlHandle = urllib2.urlopen(playerUrl_raw) + playerUrl = urlHandle.geturl() + except (urllib2.URLError, httplib.HTTPException, socket.error), err: + self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err)) + return + + uri = mMovieParams[0][1] + indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri}) + self.report_index_download(epTitle) + try: + indexXml = urllib2.urlopen(indexUrl).read() except (urllib2.URLError, httplib.HTTPException, socket.error), err: - self._downloader.trouble(u'ERROR: unable to download player: %s' % unicode(err)) + self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err)) return - player_url = playerResponse.geturl() - for actNum in range(ACT_COUNT): - mediaId = movieId + str(mediaNum + actNum) + idoc = xml.etree.ElementTree.fromstring(indexXml) + itemEls = idoc.findall('.//item') + for itemEl in itemEls: + mediaId = itemEl.findall('./guid')[0].text + shortMediaId = mediaId.split(':')[-1] + showId = mediaId.split(':')[-2].replace('.com', '') + officialTitle = itemEl.findall('./title')[0].text + officialDate = itemEl.findall('./pubDate')[0].text + configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' + urllib.urlencode({'uri': mediaId})) configReq = urllib2.Request(configUrl) @@ -3103,36 +3134,126 @@ class ComedyCentralIE(InfoExtractor): except (urllib2.URLError, httplib.HTTPException, socket.error), err: self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err)) return - + cdoc = xml.etree.ElementTree.fromstring(configXml) turls = [] for rendition in cdoc.findall('.//rendition'): finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text) turls.append(finfo) + if len(turls) == 0: + self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found') + continue + # For now, just pick the highest bitrate format,video_url = turls[-1] self._downloader.increment_downloads() - actTitle = 'act' + str(actNum+1) + + effTitle = showId + '-' + epTitle info = { - 'id': epTitle, + 'id': shortMediaId, 'url': video_url, - 'uploader': 'NA', - 'upload_date': 'NA', - 'title': actTitle, - 'stitle': self._simplify_title(actTitle), + 'uploader': showId, + 'upload_date': officialDate, + 'title': effTitle, + 'stitle': self._simplify_title(effTitle), 'ext': 'mp4', 'format': format, 'thumbnail': None, - 'description': 'TODO: Not yet supported', - 'player_url': player_url + 'description': officialTitle, + 'player_url': playerUrl } - + try: self._downloader.process_info(info) except UnavailableVideoError, err: - self._downloader.trouble(u'\nERROR: unable to download video') + self._downloader.trouble(u'\nERROR: unable to download ' + mediaId) + continue + + +class EscapistIE(InfoExtractor): + """Information extractor for The Escapist """ + + _VALID_URL = r'^(https?://)?(www\.)escapistmagazine.com/videos/view/(?P[^/]+)/(?P[^/?]+)[/?].*$' + IE_NAME = u'escapist' + + def report_extraction(self, showName): + self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName) + + def report_config_download(self, showName): + self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName) + + def _simplify_title(self, title): + res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title) + res = res.strip(ur'_') + return res + + def _real_extract(self, url): + htmlParser = HTMLParser.HTMLParser() + + mobj = re.match(self._VALID_URL, url) + if mobj is None: + self._downloader.trouble(u'ERROR: invalid URL: %s' % url) + return + showName = mobj.group('showname') + videoId = mobj.group('episode') + + self.report_extraction(showName) + try: + webPage = urllib2.urlopen(url).read() + except (urllib2.URLError, httplib.HTTPException, socket.error), err: + self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err)) + return + + descMatch = re.search('