X-Git-Url: http://git.cielonegro.org/gitweb.cgi?a=blobdiff_plain;f=youtube-dl;h=20e26f99c06d7b70fda144c6a95a9b5655caa3a9;hb=b0eddb2eb4b8cad325ba3224c8550a21f67e4315;hp=bf6973480940cb5d3aaacbb571b2b8d92106b24f;hpb=79e75f66c88ee1de6018e518bcc1a33cd279f697;p=youtube-dl.git

diff --git a/youtube-dl b/youtube-dl
index bf6973480..20e26f99c 100755
--- a/youtube-dl
+++ b/youtube-dl
@@ -18,8 +18,8 @@ import time
 import urllib
 import urllib2
 
-std_headers = {
-	'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.5) Gecko/2008120122 Firefox/3.0.5',
+std_headers = {
+	'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.0.8) Gecko/2009032609 Firefox/3.0.8',
 	'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
 	'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
 	'Accept-Language': 'en-us,en;q=0.5',
@@ -239,6 +239,48 @@ class FileDownloader(object):
 		"""Report download finished."""
 		self.to_stdout(u'')
 
+	def process_info(self, info_dict):
+		"""Process a single dictionary returned by an InfoExtractor."""
+		# Forced printings
+		if self.params.get('forcetitle', False):
+			print info_dict['title']
+		if self.params.get('forceurl', False):
+			print info_dict['url']
+
+		# Do nothing else if in simulate mode
+		if self.params.get('simulate', False):
+			return 0
+
+		try:
+			filename = self.params['outtmpl'] % info_dict
+			self.report_destination(filename)
+		except (ValueError, KeyError), err:
+			return self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
+		if self.params['nooverwrites'] and os.path.exists(filename):
+			self.to_stderr('WARNING: file exists: %s; skipping' % filename)
+			return 0
+		try:
+			self.pmkdir(filename)
+		except (OSError, IOError), err:
+			return self.trouble('ERROR: unable to create directories: %s' % str(err))
+		try:
+			outstream = open(filename, 'wb')
+		except (OSError, IOError), err:
+			return self.trouble('ERROR: unable to open for writing: %s' % str(err))
+		try:
+			self._do_download(outstream, info_dict['url'])
+			outstream.close()
+		except (OSError, IOError), err:
+			return self.trouble('ERROR: unable to write video data: %s' % str(err))
+		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
+			return self.trouble('ERROR: unable to download video data: %s' % str(err))
+		try:
+			self.post_process(filename, info_dict)
+		except (PostProcessingError), err:
+			return self.trouble('ERROR: postprocessing: %s' % str(err))
+
+		return 0
+
 	def download(self, url_list):
 		"""Download a given list of URLs."""
 		retcode = 0
@@ -248,64 +290,36 @@ class FileDownloader(object):
 		for url in url_list:
 			suitable_found = False
 			for ie in self._ies:
+				# Go to next InfoExtractor if not suitable
 				if not ie.suitable(url):
 					continue
+
+				# Suitable InfoExtractor found
 				suitable_found = True
+
+				# Extract information from URL
 				all_results = ie.extract(url)
 				results = [x for x in all_results if x is not None]
+
+				# See if there were problems extracting any information
 				if len(results) != len(all_results):
 					retcode = self.trouble()
+
+				# Two results could go to the same file
 				if len(results) > 1 and self.fixed_template():
 					raise SameFileError(self.params['outtmpl'])
+
+				# Process each result
 				for result in results:
-					# Forced printings
-					if self.params.get('forcetitle', False):
-						print result['title']
-					if self.params.get('forceurl', False):
-						print result['url']
-
-					# Do nothing else if in simulate mode
-					if self.params.get('simulate', False):
-						continue
-
-					try:
-						filename = self.params['outtmpl'] % result
-						self.report_destination(filename)
-					except (ValueError, KeyError), err:
-						retcode = self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
-						continue
-					if self.params['nooverwrites'] and os.path.exists(filename):
-						self.to_stderr('WARNING: file exists: %s; skipping' % filename)
-						continue
-					try:
-						self.pmkdir(filename)
-					except (OSError, IOError), err:
-						retcode = self.trouble('ERROR: unable to create directories: %s' % str(err))
-						continue
-					try:
-						outstream = open(filename, 'wb')
-					except (OSError, IOError), err:
-						retcode = self.trouble('ERROR: unable to open for writing: %s' % str(err))
-						continue
-					try:
-						self._do_download(outstream, result['url'])
-						outstream.close()
-					except (OSError, IOError), err:
-						retcode = self.trouble('ERROR: unable to write video data: %s' % str(err))
-						continue
-					except (urllib2.URLError, httplib.HTTPException, socket.error), err:
-						retcode = self.trouble('ERROR: unable to download video data: %s' % str(err))
-						continue
-					try:
-						self.post_process(filename, result)
-					except (PostProcessingError), err:
-						retcode = self.trouble('ERROR: postprocessing: %s' % str(err))
-						continue
+					result = self.process_info(result)
+
+					# Do not overwrite an error code with a success code
+					if result != 0:
+						retcode = result
 
+				# Suitable InfoExtractor had been found; go to next URL
 				break
+
 			if not suitable_found:
 				retcode = self.trouble('ERROR: no suitable InfoExtractor: %s' % url)
@@ -435,6 +449,29 @@ class YoutubeIE(InfoExtractor):
 	def suitable(url):
 		return (re.match(YoutubeIE._VALID_URL, url) is not None)
 
+	@staticmethod
+	def htmlentity_transform(matchobj):
+		"""Transforms an HTML entity to a Unicode character."""
+		entity = matchobj.group(1)
+
+		# Known non-numeric HTML entity
+		if entity in htmlentitydefs.name2codepoint:
+			return unichr(htmlentitydefs.name2codepoint[entity])
+
+		# Unicode character
+		mobj = re.match(ur'(?u)#(x?\d+)', entity)
+		if mobj is not None:
+			numstr = mobj.group(1)
+			if numstr.startswith(u'x'):
+				base = 16
+				numstr = u'0%s' % numstr
+			else:
+				base = 10
+			return unichr(long(numstr, base))
+
+		# Unknown entity in name, return its literal representation
+		return (u'&%s;' % entity)
+
 	def report_lang(self):
 		"""Report attempt to set language."""
 		self.to_stdout(u'[youtube] Setting language')
@@ -458,7 +495,7 @@ class YoutubeIE(InfoExtractor):
 	def report_video_url(self, video_id, video_real_url):
 		"""Report extracted video URL."""
 		self.to_stdout(u'[youtube] %s: URL: %s' % (video_id, video_real_url))
-	
+
 	def _real_initialize(self):
 		if self._downloader is None:
 			return
@@ -541,6 +578,8 @@ class YoutubeIE(InfoExtractor):
 		if self._downloader is not None:
 			params = self._downloader.params
 			format_param = params.get('format', None)
+			if format_param is None:
+				format_param = '34'
 
 		# Extension
 		video_extension = {
@@ -567,7 +606,7 @@ class YoutubeIE(InfoExtractor):
 			if mobj is None:
 				self.to_stderr(u'ERROR: unable to extract "t" parameter')
 				return [None]
-			video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s' % (video_id, mobj.group(1))
+			video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s&el=detailpage&ps=' % (video_id, mobj.group(1))
 			if format_param is not None:
 				video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
 			self.report_video_url(video_id, video_real_url)
@@ -585,7 +624,7 @@ class YoutubeIE(InfoExtractor):
 			self.to_stderr(u'ERROR: unable to extract video title')
 			return [None]
 		video_title = mobj.group(1).decode('utf-8')
-		video_title = re.sub(ur'(?u)&(.+?);', lambda x: unichr(htmlentitydefs.name2codepoint[x.group(1)]), video_title)
+		video_title = re.sub(ur'(?u)&(.+?);', self.htmlentity_transform, video_title)
 		video_title = video_title.replace(os.sep, u'%')
 
 		# simplified title
@@ -728,8 +767,9 @@ class YoutubeSearchIE(InfoExtractor):
 	_VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
 	_MORE_PAGES_INDICATOR = r'>Next'
 	_youtube_ie = None
+	_max_youtube_results = 1000
 
-	def __init__(self, youtube_ie, downloader=None): 
+	def __init__(self, youtube_ie, downloader=None):
 		InfoExtractor.__init__(self, downloader)
 		self._youtube_ie = youtube_ie
 
@@ -752,16 +792,19 @@ class YoutubeSearchIE(InfoExtractor):
 		prefix, query = query.split(':')
 		prefix = prefix[8:]
-		if prefix == '': 
+		if prefix == '':
 			return self._download_n_results(query, 1)
-		elif prefix == 'all': 
-			return self._download_n_results(query, -1)
-		else: 
+		elif prefix == 'all':
+			return self._download_n_results(query, self._max_youtube_results)
+		else:
 			try:
 				n = int(prefix)
 				if n <= 0:
 					self.to_stderr(u'ERROR: invalid download number %s for query "%s"' % (n, query))
 					return [None]
+				elif n > self._max_youtube_results:
+					self.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
+					n = self._max_youtube_results
 				return self._download_n_results(query, n)
 			except ValueError: # parsing prefix as int fails
 				return self._download_n_results(query, 1)