X-Git-Url: http://git.cielonegro.org/gitweb.cgi?a=blobdiff_plain;f=youtube-dl;h=91ee099ceab577fb1514eaa52e633fbca9fa2306;hb=1c76e23e0fca5e16a1fc4f0baca58503ba7ec639;hp=f230d28fc8312f94314c3609fab3871c37634366;hpb=76800042fd781e2df03db6502aac709e5f72e65b;p=youtube-dl.git diff --git a/youtube-dl b/youtube-dl index f230d28fc..91ee099ce 100755 --- a/youtube-dl +++ b/youtube-dl @@ -18,8 +18,8 @@ import time import urllib import urllib2 -std_headers = { - 'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.5) Gecko/2008120122 Firefox/3.0.5', +std_headers = { + 'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.0.8) Gecko/2009032609 Firefox/3.0.8', 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7', 'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5', 'Accept-Language': 'en-us,en;q=0.5', @@ -52,6 +52,13 @@ class PostProcessingError(Exception): """ pass +class UnavailableFormatError(Exception): + """Unavailable Format exception. + + This exception will be thrown when a video is requested + in a format that is not available for that video. + """ + class FileDownloader(object): """File Downloader class. @@ -65,16 +72,17 @@ class FileDownloader(object): For this, file downloader objects have a method that allows InfoExtractors to be registered in a given order. When it is passed a URL, the file downloader handles it to the first InfoExtractor it - finds that reports being able to handle it. The InfoExtractor returns - all the information to the FileDownloader and the latter downloads the - file or does whatever it's instructed to do. + finds that reports being able to handle it. The InfoExtractor extracts + all the information about the video or videos the URL refers to, and + asks the FileDownloader to process the video information, possibly + downloading the video. File downloaders accept a lot of parameters. In order not to saturate the object constructor with arguments, it receives a dictionary of - options instead. These options are available through the get_params() - method for the InfoExtractors to use. The FileDownloader also registers - itself as the downloader in charge for the InfoExtractors that are - added to it, so this is a "mutual registration". + options instead. These options are available through the params + attribute for the InfoExtractors to use. The FileDownloader also + registers itself as the downloader in charge for the InfoExtractors + that are added to it, so this is a "mutual registration". Available options: @@ -92,15 +100,17 @@ class FileDownloader(object): nooverwrites: Prevent overwriting files. """ - _params = None + params = None _ies = [] _pps = [] + _download_retcode = None def __init__(self, params): """Create a FileDownloader object with the given options.""" self._ies = [] self._pps = [] - self.set_params(params) + self._download_retcode = 0 + self.params = params @staticmethod def pmkdir(filename): @@ -174,16 +184,6 @@ class FileDownloader(object): multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower()) return long(round(number * multiplier)) - def set_params(self, params): - """Sets parameters.""" - if type(params) != dict: - raise ValueError('params: dictionary expected') - self._params = params - - def get_params(self): - """Get parameters.""" - return self._params - def add_info_extractor(self, ie): """Add an InfoExtractor object to the end of the list.""" self._ies.append(ie) @@ -196,8 +196,8 @@ class FileDownloader(object): def to_stdout(self, message, skip_eol=False): """Print message to stdout if not in quiet mode.""" - if not self._params.get('quiet', False): - print u'%s%s' % (message, [u'\n', u''][skip_eol]), + if not self.params.get('quiet', False): + print (u'%s%s' % (message, [u'\n', u''][skip_eol])).encode(locale.getpreferredencoding()), sys.stdout.flush() def to_stderr(self, message): @@ -206,26 +206,24 @@ class FileDownloader(object): def fixed_template(self): """Checks if the output template is fixed.""" - return (re.search(ur'(?u)%\(.+?\)s', self._params['outtmpl']) is None) + return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None) def trouble(self, message=None): """Determine action to take when a download problem appears. Depending on if the downloader has been configured to ignore download errors or not, this method may throw an exception or - not when errors are found, after printing the message. If it - doesn't raise, it returns an error code suitable to be returned - later as a program exit code to indicate error. + not when errors are found, after printing the message. """ if message is not None: self.to_stderr(message) - if not self._params.get('ignoreerrors', False): + if not self.params.get('ignoreerrors', False): raise DownloadError(message) - return 1 + self._download_retcode = 1 def slow_down(self, start_time, byte_counter): """Sleep if the download speed is over the rate limit.""" - rate_limit = self._params.get('ratelimit', None) + rate_limit = self.params.get('ratelimit', None) if rate_limit is None or byte_counter == 0: return now = time.time() @@ -249,77 +247,80 @@ class FileDownloader(object): """Report download finished.""" self.to_stdout(u'') + def process_info(self, info_dict): + """Process a single dictionary returned by an InfoExtractor.""" + # Forced printings + if self.params.get('forcetitle', False): + print info_dict['title'].encode(locale.getpreferredencoding()) + if self.params.get('forceurl', False): + print info_dict['url'].encode(locale.getpreferredencoding()) + + # Do nothing else if in simulate mode + if self.params.get('simulate', False): + return + + try: + filename = self.params['outtmpl'] % info_dict + self.report_destination(filename) + except (ValueError, KeyError), err: + self.trouble('ERROR: invalid output template or system charset: %s' % str(err)) + if self.params['nooverwrites'] and os.path.exists(filename): + self.to_stderr('WARNING: file exists: %s; skipping' % filename) + return + + try: + self.pmkdir(filename) + except (OSError, IOError), err: + self.trouble('ERROR: unable to create directories: %s' % str(err)) + return + + try: + outstream = open(filename, 'wb') + except (OSError, IOError), err: + self.trouble('ERROR: unable to open for writing: %s' % str(err)) + return + + try: + self._do_download(outstream, info_dict['url']) + outstream.close() + except (OSError, IOError), err: + os.remove(filename) + raise UnavailableFormatError + except (urllib2.URLError, httplib.HTTPException, socket.error), err: + self.trouble('ERROR: unable to download video data: %s' % str(err)) + return + + try: + self.post_process(filename, info_dict) + except (PostProcessingError), err: + self.trouble('ERROR: postprocessing: %s' % str(err)) + return + def download(self, url_list): """Download a given list of URLs.""" - retcode = 0 if len(url_list) > 1 and self.fixed_template(): - raise SameFileError(self._params['outtmpl']) + raise SameFileError(self.params['outtmpl']) for url in url_list: suitable_found = False for ie in self._ies: + # Go to next InfoExtractor if not suitable if not ie.suitable(url): continue + # Suitable InfoExtractor found suitable_found = True - all_results = ie.extract(url) - results = [x for x in all_results if x is not None] - if len(results) != len(all_results): - retcode = self.trouble() - - if len(results) > 1 and self.fixed_template(): - raise SameFileError(self._params['outtmpl']) - - for result in results: - # Forced printings - if self._params.get('forcetitle', False): - print result['title'] - if self._params.get('forceurl', False): - print result['url'] - - # Do nothing else if in simulate mode - if self._params.get('simulate', False): - continue - try: - filename = self._params['outtmpl'] % result - self.report_destination(filename) - except (ValueError, KeyError), err: - retcode = self.trouble('ERROR: invalid output template or system charset: %s' % str(err)) - continue - if self._params['nooverwrites'] and os.path.exists(filename): - self.to_stderr('WARNING: file exists: %s; skipping' % filename) - continue - try: - self.pmkdir(filename) - except (OSError, IOError), err: - retcode = self.trouble('ERROR: unable to create directories: %s' % str(err)) - continue - try: - outstream = open(filename, 'wb') - except (OSError, IOError), err: - retcode = self.trouble('ERROR: unable to open for writing: %s' % str(err)) - continue - try: - self._do_download(outstream, result['url']) - outstream.close() - except (OSError, IOError), err: - retcode = self.trouble('ERROR: unable to write video data: %s' % str(err)) - continue - except (urllib2.URLError, httplib.HTTPException, socket.error), err: - retcode = self.trouble('ERROR: unable to download video data: %s' % str(err)) - continue - try: - self.post_process(filename, result) - except (PostProcessingError), err: - retcode = self.trouble('ERROR: postprocessing: %s' % str(err)) - continue + # Extract information from URL and process it + ie.extract(url) + # Suitable InfoExtractor had been found; go to next URL break + if not suitable_found: - retcode = self.trouble('ERROR: no suitable InfoExtractor: %s' % url) + self.trouble('ERROR: no suitable InfoExtractor: %s' % url) - return retcode + return self._download_retcode def post_process(self, filename, ie_info): """Run the postprocessing chain on the given file.""" @@ -369,9 +370,10 @@ class InfoExtractor(object): Information extractors are the classes that, given a URL, extract information from the video (or videos) the URL refers to. This information includes the real video URL, the video title and simplified - title, author and others. It is returned in a list of dictionaries when - calling its extract() method. It is a list because a URL can refer to - more than one video (think of playlists). The dictionaries must include + title, author and others. The information is stored in a dictionary + which is then passed to the FileDownloader. The FileDownloader + processes this information possibly downloading the video to the file + system, among other possible outcomes. The dictionaries must include the following fields: id: Video identifier. @@ -415,15 +417,6 @@ class InfoExtractor(object): """Sets the downloader for this IE.""" self._downloader = downloader - def to_stdout(self, message): - """Print message to stdout if downloader is not in quiet mode.""" - if self._downloader is None or not self._downloader.get_params().get('quiet', False): - print message - - def to_stderr(self, message): - """Print message to stderr.""" - print >>sys.stderr, message - def _real_initialize(self): """Real initialization process. Redefine in subclasses.""" pass @@ -440,42 +433,76 @@ class YoutubeIE(InfoExtractor): _LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en' _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en' _NETRC_MACHINE = 'youtube' + _available_formats = ['22', '18', '17', '13'] # listed in order of priority for -b flag + _video_extensions = { + '13': '3gp', + '17': 'mp4', + '18': 'mp4', + '22': 'mp4', + } @staticmethod def suitable(url): return (re.match(YoutubeIE._VALID_URL, url) is not None) + @staticmethod + def htmlentity_transform(matchobj): + """Transforms an HTML entity to a Unicode character.""" + entity = matchobj.group(1) + + # Known non-numeric HTML entity + if entity in htmlentitydefs.name2codepoint: + return unichr(htmlentitydefs.name2codepoint[entity]) + + # Unicode character + mobj = re.match(ur'(?u)#(x?\d+)', entity) + if mobj is not None: + numstr = mobj.group(1) + if numstr.startswith(u'x'): + base = 16 + numstr = u'0%s' % numstr + else: + base = 10 + return unichr(long(numstr, base)) + + # Unknown entity in name, return its literal representation + return (u'&%s;' % entity) + def report_lang(self): """Report attempt to set language.""" - self.to_stdout(u'[youtube] Setting language') + self._downloader.to_stdout(u'[youtube] Setting language') def report_login(self): """Report attempt to log in.""" - self.to_stdout(u'[youtube] Logging in') + self._downloader.to_stdout(u'[youtube] Logging in') def report_age_confirmation(self): """Report attempt to confirm age.""" - self.to_stdout(u'[youtube] Confirming age') + self._downloader.to_stdout(u'[youtube] Confirming age') def report_webpage_download(self, video_id): """Report attempt to download webpage.""" - self.to_stdout(u'[youtube] %s: Downloading video webpage' % video_id) + self._downloader.to_stdout(u'[youtube] %s: Downloading video webpage' % video_id) def report_information_extraction(self, video_id): """Report attempt to extract video information.""" - self.to_stdout(u'[youtube] %s: Extracting video information' % video_id) + self._downloader.to_stdout(u'[youtube] %s: Extracting video information' % video_id) def report_video_url(self, video_id, video_real_url): """Report extracted video URL.""" - self.to_stdout(u'[youtube] %s: URL: %s' % (video_id, video_real_url)) - + self._downloader.to_stdout(u'[youtube] %s: URL: %s' % (video_id, video_real_url)) + + def report_unavailable_format(self, video_id, format): + """Report extracted video URL.""" + self._downloader.to_stdout(u'[youtube] %s: Format %s not available' % (video_id, format)) + def _real_initialize(self): if self._downloader is None: return username = None password = None - downloader_params = self._downloader.get_params() + downloader_params = self._downloader.params # Attempt to use provided username and password or .netrc data if downloader_params.get('username', None) is not None: @@ -490,7 +517,7 @@ class YoutubeIE(InfoExtractor): else: raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE) except (IOError, netrc.NetrcParseError), err: - self.to_stderr(u'WARNING: parsing .netrc: %s' % str(err)) + self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err)) return # Set language @@ -499,7 +526,7 @@ class YoutubeIE(InfoExtractor): self.report_lang() urllib2.urlopen(request).read() except (urllib2.URLError, httplib.HTTPException, socket.error), err: - self.to_stderr(u'WARNING: unable to set language: %s' % str(err)) + self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err)) return # No authentication to be performed @@ -519,10 +546,10 @@ class YoutubeIE(InfoExtractor): self.report_login() login_results = urllib2.urlopen(request).read() if re.search(r'(?i)