# -*- coding: utf-8 -*-
# Author: Ricardo Garcia Gonzalez
# License: Public domain code
# HTTP headers passed to every urllib2.Request built in this file; the
# User-Agent identifies the client as Firefox 3.0.1 on Windows.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.1) Gecko/2008070208 Firefox/3.0.1',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
    'Accept-Language': 'en-us,en;q=0.5',
}

# Characters kept as-is in a simplified title (ASCII letters and digits);
# the u'' format string yields a unicode object on Python 2.
simple_title_chars = u'%s%s' % (string.ascii_letters, string.digits)
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
class FileDownloader(object):
    """File Downloader class.

    File downloader objects are the ones responsible for downloading the
    actual video file and writing it to disk if the user has requested
    it, among some other tasks. In most cases there should be one per
    program. As, given a video URL, the downloader doesn't know how to
    extract all the needed information, a task that InfoExtractors do, it
    has to pass the URL to one of them.

    For this, file downloader objects have a method that allows
    InfoExtractors to be registered in a given order. When it is passed
    a URL, the file downloader hands it to the first InfoExtractor it
    finds that reports being able to handle it. The InfoExtractor returns
    all the information to the FileDownloader and the latter downloads the
    file or does whatever it's instructed to do.

    File downloaders accept a lot of parameters. In order not to saturate
    the object constructor with arguments, it receives a dictionary of
    options instead. These options are available through the get_params()
    method for the InfoExtractors to use. The FileDownloader also registers
    itself as the downloader in charge for the InfoExtractors that are
    added to it, so this is a "mutual registration".

    Available options:

    username:     Username for authentication purposes.
    password:     Password for authentication purposes.
    usenetrc:     Use netrc for authentication instead.
    quiet:        Do not print messages to stdout.
    forceurl:     Force printing final URL.
    forcetitle:   Force printing title.
    simulate:     Do not download the video files.
    format:       Video format code.
    outtmpl:      Template for output names.
    ignoreerrors: Do not stop on download errors.
    ratelimit:    Download speed limit, in bytes/sec.
    """

    # Both are (re)assigned per instance by __init__; the class-level values
    # are placeholders only, so no list is shared between instances.
    _params = None
    _ies = None

    def __init__(self, params):
        """Create a FileDownloader object with the given options."""
        self._ies = []
        self.set_params(params)

    @staticmethod
    def pmkdir(filename):
        """Create directory components in filename. Similar to Unix "mkdir -p".

        Every path component except the last one is treated as a directory
        and created if it does not exist yet; the last component is left
        alone.
        """
        components = filename.split(os.sep)
        for depth in range(1, len(components)):
            # Finish names with the separator, so the empty first component
            # of an absolute path becomes the root directory itself.
            partial = '%s%s' % (os.sep.join(components[:depth]), os.sep)
            if not os.path.exists(partial):
                os.mkdir(partial)

    @staticmethod
    def format_bytes(bytes):
        """Format a byte quantity as a short human-readable string.

        Returns 'N/A' when bytes is None. Otherwise the value (a number or a
        numeric string) is scaled by a power of 1024 and rendered with two
        decimals followed by one letter of 'bkMGTPEZY'.
        """
        if bytes is None:
            return 'N/A'
        suffixes = 'bkMGTPEZY'
        quantity = float(bytes)
        if quantity <= 0.0:
            exponent = 0
        else:
            exponent = int(math.log(quantity, 1024.0))
            # Clamp to the suffix table: a quantity far below one byte has a
            # negative exponent (which would index from the end of the
            # table) and a huge one would run past its end.
            exponent = min(max(exponent, 0), len(suffixes) - 1)
        converted = quantity / float(1024 ** exponent)
        return '%.2f%s' % (converted, suffixes[exponent])

    @staticmethod
    def calc_percent(byte_counter, data_len):
        """Return the completed percentage as a string at least 6 chars wide.

        Returns '---.-%' when the total length is unknown (None) or zero.
        """
        if data_len is None:
            return '---.-%'
        total = float(data_len)
        if total == 0.0:
            # Nothing to divide by for an empty body.
            return '---.-%'
        return '%6s' % ('%3.1f%%' % (float(byte_counter) / total * 100.0))

    @staticmethod
    def calc_eta(start, now, total, current):
        """Return the estimated remaining time as 'MM:SS'.

        Returns '--:--' when the total is unknown, when no data or less than
        a millisecond has gone by, or when the estimate exceeds 99 minutes.
        """
        if total is None:
            return '--:--'
        dif = now - start
        if current == 0 or dif < 0.001:  # One millisecond
            return '--:--'
        rate = float(current) / dif
        eta = int((float(total) - float(current)) / rate)
        (eta_mins, eta_secs) = divmod(eta, 60)
        if eta_mins > 99:
            return '--:--'
        return '%02d:%02d' % (eta_mins, eta_secs)

    @staticmethod
    def calc_speed(start, now, bytes):
        """Return the average speed as a right-aligned, 10-char wide string."""
        dif = now - start
        if bytes == 0 or dif < 0.001:  # One millisecond
            return '%10s' % '---b/s'
        return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))

    @staticmethod
    def best_block_size(elapsed_time, bytes):
        """Return the size to request for the next read.

        The result follows the rate observed for the last block but stays
        between half and double the last block length, and never above 4 MB.
        """
        new_min = max(bytes / 2.0, 1.0)
        new_max = min(max(bytes * 2.0, 1.0), 4194304)  # Do not surpass 4 MB
        if elapsed_time < 0.001:
            return int(new_max)
        rate = bytes / elapsed_time
        if rate > new_max:
            return int(new_max)
        if rate < new_min:
            return int(new_min)
        return int(rate)

    @staticmethod
    def parse_bytes(bytestr):
        """Parse a string indicating a byte quantity into an integer.

        Accepts a decimal number optionally followed by one of the letters
        kMGTPEZY (case-insensitive). Returns None if the string is invalid.
        """
        matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
        if matchobj is None:
            return None
        number = float(matchobj.group(1))
        # An empty suffix is found at index 0, i.e. a multiplier of 1.
        multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
        return int(round(number * multiplier))

    def set_params(self, params):
        """Sets parameters. Raises ValueError if params is not a dictionary."""
        if not isinstance(params, dict):
            raise ValueError('params: dictionary expected')
        self._params = params

    def get_params(self):
        """Get parameters."""
        return self._params

    def add_info_extractor(self, ie):
        """Add an InfoExtractor object to the end of the list."""
        self._ies.append(ie)
        ie.set_downloader(self)

    def to_stdout(self, message, skip_eol=False):
        """Print message to stdout if not in quiet mode."""
        if not self._params.get('quiet', False):
            sys.stdout.write('%s%s' % (message, '' if skip_eol else '\n'))
            sys.stdout.flush()

    def to_stderr(self, message):
        """Print message to stderr."""
        sys.stderr.write('%s\n' % message)

    def fixed_template(self):
        """Checks if the output template is fixed (has no %(name)s fields)."""
        return (re.search(u'(?u)%\\(.+?\\)s', self._params['outtmpl']) is None)

    def trouble(self, message=None):
        """Determine action to take when a download problem appears.

        Depending on if the downloader has been configured to ignore
        download errors or not, this method may throw an exception or
        not when errors are found, after printing the message. If it
        doesn't raise, it returns an error code suitable to be returned
        later as a program exit code to indicate error.
        """
        if message is not None:
            self.to_stderr(message)
        if not self._params.get('ignoreerrors', False):
            raise DownloadError(message)
        return 1

    def slow_down(self, start_time, byte_counter):
        """Sleep if the download speed is over the rate limit."""
        rate_limit = self._params.get('ratelimit', None)
        if rate_limit is None or byte_counter == 0:
            return
        now = time.time()
        elapsed = now - start_time
        if elapsed <= 0.0:
            return
        speed = float(byte_counter) / elapsed
        if speed > rate_limit:
            # Sleep for the time by which the transfer is ahead of schedule.
            time.sleep((byte_counter - rate_limit * elapsed) / rate_limit)

    def report_destination(self, filename):
        """Report destination filename."""
        self.to_stdout('[download] Destination: %s' % filename)

    def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
        """Report download progress."""
        self.to_stdout('\r[download] %s of %s at %s ETA %s' %
                       (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)

    def report_finish(self):
        """Report download finished."""
        # Progress lines are written without a newline; this ends the line.
        self.to_stdout('')

    def download(self, url_list):
        """Download a given list of URLs.

        Returns 0 on success or the error code produced by trouble(). Raises
        SameFileError when several videos would be written to one fixed
        output name, and DownloadError (through trouble()) unless errors are
        being ignored.
        """
        retcode = 0
        if len(url_list) > 1 and self.fixed_template():
            raise SameFileError(self._params['outtmpl'])

        for url in url_list:
            suitable_found = False
            for ie in self._ies:
                if not ie.suitable(url):
                    continue
                # Suitable InfoExtractor found
                suitable_found = True
                all_results = ie.extract(url)
                results = [x for x in all_results if x is not None]
                if len(results) != len(all_results):
                    retcode = self.trouble()

                if len(results) > 1 and self.fixed_template():
                    raise SameFileError(self._params['outtmpl'])

                for result in results:
                    # Forced printings
                    if self._params.get('forcetitle', False):
                        print(result['title'])
                    if self._params.get('forceurl', False):
                        print(result['url'])

                    # Do nothing else if in simulate mode
                    if self._params.get('simulate', False):
                        continue

                    try:
                        filename = self._params['outtmpl'] % result
                        self.report_destination(filename)
                    except (ValueError, KeyError) as err:
                        retcode = self.trouble('ERROR: invalid output template: %s' % str(err))
                        continue
                    try:
                        self.pmkdir(filename)
                    except (OSError, IOError) as err:
                        retcode = self.trouble('ERROR: unable to create directories: %s' % str(err))
                        continue
                    try:
                        outstream = open(filename, 'wb')
                    except (OSError, IOError) as err:
                        retcode = self.trouble('ERROR: unable to open for writing: %s' % str(err))
                        continue
                    try:
                        try:
                            self._do_download(outstream, result['url'])
                        finally:
                            # Release the file even when the transfer fails.
                            outstream.close()
                    # Network errors go first: urllib2.URLError derives from
                    # IOError and would otherwise be reported as a write error.
                    except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
                        retcode = self.trouble('ERROR: unable to download video data: %s' % str(err))
                        continue
                    except (OSError, IOError) as err:
                        retcode = self.trouble('ERROR: unable to write video data: %s' % str(err))
                        continue
                # Only the first suitable InfoExtractor handles a URL.
                break
            if not suitable_found:
                retcode = self.trouble('ERROR: no suitable InfoExtractor: %s' % url)

        return retcode

    def _do_download(self, stream, url):
        """Copy the body found at url into stream, reporting progress.

        Raises ValueError when the server announced a Content-length that
        does not match the number of bytes received.
        """
        request = urllib2.Request(url, None, std_headers)
        data = urllib2.urlopen(request)
        try:
            # Header value: a string, or None when the server sent no length.
            data_len = data.info().get('Content-length', None)
            data_len_str = self.format_bytes(data_len)
            byte_counter = 0
            block_size = 1024
            start = time.time()
            while True:
                # Progress message
                percent_str = self.calc_percent(byte_counter, data_len)
                eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
                speed_str = self.calc_speed(start, time.time(), byte_counter)
                self.report_progress(percent_str, data_len_str, speed_str, eta_str)

                # Download and write
                before = time.time()
                data_block = data.read(block_size)
                after = time.time()
                data_block_len = len(data_block)
                if data_block_len == 0:
                    break
                byte_counter += data_block_len
                stream.write(data_block)
                block_size = self.best_block_size(after - before, data_block_len)

                # Apply rate limit
                self.slow_down(start, byte_counter)
        finally:
            data.close()

        self.report_finish()
        if data_len is not None and str(byte_counter) != data_len:
            raise ValueError('Content too short: %s/%s bytes' % (byte_counter, data_len))
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information from the video (or videos) the URL refers to. This
    information includes the real video URL, the video title and simplified
    title, author and others. It is returned in a list of dictionaries when
    calling its extract() method. It is a list because a URL can refer to
    more than one video (think of playlists). The dictionaries must include
    the following fields:

    id:       Video identifier.
    url:      Final video URL.
    uploader: Nickname of the video uploader.
    title:    Literal title.
    stitle:   Simplified title.
    ext:      Video filename extension.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods, as well as the suitable() static method.
    Probably, they should also be instantiated and added to the main
    downloader.
    """

    # Set per instance by __init__; _ready records whether
    # _real_initialize() has already run.
    _ready = False
    _downloader = None

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @staticmethod
    def suitable(url):
        """Receives a URL and returns True if suitable for this IE."""
        return True

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # Run the real initialization only once per instance.
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def to_stdout(self, message):
        """Print message to stdout if downloader is not in quiet mode."""
        if self._downloader is None or not self._downloader.get_params().get('quiet', False):
            sys.stdout.write('%s\n' % message)

    def to_stderr(self, message):
        """Print message to stderr."""
        sys.stderr.write('%s\n' % message)

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    _LOGIN_URL = 'http://www.youtube.com/login?next=/'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/'
    _NETRC_MACHINE = 'youtube'

    def report_login(self):
        """Report attempt to log in."""
        self.to_stdout('[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_stdout('[youtube] Confirming age')

    def report_webpage_download(self, video_id):
        """Report attempt to download webpage."""
        self.to_stdout('[youtube] %s: Downloading video webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self.to_stdout('[youtube] %s: Extracting video information' % video_id)

    def report_video_url(self, video_id, video_real_url):
        """Report extracted video URL."""
        self.to_stdout('[youtube] %s: URL: %s' % (video_id, video_real_url))

    def _real_initialize(self):
        """Log in and confirm age when credentials are available.

        Credentials come from the downloader parameters or, if requested,
        from .netrc. Every failure is reported as a warning or error on
        stderr and ends the initialization early; nothing is raised for it.
        """
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.get_params()

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    # authenticators() returns (login, account, password)
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self.to_stderr('WARNING: parsing .netrc: %s' % str(err))
                return

        # No authentication to be performed
        if username is None:
            return

        # Log in
        # NOTE(review): the 'next' field mirrors the ?next=/ query of
        # _LOGIN_URL -- confirm against the site's login form.
        login_form = {
            'current_form': 'loginForm',
            'next': '/',
            'action_login': 'Log In',
            'username': username,
            'password': password,
        }
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
        try:
            self.report_login()
            login_results = urllib2.urlopen(request).read()
            # Being served the login form again means the login was rejected.
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self.to_stderr('WARNING: unable to log in: bad username or password')
                return
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self.to_stderr('WARNING: unable to log in: %s' % str(err))
            return

        # Confirm age
        # NOTE(review): the 'next_url' field mirrors the ?next_url=/ query of
        # _AGE_URL -- confirm against the site's age form.
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
        try:
            self.report_age_confirmation()
            urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self.to_stderr('ERROR: unable to confirm age: %s' % str(err))
            return

    @staticmethod
    def _decode_entity(match):
        """Return the character for one matched '&...;' entity.

        Handles named, decimal (&#39;) and hexadecimal (&#x27;) entities;
        text that is not a valid entity is returned unchanged.
        """
        name = match.group(1)
        try:
            if name[:2] in (u'#x', u'#X'):
                return unichr(int(name[2:], 16))
            if name[:1] == u'#':
                return unichr(int(name[1:]))
            return unichr(htmlentitydefs.name2codepoint[name])
        except (KeyError, ValueError, OverflowError):
            # Not an entity after all (e.g. "Tom & Jerry; live").
            return match.group(0)

    def _real_extract(self, url):
        """Return a one-item list with the video information for url.

        The item is a dictionary with the fields id, url, uploader, title,
        stitle and ext, or None (after an error message on stderr) when any
        extraction step fails.
        """
        # Extract video id from URL
        mobj = re.match(r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$', url)
        if mobj is None:
            self.to_stderr('ERROR: invalid URL: %s' % url)
            return [None]
        video_id = mobj.group(2)

        # Downloader parameters
        format_param = None
        if self._downloader is not None:
            params = self._downloader.get_params()
            format_param = params.get('format', None)

        # Extension
        video_extension = {'18': 'mp4', '17': '3gp'}.get(format_param, 'flv')

        # Normalize URL, including format
        normalized_url = 'http://www.youtube.com/watch?v=%s' % video_id
        if format_param is not None:
            normalized_url = '%s&fmt=%s' % (normalized_url, format_param)
        request = urllib2.Request(normalized_url, None, std_headers)
        try:
            self.report_webpage_download(video_id)
            video_webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self.to_stderr('ERROR: unable to download video webpage: %s' % str(err))
            return [None]
        self.report_information_extraction(video_id)

        # "t" param
        mobj = re.search(r', "t": "([^"]+)"', video_webpage)
        if mobj is None:
            self.to_stderr('ERROR: unable to extract "t" parameter')
            return [None]
        video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s' % (video_id, mobj.group(1))
        if format_param is not None:
            video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
        self.report_video_url(video_id, video_real_url)

        # uploader
        mobj = re.search(r'More From: ([^<]*)<', video_webpage)
        if mobj is None:
            self.to_stderr('ERROR: unable to extract uploader nickname')
            return [None]
        video_uploader = mobj.group(1)

        # title
        mobj = re.search(r'(?im)<title>YouTube - ([^<]*)</title>', video_webpage)
        if mobj is None:
            self.to_stderr('ERROR: unable to extract video title')
            return [None]
        video_title = mobj.group(1).decode('utf-8')
        video_title = re.sub(u'(?u)&(.+?);', self._decode_entity, video_title)
        # The title may end up in a file name: keep path separators out of it.
        video_title = video_title.replace(os.sep, u'%')

        # simplified title
        simple_title = re.sub(u'(?u)([^%s]+)' % simple_title_chars, u'_', video_title)
        simple_title = simple_title.strip(u'_')

        # Return information
        return [{
            'id': video_id,
            'url': video_real_url,
            'uploader': video_uploader,
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension,
        }]
# Command-line entry point: parse the options, validate them, build one
# FileDownloader with a YoutubeIE registered and download every URL given.
if __name__ == '__main__':
    try:
        # Modules needed only when running the main program
        import getpass
        import optparse

        # General configuration
        # One opener carries both handlers; installing two openers in a row
        # would leave only the last one in effect.
        urllib2.install_opener(urllib2.build_opener(
            urllib2.ProxyHandler(), urllib2.HTTPCookieProcessor()))
        socket.setdefaulttimeout(300)  # 5 minutes should be enough (famous last words)

        # Parse command line
        parser = optparse.OptionParser(
            usage='Usage: %prog [options] url...',
            version='2008.07.22',
            conflict_handler='resolve',
        )
        parser.add_option('-h', '--help',
                          action='help', help='print this help text and exit')
        parser.add_option('-v', '--version',
                          action='version', help='print program version and exit')
        parser.add_option('-u', '--username',
                          dest='username', metavar='UN', help='account username')
        parser.add_option('-p', '--password',
                          dest='password', metavar='PW', help='account password')
        parser.add_option('-o', '--output',
                          dest='outtmpl', metavar='TPL', help='output filename template')
        parser.add_option('-q', '--quiet',
                          action='store_true', dest='quiet', help='activates quiet mode', default=False)
        parser.add_option('-s', '--simulate',
                          action='store_true', dest='simulate', help='do not download video', default=False)
        parser.add_option('-t', '--title',
                          action='store_true', dest='usetitle', help='use title in file name', default=False)
        parser.add_option('-l', '--literal',
                          action='store_true', dest='useliteral', help='use literal title in file name', default=False)
        parser.add_option('-n', '--netrc',
                          action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
        parser.add_option('-g', '--get-url',
                          action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
        parser.add_option('-e', '--get-title',
                          action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
        parser.add_option('-f', '--format',
                          dest='format', metavar='FMT', help='video format code')
        parser.add_option('-b', '--best-quality',
                          action='store_const', dest='format', help='alias for -f 18', const='18')
        parser.add_option('-m', '--mobile-version',
                          action='store_const', dest='format', help='alias for -f 17', const='17')
        parser.add_option('-i', '--ignore-errors',
                          action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
        parser.add_option('-r', '--rate-limit',
                          dest='ratelimit', metavar='L', help='download rate limit (e.g. 50k or 44.6m)')
        (opts, args) = parser.parse_args()

        # Conflicting, missing and erroneous options
        if len(args) < 1:
            sys.exit('ERROR: you must provide at least one URL')
        if opts.usenetrc and (opts.username is not None or opts.password is not None):
            sys.exit('ERROR: using .netrc conflicts with giving username/password')
        if opts.password is not None and opts.username is None:
            sys.exit('ERROR: account username missing')
        if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
            sys.exit('ERROR: using output template conflicts with using title or literal title')
        if opts.usetitle and opts.useliteral:
            sys.exit('ERROR: using title conflicts with using literal title')
        if opts.username is not None and opts.password is None:
            opts.password = getpass.getpass('Type account password and press return:')
        if opts.ratelimit is not None:
            numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
            if numeric_limit is None:
                sys.exit('ERROR: invalid rate limit specified')
            opts.ratelimit = numeric_limit

        # Output template: an explicit (non-empty) template wins, then the
        # simplified title, then the literal title, then the bare video id.
        if opts.outtmpl:
            outtmpl = opts.outtmpl
        elif opts.usetitle:
            outtmpl = '%(stitle)s-%(id)s.%(ext)s'
        elif opts.useliteral:
            outtmpl = '%(title)s-%(id)s.%(ext)s'
        else:
            outtmpl = '%(id)s.%(ext)s'

        # Information extractors
        youtube_ie = YoutubeIE()

        # File downloader
        fd = FileDownloader({
            'usenetrc': opts.usenetrc,
            'username': opts.username,
            'password': opts.password,
            # The "get" options imply both quiet and simulate mode.
            'quiet': (opts.quiet or opts.geturl or opts.gettitle),
            'forceurl': opts.geturl,
            'forcetitle': opts.gettitle,
            'simulate': (opts.simulate or opts.geturl or opts.gettitle),
            'format': opts.format,
            'outtmpl': outtmpl,
            'ignoreerrors': opts.ignoreerrors,
            'ratelimit': opts.ratelimit,
        })
        fd.add_info_extractor(youtube_ie)
        retcode = fd.download(args)
        sys.exit(retcode)

    except DownloadError:
        # The message was already printed by FileDownloader.trouble().
        sys.exit(1)
    except SameFileError:
        sys.exit('ERROR: fixed output name but more than one file to download')
    except KeyboardInterrupt:
        sys.exit('\nERROR: Interrupted by user')