2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # License: Public domain code
20 'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.1) Gecko/2008070208 Firefox/3.0.1',
21 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
22 'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
23 'Accept-Language': 'en-us,en;q=0.5',
26 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
28 class DownloadError(Exception):
29 """Download Error exception.
31 This exception may be thrown by FileDownloader objects if they are not
32 configured to continue on errors. They will contain the appropriate
37 class SameFileError(Exception):
38 """Same File exception.
40 This exception will be thrown by FileDownloader objects if they detect
41 multiple files would have to be downloaded to the same file on disk.
45 class FileDownloader(object):
46 """File Downloader class.
48 File downloader objects are the ones responsible for downloading the
49 actual video file and writing it to disk if the user has requested
50 it, among some other tasks. In most cases there should be one per
51 program. As, given a video URL, the downloader doesn't know how to
52 extract all the needed information, task that InfoExtractors do, it
53 has to pass the URL to one of them.
55 For this, file downloader objects have a method that allows
56 InfoExtractors to be registered in a given order. When it is passed
57 a URL, the file downloader hands it to the first InfoExtractor it
58 finds that reports being able to handle it. The InfoExtractor returns
59 all the information to the FileDownloader and the latter downloads the
60 file or does whatever it's instructed to do.
62 File downloaders accept a lot of parameters. In order not to saturate
63 the object constructor with arguments, it receives a dictionary of
64 options instead. These options are available through the get_params()
65 method for the InfoExtractors to use. The FileDownloader also registers
66 itself as the downloader in charge for the InfoExtractors that are
67 added to it, so this is a "mutual registration".
71 username: Username for authentication purposes.
72 password: Password for authentication purposes.
73 usenetrc: Use netrc for authentication instead.
74 quiet: Do not print messages to stdout.
75 forceurl: Force printing final URL.
76 forcetitle: Force printing title.
77 simulate: Do not download the video files.
78 format: Video format code.
79 outtmpl: Template for output names.
80 ignoreerrors: Do not stop on download errors.
81 ratelimit: Download speed limit, in bytes/sec.
87 def __init__(self, params):
88 """Create a FileDownloader object with the given options."""
90 self.set_params(params)
94 """Create directory components in filename. Similar to Unix "mkdir -p"."""
95 components = filename.split(os.sep)
96 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
98 if not os.path.exists(dir):
102 def format_bytes(bytes):
108 exponent = long(math.log(float(bytes), 1024.0))
109 suffix = 'bkMGTPEZY'[exponent]
110 converted = float(bytes) / float(1024**exponent)
111 return '%.2f%s' % (converted, suffix)
114 def calc_percent(byte_counter, data_len):
117 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
120 def calc_eta(start, now, total, current):
124 if current == 0 or dif < 0.001: # One millisecond
126 rate = float(current) / dif
127 eta = long((float(total) - float(current)) / rate)
128 (eta_mins, eta_secs) = divmod(eta, 60)
131 return '%02d:%02d' % (eta_mins, eta_secs)
134 def calc_speed(start, now, bytes):
136 if bytes == 0 or dif < 0.001: # One millisecond
137 return '%10s' % '---b/s'
138 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
141 def best_block_size(elapsed_time, bytes):
142 new_min = max(bytes / 2.0, 1.0)
143 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
144 if elapsed_time < 0.001:
146 rate = bytes / elapsed_time
154 def parse_bytes(bytestr):
155 """Parse a string indicating a byte quantity into a long integer."""
156 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
159 number = float(matchobj.group(1))
160 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
161 return long(round(number * multiplier))
def set_params(self, params):
    """Install the option dictionary used by this downloader.

    params -- dictionary of option names to values. It is stored as-is
              (not copied) in self._params.

    Raises ValueError if params is not a dictionary.
    """
    # isinstance() instead of an exact type() comparison, so that dict
    # subclasses are accepted as well as plain dicts.
    if not isinstance(params, dict):
        raise ValueError('params: dictionary expected')
    self._params = params
169 def get_params(self):
170 """Get parameters."""
173 def add_info_extractor(self, ie):
174 """Add an InfoExtractor object to the end of the list."""
176 ie.set_downloader(self)
178 def to_stdout(self, message, skip_eol=False):
179 """Print message to stdout if not in quiet mode."""
180 if not self._params.get('quiet', False):
181 sys.stdout.write('%s%s' % (message, ['\n', ''][skip_eol]))
def to_stderr(self, message):
    """Write message, followed by a newline, to standard error."""
    line = '%s\n' % message
    sys.stderr.write(line)
def fixed_template(self):
    """Tell whether the output template contains no substitution fields.

    Returns True when self._params['outtmpl'] has no '%(name)s'
    placeholder, False otherwise.

    Raises KeyError if the 'outtmpl' parameter is not set.
    """
    template = self._params['outtmpl']
    # Plain raw string rather than a ur'' literal: the pattern is pure
    # ASCII, so it matches identically, and the ur prefix is a syntax
    # error on Python 3.
    return re.search(r'(?u)%\(.+?\)s', template) is None
192 def trouble(self, message=None):
193 """Determine action to take when a download problem appears.
195 Depending on if the downloader has been configured to ignore
196 download errors or not, this method may throw an exception or
197 not when errors are found, after printing the message. If it
198 doesn't raise, it returns an error code suitable to be returned
199 later as a program exit code to indicate error.
201 if message is not None:
202 self.to_stderr(message)
203 if not self._params.get('ignoreerrors', False):
204 raise DownloadError(message)
207 def slow_down(self, start_time, byte_counter):
208 """Sleep if the download speed is over the rate limit."""
209 rate_limit = self._params.get('ratelimit', None)
210 if rate_limit is None or byte_counter == 0:
213 elapsed = now - start_time
216 speed = float(byte_counter) / elapsed
217 if speed > rate_limit:
218 time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
def report_destination(self, filename):
    """Announce, through to_stdout(), the file the download is written to."""
    notice = '[download] Destination: %s' % filename
    self.to_stdout(notice)
def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
    """Emit one progress line built from the pre-formatted fields.

    The line starts with a carriage return and is sent through
    to_stdout() with skip_eol=True.
    """
    fields = (percent_str, data_len_str, speed_str, eta_str)
    status = '\r[download] %s of %s at %s ETA %s' % fields
    self.to_stdout(status, skip_eol=True)
229 def report_finish(self):
230 """Report download finished."""
233 def download(self, url_list):
234 """Download a given list of URLs."""
236 if len(url_list) > 1 and self.fixed_template():
237 raise SameFileError(self._params['outtmpl'])
240 suitable_found = False
242 if not ie.suitable(url):
244 # Suitable InfoExtractor found
245 suitable_found = True
246 all_results = ie.extract(url)
247 results = [x for x in all_results if x is not None]
248 if len(results) != len(all_results):
249 retcode = self.trouble()
251 if len(results) > 1 and self.fixed_template():
252 raise SameFileError(self._params['outtmpl'])
254 for result in results:
257 if self._params.get('forcetitle', False):
258 print result['title']
259 if self._params.get('forceurl', False):
262 # Do nothing else if in simulate mode
263 if self._params.get('simulate', False):
267 filename = self._params['outtmpl'] % result
268 self.report_destination(filename)
269 except (ValueError, KeyError), err:
270 retcode = self.trouble('ERROR: invalid output template: %s' % str(err))
273 self.pmkdir(filename)
274 except (OSError, IOError), err:
275 retcode = self.trouble('ERROR: unable to create directories: %s' % str(err))
278 outstream = open(filename, 'wb')
279 except (OSError, IOError), err:
280 retcode = self.trouble('ERROR: unable to open for writing: %s' % str(err))
283 self._do_download(outstream, result['url'])
285 except (OSError, IOError), err:
286 retcode = self.trouble('ERROR: unable to write video data: %s' % str(err))
288 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
289 retcode = self.trouble('ERROR: unable to download video data: %s' % str(err))
292 if not suitable_found:
293 retcode = self.trouble('ERROR: no suitable InfoExtractor: %s' % url)
297 def _do_download(self, stream, url):
298 request = urllib2.Request(url, None, std_headers)
299 data = urllib2.urlopen(request)
300 data_len = data.info().get('Content-length', None)
301 data_len_str = self.format_bytes(data_len)
307 percent_str = self.calc_percent(byte_counter, data_len)
308 eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
309 speed_str = self.calc_speed(start, time.time(), byte_counter)
310 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
314 data_block = data.read(block_size)
316 data_block_len = len(data_block)
317 if data_block_len == 0:
319 byte_counter += data_block_len
320 stream.write(data_block)
321 block_size = self.best_block_size(after - before, data_block_len)
324 self.slow_down(start, byte_counter)
327 if data_len is not None and str(byte_counter) != data_len:
328 raise ValueError('Content too short: %s/%s bytes' % (byte_counter, data_len))
330 class InfoExtractor(object):
331 """Information Extractor class.
333 Information extractors are the classes that, given a URL, extract
334 information from the video (or videos) the URL refers to. This
335 information includes the real video URL, the video title and simplified
336 title, author and others. It is returned in a list of dictionaries when
337 calling its extract() method. It is a list because a URL can refer to
338 more than one video (think of playlists). The dictionaries must include
339 the following fields:
341 id: Video identifier.
342 url: Final video URL.
343 uploader: Nickname of the video uploader.
344 title: Literal title.
345 stitle: Simplified title.
346 ext: Video filename extension.
348 Subclasses of this one should re-define the _real_initialize() and
349 _real_extract() methods, as well as the suitable() static method.
350 Probably, they should also be instantiated and added to the main
357 def __init__(self, downloader=None):
358 """Constructor. Receives an optional downloader."""
360 self.set_downloader(downloader)
364 """Receives a URL and returns True if suitable for this IE."""
367 def initialize(self):
368 """Initializes an instance (authentication, etc)."""
370 self._real_initialize()
373 def extract(self, url):
374 """Extracts URL information and returns it in list of dicts."""
376 return self._real_extract(url)
def set_downloader(self, downloader):
    """Remember which downloader this extractor is attached to.

    The object is stored in self._downloader.
    """
    self._downloader = downloader
382 def to_stdout(self, message):
383 """Print message to stdout if downloader is not in quiet mode."""
384 if self._downloader is None or not self._downloader.get_params().get('quiet', False):
def to_stderr(self, message):
    """Send message plus a trailing newline to the standard error stream."""
    stream = sys.stderr
    stream.write('%s\n' % message)
391 def _real_initialize(self):
392 """Real initialization process. Redefine in subclasses."""
395 def _real_extract(self, url):
396 """Real extraction process. Redefine in subclasses."""
399 class YoutubeIE(InfoExtractor):
400 """Information extractor for youtube.com."""
402 _LOGIN_URL = 'http://www.youtube.com/login?next=/'
403 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/'
404 _NETRC_MACHINE = 'youtube'
def report_login(self):
    """Announce, through to_stdout(), that a login attempt is being made."""
    notice = '[youtube] Logging in'
    self.to_stdout(notice)
def report_age_confirmation(self):
    """Announce, through to_stdout(), that age confirmation is being attempted."""
    notice = '[youtube] Confirming age'
    self.to_stdout(notice)
def report_webpage_download(self, video_id):
    """Announce, through to_stdout(), that the video's web page is being fetched."""
    notice = '[youtube] %s: Downloading video webpage' % video_id
    self.to_stdout(notice)
def report_information_extraction(self, video_id):
    """Announce, through to_stdout(), that video information is being extracted."""
    notice = '[youtube] %s: Extracting video information' % video_id
    self.to_stdout(notice)
def report_video_url(self, video_id, video_real_url):
    """Announce, through to_stdout(), the final URL extracted for a video."""
    fields = (video_id, video_real_url)
    self.to_stdout('[youtube] %s: URL: %s' % fields)
426 def _real_initialize(self):
427 if self._downloader is None:
432 downloader_params = self._downloader.get_params()
434 # Attempt to use provided username and password or .netrc data
435 if downloader_params.get('username', None) is not None:
436 username = downloader_params['username']
437 password = downloader_params['password']
438 elif downloader_params.get('usenetrc', False):
440 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
445 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
446 except (IOError, netrc.NetrcParseError), err:
447 self.to_stderr('WARNING: parsing .netrc: %s' % str(err))
450 # No authentication to be performed
456 'current_form': 'loginForm',
458 'action_login': 'Log In',
459 'username': username,
460 'password': password,
462 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
465 login_results = urllib2.urlopen(request).read()
466 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
467 self.to_stderr('WARNING: unable to log in: bad username or password')
469 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
470 self.to_stderr('WARNING: unable to log in: %s' % str(err))
476 'action_confirm': 'Confirm',
478 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
480 self.report_age_confirmation()
481 age_results = urllib2.urlopen(request).read()
482 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
483 self.to_stderr('ERROR: unable to confirm age: %s' % str(err))
486 def _real_extract(self, url):
487 # Extract video id from URL
488 mobj = re.match(r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$', url)
490 self.to_stderr('ERROR: invalid URL: %s' % url)
492 video_id = mobj.group(2)
494 # Downloader parameters
496 if self._downloader is not None:
497 params = self._downloader.get_params()
498 format_param = params.get('format', None)
501 video_extension = {'18': 'mp4', '17': '3gp'}.get(format_param, 'flv')
503 # Normalize URL, including format
504 normalized_url = 'http://www.youtube.com/watch?v=%s' % video_id
505 if format_param is not None:
506 normalized_url = '%s&fmt=%s' % (normalized_url, format_param)
507 request = urllib2.Request(normalized_url, None, std_headers)
509 self.report_webpage_download(video_id)
510 video_webpage = urllib2.urlopen(request).read()
511 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
512 self.to_stderr('ERROR: unable to download video webpage: %s' % str(err))
514 self.report_information_extraction(video_id)
517 mobj = re.search(r', "t": "([^"]+)"', video_webpage)
519 self.to_stderr('ERROR: unable to extract "t" parameter')
521 video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s' % (video_id, mobj.group(1))
522 if format_param is not None:
523 video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
524 self.report_video_url(video_id, video_real_url)
527 mobj = re.search(r'More From: ([^<]*)<', video_webpage)
529 self.to_stderr('ERROR: unable to extract uploader nickname')
531 video_uploader = mobj.group(1)
534 mobj = re.search(r'(?im)<title>YouTube - ([^<]*)</title>', video_webpage)
536 self.to_stderr('ERROR: unable to extract video title')
538 video_title = mobj.group(1).decode('utf-8')
539 video_title = re.sub(ur'(?u)&(.+?);', lambda x: unichr(htmlentitydefs.name2codepoint[x.group(1)]), video_title)
540 video_title = video_title.replace(os.sep, u'%')
543 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
544 simple_title = simple_title.strip(ur'_')
549 'url': video_real_url,
550 'uploader': video_uploader,
551 'title': video_title,
552 'stitle': simple_title,
553 'ext': video_extension,
556 if __name__ == '__main__':
558 # Modules needed only when running the main program
562 # General configuration
563 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
564 urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
565 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
568 parser = optparse.OptionParser(
569 usage='Usage: %prog [options] url...',
570 version='2008.07.22',
571 conflict_handler='resolve',
573 parser.add_option('-h', '--help',
574 action='help', help='print this help text and exit')
575 parser.add_option('-v', '--version',
576 action='version', help='print program version and exit')
577 parser.add_option('-u', '--username',
578 dest='username', metavar='UN', help='account username')
579 parser.add_option('-p', '--password',
580 dest='password', metavar='PW', help='account password')
581 parser.add_option('-o', '--output',
582 dest='outtmpl', metavar='TPL', help='output filename template')
583 parser.add_option('-q', '--quiet',
584 action='store_true', dest='quiet', help='activates quiet mode', default=False)
585 parser.add_option('-s', '--simulate',
586 action='store_true', dest='simulate', help='do not download video', default=False)
587 parser.add_option('-t', '--title',
588 action='store_true', dest='usetitle', help='use title in file name', default=False)
589 parser.add_option('-l', '--literal',
590 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
591 parser.add_option('-n', '--netrc',
592 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
593 parser.add_option('-g', '--get-url',
594 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
595 parser.add_option('-e', '--get-title',
596 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
597 parser.add_option('-f', '--format',
598 dest='format', metavar='FMT', help='video format code')
599 parser.add_option('-b', '--best-quality',
600 action='store_const', dest='format', help='alias for -f 18', const='18')
601 parser.add_option('-m', '--mobile-version',
602 action='store_const', dest='format', help='alias for -f 17', const='17')
603 parser.add_option('-i', '--ignore-errors',
604 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
605 parser.add_option('-r', '--rate-limit',
606 dest='ratelimit', metavar='L', help='download rate limit (e.g. 50k or 44.6m)')
607 (opts, args) = parser.parse_args()
609 # Conflicting, missing and erroneous options
611 sys.exit('ERROR: you must provide at least one URL')
612 if opts.usenetrc and (opts.username is not None or opts.password is not None):
613 sys.exit('ERROR: using .netrc conflicts with giving username/password')
614 if opts.password is not None and opts.username is None:
615 sys.exit('ERROR: account username missing')
616 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
617 sys.exit('ERROR: using output template conflicts with using title or literal title')
618 if opts.usetitle and opts.useliteral:
619 sys.exit('ERROR: using title conflicts with using literal title')
620 if opts.username is not None and opts.password is None:
621 opts.password = getpass.getpass('Type account password and press return:')
622 if opts.ratelimit is not None:
623 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
624 if numeric_limit is None:
625 sys.exit('ERROR: invalid rate limit specified')
626 opts.ratelimit = numeric_limit
628 # Information extractors
629 youtube_ie = YoutubeIE()
632 fd = FileDownloader({
633 'usenetrc': opts.usenetrc,
634 'username': opts.username,
635 'password': opts.password,
636 'quiet': (opts.quiet or opts.geturl or opts.gettitle),
637 'forceurl': opts.geturl,
638 'forcetitle': opts.gettitle,
639 'simulate': (opts.simulate or opts.geturl or opts.gettitle),
640 'format': opts.format,
641 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl)
642 or (opts.usetitle and '%(stitle)s-%(id)s.%(ext)s')
643 or (opts.useliteral and '%(title)s-%(id)s.%(ext)s')
644 or '%(id)s.%(ext)s'),
645 'ignoreerrors': opts.ignoreerrors,
646 'ratelimit': opts.ratelimit,
648 fd.add_info_extractor(youtube_ie)
649 retcode = fd.download(args)
652 except DownloadError:
654 except SameFileError:
655 sys.exit('ERROR: fixed output name but more than one file to download')
656 except KeyboardInterrupt:
657 sys.exit('\nERROR: Interrupted by user')