2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # License: Public domain code
20 'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.1) Gecko/2008070208 Firefox/3.0.1',
21 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
22 'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
23 'Accept-Language': 'en-us,en;q=0.5',
26 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
28 class FileDownloader(object):
29 """File Downloader class.
31 File downloader objects are the ones responsible for downloading the
32 actual video file and writing it to disk if the user has requested
33 it, among some other tasks. In most cases there should be one per
34 program. Given a video URL, the downloader does not know how to
35 extract all the needed information (that is the job of InfoExtractors),
36 so it has to pass the URL to one of them.
38 For this, file downloader objects have a method that allows
39 InfoExtractors to be registered in a given order. When it is passed
40 a URL, the file downloader hands it to the first InfoExtractor it
41 finds that reports being able to handle it. The InfoExtractor returns
42 all the information to the FileDownloader and the latter downloads the
43 file or does whatever it's instructed to do.
45 File downloaders accept a lot of parameters. In order not to saturate
46 the object constructor with arguments, it receives a dictionary of
47 options instead. These options are available through the get_params()
48 method for the InfoExtractors to use. The FileDownloader also registers
49 itself as the downloader in charge for the InfoExtractors that are
50 added to it, so this is a "mutual registration".
54 username: Username for authentication purposes.
55 password: Password for authentication purposes.
56 usenetrc: Use netrc for authentication instead.
57 quiet: Do not print messages to stdout.
58 forceurl: Force printing final URL.
59 forcetitle: Force printing title.
60 simulate: Do not download the video files.
61 format: Video format code.
62 outtmpl: Template for output names.
63 ignoreerrors: Do not stop on download errors.
69 def __init__(self, params):
71 self.set_params(params)
75 """Create directory components in filename. Similar to Unix "mkdir -p"."""
76 components = filename.split(os.sep)
77 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
79 if not os.path.exists(dir):
83 def format_bytes(bytes):
89 exponent = long(math.log(float(bytes), 1024.0))
90 suffix = 'bkMGTPEZY'[exponent]
91 converted = float(bytes) / float(1024**exponent)
92 return '%.2f%s' % (converted, suffix)
95 def calc_percent(byte_counter, data_len):
98 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
101 def calc_eta(start, now, total, current):
105 if current == 0 or dif < 0.001: # One millisecond
107 rate = float(current) / dif
108 eta = long((float(total) - float(current)) / rate)
109 (eta_mins, eta_secs) = divmod(eta, 60)
112 return '%02d:%02d' % (eta_mins, eta_secs)
115 def calc_speed(start, now, bytes):
117 if bytes == 0 or dif < 0.001: # One millisecond
118 return '%10s' % '---b/s'
119 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
122 def best_block_size(elapsed_time, bytes):
123 new_min = max(bytes / 2.0, 1.0)
124 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
125 if elapsed_time < 0.001:
127 rate = bytes / elapsed_time
def set_params(self, params):
    """Sets parameters.

    Stores params (by reference, not copied) as the option dictionary
    that the other methods read through self._params.

    Any dict instance is accepted, including subclasses such as
    collections.OrderedDict; anything else raises ValueError.
    """
    # isinstance() instead of an exact type comparison, so dict
    # subclasses are not rejected.
    if not isinstance(params, dict):
        raise ValueError('params: dictionary expected')
    self._params = params
140 def get_params(self):
141 """Get parameters."""
144 def add_info_extractor(self, ie):
145 """Add an InfoExtractor object to the end of the list."""
147 ie.set_downloader(self)
149 def to_stdout(self, message, skip_eol=False):
150 """Print message to stdout if not in quiet mode."""
151 if not self._params.get('quiet', False):
152 sys.stdout.write('%s%s' % (message, ['\n', ''][skip_eol]))
def to_stderr(self, message):
    """Write message, followed by a newline, to standard error.

    The 'quiet' option is not consulted here, so the message is
    always written.
    """
    line = '%s\n' % message
    sys.stderr.write(line)
def fixed_template(self):
    """Checks if the output template is fixed.

    Returns True when the 'outtmpl' parameter contains no
    %(name)s placeholder, i.e. formatting it with any video
    information would always give the same output name. Returns
    False as soon as one such placeholder is present.

    Raises KeyError if the 'outtmpl' parameter has not been set.
    """
    # A plain raw string is used instead of a ur'' literal: the pattern
    # is pure ASCII, so matching is unchanged, and the ur prefix is
    # rejected by Python 3.
    return re.search(r'(?u)%\(.+?\)s', self._params['outtmpl']) is None
163 def trouble(self, message=None):
164 """Determine action to take when a download problem appears.
166 Depending on if the downloader has been configured to ignore
167 download errors or not, this method may exit the program or
168 not when errors are found, after printing the message. If it
169 doesn't exit, it returns an error code suitable to be returned
170 later as a program exit code to indicate error.
172 if message is not None:
173 self.to_stderr(message)
174 if not self._params.get('ignoreerrors', False):
178 def download(self, url_list):
179 """Download a given list of URLs."""
181 if len(url_list) > 1 and self.fixed_template():
182 sys.exit('ERROR: fixed output name but more than one file to download')
185 suitable_found = False
187 if not ie.suitable(url):
189 # Suitable InfoExtractor found
190 suitable_found = True
191 all_results = ie.extract(url)
192 results = [x for x in all_results if x is not None]
193 if len(results) != len(all_results):
194 retcode = self.trouble()
196 if len(results) > 1 and self.fixed_template():
197 sys.exit('ERROR: fixed output name but more than one file to download')
199 for result in results:
202 if self._params.get('forcetitle', False):
203 print result['title']
204 if self._params.get('forceurl', False):
207 # Do nothing else if in simulate mode
208 if self._params.get('simulate', False):
212 filename = self._params['outtmpl'] % result
213 except (ValueError, KeyError), err:
214 retcode = self.trouble('ERROR: invalid output template: %s' % str(err))
217 self.pmkdir(filename)
218 except (OSError, IOError), err:
219 retcode = self.trouble('ERROR: unable to create directories: %s' % str(err))
222 outstream = open(filename, 'wb')
223 except (OSError, IOError), err:
224 retcode = self.trouble('ERROR: unable to open for writing: %s' % str(err))
227 self._do_download(outstream, result['url'])
229 except (OSError, IOError), err:
230 retcode = self.trouble('ERROR: unable to write video data: %s' % str(err))
232 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
233 retcode = self.trouble('ERROR: unable to download video data: %s' % str(err))
236 if not suitable_found:
237 retcode = self.trouble('ERROR: no suitable InfoExtractor: %s' % url)
241 def _do_download(self, stream, url):
242 request = urllib2.Request(url, None, std_headers)
243 data = urllib2.urlopen(request)
244 data_len = data.info().get('Content-length', None)
245 data_len_str = self.format_bytes(data_len)
250 percent_str = self.calc_percent(byte_counter, data_len)
251 eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
252 speed_str = self.calc_speed(start, time.time(), byte_counter)
253 self.to_stdout('\r[download] %s of %s at %s ETA %s' %
254 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
257 data_block = data.read(block_size)
259 data_block_len = len(data_block)
260 if data_block_len == 0:
262 byte_counter += data_block_len
263 stream.write(data_block)
264 block_size = self.best_block_size(after - before, data_block_len)
267 if data_len is not None and str(byte_counter) != data_len:
268 raise ValueError('Content too short: %s/%s bytes' % (byte_counter, data_len))
270 class InfoExtractor(object):
271 """Information Extractor class.
273 Information extractors are the classes that, given a URL, extract
274 information from the video (or videos) the URL refers to. This
275 information includes the real video URL, the video title and simplified
276 title, author and others. It is returned in a list of dictionaries when
277 calling its extract() method. It is a list because a URL can refer to
278 more than one video (think of playlists). The dictionaries must include
279 the following fields:
281 id: Video identifier.
282 url: Final video URL.
283 uploader: Nickname of the video uploader.
284 title: Literal title.
285 stitle: Simplified title.
286 ext: Video filename extension.
288 Subclasses of this one should re-define the _real_initialize() and
289 _real_extract() methods, as well as the suitable() static method.
290 Probably, they should also be instantiated and added to the main
297 def __init__(self, downloader=None):
298 """Constructor. Receives an optional downloader."""
300 self.set_downloader(downloader)
304 """Receives a URL and returns True if suitable for this IE."""
307 def initialize(self):
308 """Initializes an instance (login, etc)."""
310 self._real_initialize()
313 def extract(self, url):
314 """Extracts URL information and returns it in list of dicts."""
316 return self._real_extract(url)
def set_downloader(self, downloader):
    """Sets the downloader for this IE.

    The object is stored as self._downloader without validation.
    None is accepted too, since the constructor forwards its default
    downloader=None to this method.
    """
    self._downloader = downloader
322 def to_stdout(self, message):
323 if self._downloader is None or not self._downloader.get_params().get('quiet', False):
def to_stderr(self, message):
    """Print message to stderr, followed by a newline."""
    text = '%s\n' % message
    sys.stderr.write(text)
329 def _real_initialize(self):
330 """Real initialization process. Redefine in subclasses."""
333 def _real_extract(self, url):
334 """Real extraction process. Redefine in subclasses."""
337 class YoutubeIE(InfoExtractor):
338 """Information extractor for youtube.com."""
340 _LOGIN_URL = 'http://www.youtube.com/login?next=/'
341 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/'
342 _NETRC_MACHINE = 'youtube'
344 def _real_initialize(self):
345 if self._downloader is None:
350 downloader_params = self._downloader.get_params()
352 # Attempt to use provided username and password or .netrc data
353 if downloader_params.get('username', None) is not None:
354 username = downloader_params['username']
355 password = downloader_params['password']
356 elif downloader_params.get('usenetrc', False):
358 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
363 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
364 except (IOError, netrc.NetrcParseError), err:
365 self.to_stderr('WARNING: parsing .netrc: %s' % str(err))
373 'current_form': 'loginForm',
375 'action_login': 'Log In',
376 'username': username,
377 'password': password,
379 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
381 self.to_stdout('[youtube] Logging in')
382 login_results = urllib2.urlopen(request).read()
383 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
384 self.to_stderr('WARNING: unable to log in: bad username or password')
386 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
387 self.to_stderr('WARNING: unable to log in: %s' % str(err))
393 'action_confirm': 'Confirm',
395 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
397 self.to_stdout('[youtube] Confirming age')
398 age_results = urllib2.urlopen(request).read()
399 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
400 sys.exit('ERROR: unable to confirm age: %s' % str(err))
402 def _real_extract(self, url):
403 # Extract video id from URL
404 mobj = re.match(r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$', url)
406 self.to_stderr('ERROR: invalid URL: %s' % url)
408 video_id = mobj.group(2)
410 # Downloader parameters
412 if self._downloader is not None:
413 params = self._downloader.get_params()
414 format_param = params.get('format', None)
417 video_extension = {'18': 'mp4'}.get(format_param, 'flv')
419 # Normalize URL, including format
420 normalized_url = 'http://www.youtube.com/watch?v=%s' % video_id
421 if format_param is not None:
422 normalized_url = '%s&fmt=%s' % (normalized_url, format_param)
423 request = urllib2.Request(normalized_url, None, std_headers)
425 self.to_stdout('[youtube] %s: Downloading video webpage' % video_id)
426 video_webpage = urllib2.urlopen(request).read()
427 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
428 sys.exit('ERROR: unable to download video: %s' % str(err))
429 self.to_stdout('[youtube] %s: Extracting video information' % video_id)
432 mobj = re.search(r', "t": "([^"]+)"', video_webpage)
434 self.to_stderr('ERROR: unable to extract "t" parameter')
436 video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s' % (video_id, mobj.group(1))
437 if format_param is not None:
438 video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
439 self.to_stdout('[youtube] %s: URL: %s' % (video_id, video_real_url))
442 mobj = re.search(r'More From: ([^<]*)<', video_webpage)
444 self.to_stderr('ERROR: unable to extract uploader nickname')
446 video_uploader = mobj.group(1)
449 mobj = re.search(r'(?im)<title>YouTube - ([^<]*)</title>', video_webpage)
451 self.to_stderr('ERROR: unable to extract video title')
453 video_title = mobj.group(1).decode('utf-8')
454 video_title = re.sub(ur'(?u)&(.+?);', lambda x: unichr(htmlentitydefs.name2codepoint[x.group(1)]), video_title)
455 video_title = video_title.replace(os.sep, u'%')
458 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
459 simple_title = simple_title.strip(ur'_')
464 'url': video_real_url,
465 'uploader': video_uploader,
466 'title': video_title,
467 'stitle': simple_title,
468 'ext': video_extension,
471 if __name__ == '__main__':
473 # Modules needed only when running the main program
477 # General configuration
478 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
479 urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
480 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
483 parser = optparse.OptionParser(
484 usage='Usage: %prog [options] url...',
486 conflict_handler='resolve',
488 parser.add_option('-h', '--help',
489 action='help', help='print this help text and exit')
490 parser.add_option('-v', '--version',
491 action='version', help='print program version and exit')
492 parser.add_option('-u', '--username',
493 dest='username', metavar='UN', help='account username')
494 parser.add_option('-p', '--password',
495 dest='password', metavar='PW', help='account password')
496 parser.add_option('-o', '--output',
497 dest='outtmpl', metavar='TPL', help='output filename template')
498 parser.add_option('-q', '--quiet',
499 action='store_true', dest='quiet', help='activates quiet mode', default=False)
500 parser.add_option('-s', '--simulate',
501 action='store_true', dest='simulate', help='do not download video', default=False)
502 parser.add_option('-t', '--title',
503 action='store_true', dest='usetitle', help='use title in file name', default=False)
504 parser.add_option('-l', '--literal',
505 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
506 parser.add_option('-n', '--netrc',
507 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
508 parser.add_option('-g', '--get-url',
509 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
510 parser.add_option('-e', '--get-title',
511 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
512 parser.add_option('-f', '--format',
513 dest='format', metavar='FMT', help='video format code')
514 parser.add_option('-b', '--best-quality',
515 action='store_const', dest='video_format', help='alias for -f 18', const='18')
516 parser.add_option('-i', '--ignore-errors',
517 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
518 (opts, args) = parser.parse_args()
520 # Conflicting, missing and erroneous options
522 sys.exit('ERROR: you must provide at least one URL')
523 if opts.usenetrc and (opts.username is not None or opts.password is not None):
524 sys.exit('ERROR: using .netrc conflicts with giving username/password')
525 if opts.password is not None and opts.username is None:
526 sys.exit('ERROR: account username missing')
527 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
528 sys.exit('ERROR: using output template conflicts with using title or literal title')
529 if opts.usetitle and opts.useliteral:
530 sys.exit('ERROR: using title conflicts with using literal title')
531 if opts.username is not None and opts.password is None:
532 opts.password = getpass.getpass('Type account password and press return:')
534 # Information extractors
535 youtube_ie = YoutubeIE()
538 fd = FileDownloader({
539 'usenetrc': opts.usenetrc,
540 'username': opts.username,
541 'password': opts.password,
542 'quiet': (opts.quiet or opts.geturl or opts.gettitle),
543 'forceurl': opts.geturl,
544 'forcetitle': opts.gettitle,
545 'simulate': (opts.simulate or opts.geturl or opts.gettitle),
546 'format': opts.format,
547 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl)
548 or (opts.usetitle and '%(stitle)s-%(id)s.%(ext)s')
549 or (opts.useliteral and '%(title)s-%(id)s.%(ext)s')
550 or '%(id)s.%(ext)s'),
551 'ignoreerrors': opts.ignoreerrors,
553 fd.add_info_extractor(youtube_ie)
554 retcode = fd.download(args)
557 except KeyboardInterrupt:
558 sys.exit('\nERROR: Interrupted by user')