2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # License: Public domain code
# HTTP headers sent with every request so that the remote site sees what
# looks like an ordinary desktop browser.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.1) Gecko/2008070208 Firefox/3.0.1',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
    'Accept-Language': 'en-us,en;q=0.5',
}

# Characters allowed in a "simplified" title. The u'' prefix makes the result
# a unicode string on Python 2 (same value as decoding both parts as ASCII).
simple_title_chars = u'' + string.ascii_letters + string.digits
class FileDownloader(object):
    """File Downloader class.

    File downloader objects are the ones responsible for downloading the
    actual video file and writing it to disk if the user has requested
    it, among some other tasks. In most cases there should be one per
    program. Given a video URL, the downloader does not know how to
    extract all the needed information (that is the task of the
    InfoExtractors), so it has to pass the URL to one of them.

    For this, file downloader objects have a method that allows
    InfoExtractors to be registered in a given order. When it is passed
    a URL, the file downloader hands it to the first InfoExtractor it
    finds that reports being able to handle it. The InfoExtractor returns
    all the information to the FileDownloader and the latter downloads the
    file or does whatever it is instructed to do.

    File downloaders accept a lot of parameters. In order not to saturate
    the object constructor with arguments, it receives a dictionary of
    options instead. These options are available through the get_params()
    method for the InfoExtractors to use. The FileDownloader also registers
    itself as the downloader in charge of the InfoExtractors that are
    added to it, so this is a "mutual registration".

    Available options:

    username:   Username for authentication purposes.
    password:   Password for authentication purposes.
    usenetrc:   Use netrc for authentication instead.
    quiet:      Do not print messages to stdout.
    forceurl:   Force printing final URL.
    forcetitle: Force printing title.
    simulate:   Do not download the video files.
    format:     Video format code.
    outtmpl:    Template for output names.
    """

    _params = None
    _ies = []

    def __init__(self, params):
        """Create a downloader configured by the `params` dictionary."""
        # Per-instance list so extractors are never shared between instances.
        self._ies = []
        self.set_params(params)

    @staticmethod
    def pmkdir(filename):
        """Create directory components in filename. Similar to Unix "mkdir -p".

        Only the directory part of `filename` is created; the last path
        component is treated as the file name. Raises OSError if a directory
        cannot be created.
        """
        directory = os.path.dirname(filename)
        # An empty directory part (bare file name) needs no work. Going
        # through dirname also keeps absolute paths working, where splitting
        # on os.sep yields an empty first component that cannot be created.
        if directory and not os.path.isdir(directory):
            os.makedirs(directory)

    @staticmethod
    def format_bytes(bytes):
        """Return `bytes` as a short human-readable string such as '1.50k'.

        Accepts a number or a numeric string; None yields 'N/A'.
        """
        if bytes is None:
            return 'N/A'
        suffixes = 'bkMGTPEZY'
        amount = float(bytes)
        if amount <= 0.0:
            exponent = 0
        else:
            exponent = int(math.log(amount, 1024.0))
            # Clamp: values below 1 would otherwise give a negative index
            # (wrapping around to 'Y') and huge values would overflow the
            # suffix table.
            exponent = min(max(exponent, 0), len(suffixes) - 1)
        return '%.2f%s' % (amount / float(1024 ** exponent), suffixes[exponent])

    @staticmethod
    def calc_percent(byte_counter, data_len):
        """Return the completed percentage as a 6-character string."""
        if data_len is None:
            return '---.-%'
        return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))

    @staticmethod
    def calc_eta(start, now, total, current):
        """Return the estimated remaining time as 'MM:SS', or '--:--'.

        '--:--' is returned when the total size is unknown, when no rate can
        be measured yet, or when the estimate does not fit in two digits of
        minutes.
        """
        if total is None:
            return '--:--'
        dif = now - start
        if current == 0 or dif < 0.001:  # One millisecond
            return '--:--'
        rate = float(current) / dif
        eta = int((float(total) - float(current)) / rate)
        (eta_mins, eta_secs) = divmod(eta, 60)
        if eta_mins > 99:
            return '--:--'
        return '%02d:%02d' % (eta_mins, eta_secs)

    @staticmethod
    def calc_speed(start, now, bytes):
        """Return the average transfer speed as a 10-character string."""
        dif = now - start
        if bytes == 0 or dif < 0.001:  # One millisecond
            return '%10s' % '---b/s'
        return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))

    @staticmethod
    def best_block_size(elapsed_time, bytes):
        """Return the next read size given how long the last block took.

        The result is the measured rate (bytes per second) limited to
        between half and double the previous block, and never above 4 MB.
        """
        new_min = max(bytes / 2.0, 1.0)
        new_max = min(max(bytes * 2.0, 1.0), 4194304)  # Do not surpass 4 MB
        if elapsed_time < 0.001:
            return int(new_max)
        rate = bytes / elapsed_time
        if rate > new_max:
            return int(new_max)
        if rate < new_min:
            return int(new_min)
        return int(rate)

    def set_params(self, params):
        """Sets parameters. Raises ValueError unless `params` is a dict."""
        if not isinstance(params, dict):
            raise ValueError('params: dictionary expected')
        self._params = params

    def get_params(self):
        """Get parameters."""
        return self._params

    def add_info_extractor(self, ie):
        """Add an InfoExtractor object to the end of the list."""
        self._ies.append(ie)
        ie.set_downloader(self)

    def to_stdout(self, message, skip_eol=False):
        """Print message to stdout if not in quiet mode."""
        if not self._params.get('quiet', False):
            sys.stdout.write('%s%s' % (message, ['\n', ''][skip_eol]))
            # Flush so that '\r' progress updates without a newline show up.
            sys.stdout.flush()

    def to_stderr(self, message):
        """Print message to stderr."""
        sys.stderr.write('%s\n' % message)

    def fixed_template(self):
        """Checks if the output template is fixed."""
        return (re.search(u'(?u)%\\(.+?\\)s', self._params['outtmpl']) is None)

    def download(self, url_list):
        """Download a given list of URLs.

        Returns 0 when everything succeeded and 1 when any URL or result
        failed. Exits the program if a fixed output name would be used for
        more than one file.
        """
        retcode = 0
        if len(url_list) > 1 and self.fixed_template():
            sys.exit('ERROR: fixed output name but more than one file to download')

        for url in url_list:
            suitable_found = False
            for ie in self._ies:
                if not ie.suitable(url):
                    continue
                # Suitable InfoExtractor found
                suitable_found = True
                all_results = ie.extract(url) or []
                # Extractors report a failed item as None.
                results = [x for x in all_results if x is not None]
                if len(results) != len(all_results):
                    retcode = 1

                if len(results) > 1 and self.fixed_template():
                    sys.exit('ERROR: fixed output name but more than one file to download')

                for result in results:
                    if not self._process_result(result):
                        retcode = 1
                # Only the first suitable extractor handles a URL.
                break
            if not suitable_found:
                self.to_stderr('ERROR: no suitable InfoExtractor: %s' % url)
                retcode = 1

        return retcode

    def _process_result(self, result):
        """Print and/or download one extracted result; return True on success."""
        # Forced printings
        if self._params.get('forcetitle', False):
            print(result['title'])
        if self._params.get('forceurl', False):
            print(result['url'])

        # Do nothing else if in simulate mode
        if self._params.get('simulate', False):
            return True

        try:
            filename = self._params['outtmpl'] % result
        except (ValueError, KeyError) as err:
            self.to_stderr('ERROR: invalid output template: %s' % str(err))
            return False
        try:
            self.pmkdir(filename)
        except (OSError, IOError) as err:
            self.to_stderr('ERROR: unable to create directories: %s' % str(err))
            return False
        try:
            outstream = open(filename, 'wb')
        except (OSError, IOError) as err:
            self.to_stderr('ERROR: unable to open for writing: %s' % str(err))
            return False
        try:
            try:
                self._do_download(outstream, result['url'])
            finally:
                # Close the file even when the transfer fails.
                outstream.close()
        except (OSError, IOError) as err:
            self.to_stderr('ERROR: unable to write video data: %s' % str(err))
            return False
        except (urllib2.URLError, httplib.HTTPException, socket.error, ValueError) as err:
            # ValueError covers the "Content too short" check in _do_download.
            self.to_stderr('ERROR: unable to download video data: %s' % str(err))
            return False
        return True

    def _do_download(self, stream, url):
        """Fetch `url` and write its body to `stream`, reporting progress.

        Raises ValueError if fewer or more bytes than announced by the
        Content-Length header were received.
        """
        request = urllib2.Request(url, None, std_headers)
        data = urllib2.urlopen(request)
        try:
            # The header value is text; work with an integer from here on and
            # treat a missing or malformed header as "size unknown".
            data_len = data.info().get('Content-length', None)
            if data_len is not None:
                try:
                    data_len = int(data_len)
                except ValueError:
                    data_len = None
            data_len_str = self.format_bytes(data_len)
            byte_counter = 0
            block_size = 1024
            start = time.time()
            while True:
                percent_str = self.calc_percent(byte_counter, data_len)
                eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
                speed_str = self.calc_speed(start, time.time(), byte_counter)
                self.to_stdout('\r[download] %s of %s at %s ETA %s' %
                               (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)

                before = time.time()
                data_block = data.read(block_size)
                after = time.time()
                data_block_len = len(data_block)
                if data_block_len == 0:
                    break
                byte_counter += data_block_len
                stream.write(data_block)
                # Adapt the next read size to the observed throughput.
                block_size = self.best_block_size(after - before, data_block_len)
        finally:
            data.close()

        # Terminate the progress line.
        self.to_stdout('')
        if data_len is not None and byte_counter != data_len:
            raise ValueError('Content too short: %s/%s bytes' % (byte_counter, data_len))
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information from the video (or videos) the URL refers to. This
    information includes the real video URL, the video title and simplified
    title, author and others. It is returned in a list of dictionaries when
    calling its extract() method. It is a list because a URL can refer to
    more than one video (think of playlists). The dictionaries must include
    the following fields:

    id:       Video identifier.
    url:      Final video URL.
    uploader: Nickname of the video uploader.
    title:    Literal title.
    stitle:   Simplified title.
    ext:      Video filename extension.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods, as well as the suitable() static method.
    Probably, they should also be instantiated and added to the main
    downloader.
    """

    _ready = False
    _downloader = None

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @staticmethod
    def suitable(url):
        """Receives a URL and returns True if suitable for this IE."""
        return True

    def initialize(self):
        """Initializes an instance (login, etc)."""
        # Run the real initialization only once per instance.
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def to_stdout(self, message):
        """Print message to stdout unless the downloader is in quiet mode."""
        if self._downloader is None or not self._downloader.get_params().get('quiet', False):
            print(message)

    def to_stderr(self, message):
        """Print message to stderr."""
        sys.stderr.write('%s\n' % message)

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        # An empty list keeps the "list of dicts" contract of extract() for
        # callers that take len() of, or iterate over, the result.
        return []
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    _LOGIN_URL = 'http://www.youtube.com/login?next=/'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/'
    _NETRC_MACHINE = 'youtube'

    @staticmethod
    def _decode_entity(match):
        """Return the character for one matched HTML entity.

        Handles named entities and decimal/hexadecimal numeric entities.
        Anything that cannot be decoded is returned unchanged instead of
        raising.
        """
        name = match.group(1)
        try:
            if name.startswith(u'#'):
                if name[1:2] in (u'x', u'X'):
                    return unichr(int(name[2:], 16))
                return unichr(int(name[1:], 10))
            if name in htmlentitydefs.name2codepoint:
                return unichr(htmlentitydefs.name2codepoint[name])
        except ValueError:
            # Malformed number or code point out of range.
            pass
        return match.group(0)

    def _real_initialize(self):
        """Log in and confirm age when credentials are available.

        Does nothing without a downloader or without credentials. Login
        problems only produce a warning; a failed age confirmation request
        exits the program.
        """
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.get_params()

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is None:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
                # authenticators() yields (login, account, password).
                username = info[0]
                password = info[2]
            except (IOError, netrc.NetrcParseError) as err:
                self.to_stderr('WARNING: parsing .netrc: %s' % str(err))
                return

        # Nothing to authenticate with
        if username is None:
            return

        # Log in
        # NOTE(review): the 'next' field is reconstructed from the login URL's
        # query string; confirm against the form the site expects.
        login_form = {
            'current_form': 'loginForm',
            'next': '/',
            'action_login': 'Log In',
            'username': username,
            'password': password,
        }
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
        try:
            self.to_stdout('[youtube] Logging in')
            login_results = urllib2.urlopen(request).read()
            # Getting the login form back means the credentials were rejected.
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self.to_stderr('WARNING: unable to log in: bad username or password')
                return
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self.to_stderr('WARNING: unable to log in: %s' % str(err))
            return

        # Confirm age
        # NOTE(review): the 'next_url' field is reconstructed from the age
        # URL's query string; confirm against the form the site expects.
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
        try:
            self.to_stdout('[youtube] Confirming age')
            urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            sys.exit('ERROR: unable to confirm age: %s' % str(err))

    def _real_extract(self, url):
        """Return a one-element list describing the video at `url`.

        The element is a dictionary with the id, url, uploader, title,
        stitle and ext fields, or None when some piece of information could
        not be extracted. Exits the program if the video page cannot be
        downloaded.
        """
        # Extract video id from URL
        mobj = re.match(r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$', url)
        if mobj is None:
            self.to_stderr('ERROR: invalid URL: %s' % url)
            return [None]
        video_id = mobj.group(2)

        # Downloader parameters
        format_param = None
        if self._downloader is not None:
            params = self._downloader.get_params()
            format_param = params.get('format', None)

        # Extension: format 18 is mp4, everything else is treated as flv
        video_extension = {'18': 'mp4'}.get(format_param, 'flv')

        # Normalize URL, including format
        normalized_url = 'http://www.youtube.com/watch?v=%s' % video_id
        if format_param is not None:
            normalized_url = '%s&fmt=%s' % (normalized_url, format_param)
        request = urllib2.Request(normalized_url, None, std_headers)
        try:
            self.to_stdout('[youtube] %s: Downloading video webpage' % video_id)
            video_webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            sys.exit('ERROR: unable to download video: %s' % str(err))
        self.to_stdout('[youtube] %s: Extracting video information' % video_id)

        # "t" parameter, needed to build the real video URL
        mobj = re.search(r', "t": "([^"]+)"', video_webpage)
        if mobj is None:
            self.to_stderr('ERROR: unable to extract "t" parameter')
            return [None]
        video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s' % (video_id, mobj.group(1))
        if format_param is not None:
            video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
        self.to_stdout('[youtube] %s: URL: %s' % (video_id, video_real_url))

        # Uploader
        mobj = re.search(r'More From: ([^<]*)<', video_webpage)
        if mobj is None:
            self.to_stderr('ERROR: unable to extract uploader nickname')
            return [None]
        video_uploader = mobj.group(1)

        # Title
        mobj = re.search(r'(?im)<title>YouTube - ([^<]*)</title>', video_webpage)
        if mobj is None:
            self.to_stderr('ERROR: unable to extract video title')
            return [None]
        video_title = mobj.group(1).decode('utf-8')
        # Only well-formed entity references are matched, so stray '&' and
        # ';' characters in a title are left alone.
        video_title = re.sub(u'(?u)&(#?[0-9A-Za-z]+);', self._decode_entity, video_title)
        # The title may end up in a file name; keep path separators out.
        video_title = video_title.replace(os.sep, u'%')

        # Simplified title: runs of other characters collapse into one '_'
        simple_title = re.sub(u'(?u)([^%s]+)' % simple_title_chars, u'_', video_title)
        simple_title = simple_title.strip(u'_')

        # Return information
        return [{
            'id': video_id,
            'url': video_real_url,
            'uploader': video_uploader,
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension,
        }]
if __name__ == '__main__':
    try:
        # Modules needed only when running the main program
        import getpass
        import optparse

        # General configuration
        # A single opener carries both handlers; installing two openers in a
        # row would make the second replace the first.
        urllib2.install_opener(urllib2.build_opener(
            urllib2.ProxyHandler(), urllib2.HTTPCookieProcessor()))
        socket.setdefaulttimeout(300)  # 5 minutes should be enough (famous last words)

        # Parse command line
        # NOTE(review): the version string is not visible in the reviewed
        # extract; 'INTERNAL' is a placeholder to confirm.
        parser = optparse.OptionParser(
            usage='Usage: %prog [options] url...',
            version='INTERNAL',
            conflict_handler='resolve',
        )
        parser.add_option('-h', '--help',
                          action='help', help='print this help text and exit')
        parser.add_option('-v', '--version',
                          action='version', help='print program version and exit')
        parser.add_option('-u', '--username',
                          dest='username', metavar='UN', help='account username')
        parser.add_option('-p', '--password',
                          dest='password', metavar='PW', help='account password')
        parser.add_option('-o', '--output',
                          dest='outtmpl', metavar='TPL', help='output filename template')
        parser.add_option('-q', '--quiet',
                          action='store_true', dest='quiet', help='activates quiet mode', default=False)
        parser.add_option('-s', '--simulate',
                          action='store_true', dest='simulate', help='do not download video', default=False)
        parser.add_option('-t', '--title',
                          action='store_true', dest='usetitle', help='use title in file name', default=False)
        parser.add_option('-l', '--literal',
                          action='store_true', dest='useliteral', help='use literal title in file name', default=False)
        parser.add_option('-n', '--netrc',
                          action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
        parser.add_option('-g', '--get-url',
                          action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
        parser.add_option('-e', '--get-title',
                          action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
        parser.add_option('-f', '--format',
                          dest='format', metavar='FMT', help='video format code')
        # Must store into the same destination as -f, otherwise the alias
        # sets an attribute nobody reads and has no effect.
        parser.add_option('-b', '--best-quality',
                          action='store_const', dest='format', help='alias for -f 18', const='18')
        (opts, args) = parser.parse_args()

        # Conflicting, missing and erroneous options
        if len(args) < 1:
            sys.exit('ERROR: you must provide at least one URL')
        if opts.usenetrc and (opts.username is not None or opts.password is not None):
            sys.exit('ERROR: using .netrc conflicts with giving username/password')
        if opts.password is not None and opts.username is None:
            sys.exit('ERROR: account username missing')
        if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
            sys.exit('ERROR: using output template conflicts with using title or literal title')
        if opts.usetitle and opts.useliteral:
            sys.exit('ERROR: using title conflicts with using literal title')
        if opts.username is not None and opts.password is None:
            opts.password = getpass.getpass('Type account password and press return:')

        # Output template: explicit template first, then the title-based
        # presets, then the plain video id.
        if opts.outtmpl:
            outtmpl = opts.outtmpl
        elif opts.usetitle:
            outtmpl = '%(stitle)s-%(id)s.%(ext)s'
        elif opts.useliteral:
            outtmpl = '%(title)s-%(id)s.%(ext)s'
        else:
            outtmpl = '%(id)s.%(ext)s'

        # Information extractors
        youtube_ie = YoutubeIE()

        # File downloader
        fd = FileDownloader({
            'usenetrc': opts.usenetrc,
            'username': opts.username,
            'password': opts.password,
            # The "get" options imply both quiet and simulate.
            'quiet': (opts.quiet or opts.geturl or opts.gettitle),
            'forceurl': opts.geturl,
            'forcetitle': opts.gettitle,
            'simulate': (opts.simulate or opts.geturl or opts.gettitle),
            'format': opts.format,
            'outtmpl': outtmpl,
        })
        fd.add_info_extractor(youtube_ie)
        retcode = fd.download(args)
        sys.exit(retcode)

    except KeyboardInterrupt:
        sys.exit('\nERROR: Interrupted by user')