# -*- coding: utf-8 -*-
# Author: Ricardo Garcia Gonzalez
# License: Public domain code
# HTTP headers sent with every request made by this program (they are passed
# to each urllib2.Request), so that the server sees an ordinary desktop browser.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.1) Gecko/2008070208 Firefox/3.0.1',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
    'Accept-Language': 'en-us,en;q=0.5',
}

# Characters kept verbatim when building a simplified title; every run of
# other characters is collapsed to an underscore. The value is a unicode
# string on Python 2 (as before); the encode/decode round trip keeps the
# expression valid on interpreters where str has no decode() method.
simple_title_chars = (string.ascii_letters + string.digits).encode('ascii').decode('ascii')
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """
    pass

class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass

class FileDownloader(object):
    """File Downloader class.

    File downloader objects are the ones responsible for downloading the
    actual video file and writing it to disk if the user has requested
    it, among some other tasks. In most cases there should be one per
    program. Since, given a video URL, the downloader doesn't know how to
    extract all the needed information (a task that InfoExtractors do), it
    has to pass the URL to one of them.

    For this, file downloader objects have a method that allows
    InfoExtractors to be registered in a given order. When it is passed
    a URL, the file downloader hands it to the first InfoExtractor it
    finds that reports being able to handle it. The InfoExtractor returns
    all the information to the FileDownloader and the latter downloads the
    file or does whatever it's instructed to do.

    File downloaders accept a lot of parameters. In order not to saturate
    the object constructor with arguments, it receives a dictionary of
    options instead. These options are available through the get_params()
    method for the InfoExtractors to use. The FileDownloader also registers
    itself as the downloader in charge for the InfoExtractors that are
    added to it, so this is a "mutual registration".

    Available options:

    username:     Username for authentication purposes.
    password:     Password for authentication purposes.
    usenetrc:     Use netrc for authentication instead.
    quiet:        Do not print messages to stdout.
    forceurl:     Force printing final URL.
    forcetitle:   Force printing title.
    simulate:     Do not download the video files.
    format:       Video format code.
    outtmpl:      Template for output names.
    ignoreerrors: Do not stop on download errors.
    """

    # Size suffixes indexed by the power of 1024 they stand for.
    _SIZE_SUFFIXES = 'bkMGTPEZY'

    def __init__(self, params):
        """Create a FileDownloader object with the given options."""
        self._ies = []
        self.set_params(params)

    @staticmethod
    def pmkdir(filename):
        """Create directory components in filename. Similar to Unix "mkdir -p".

        Only the directory part of filename is created; the final component
        is treated as the file name and is left alone. Nothing happens when
        filename has no directory part. Raises OSError if a directory cannot
        be created.
        """
        directory = os.path.dirname(filename)
        # An empty directory part (plain file name) must not reach mkdir:
        # building the path component by component yields '' first for
        # absolute names, and os.mkdir('') fails.
        if not directory or os.path.isdir(directory):
            return
        try:
            os.makedirs(directory)
        except OSError:
            # Tolerate the directory having appeared in the meantime.
            if not os.path.isdir(directory):
                raise

    @staticmethod
    def format_bytes(bytes):
        """Return a byte count as a short string such as '1.50k'.

        bytes may be a number or a numeric string; None yields 'N/A'.
        The suffix is chosen from 'bkMGTPEZY' by powers of 1024.
        """
        if bytes is None:
            return 'N/A'
        amount = float(bytes)
        if amount == 0.0:
            exponent = 0
        else:
            exponent = int(math.log(amount, 1024.0))
            # Clamp to the suffix table: tiny values give a negative
            # exponent (which would index from the end) and huge ones
            # would run past it.
            last = len(FileDownloader._SIZE_SUFFIXES) - 1
            exponent = min(max(exponent, 0), last)
        suffix = FileDownloader._SIZE_SUFFIXES[exponent]
        converted = amount / float(1024 ** exponent)
        return '%.2f%s' % (converted, suffix)

    @staticmethod
    def calc_percent(byte_counter, data_len):
        """Return the completed percentage as a 6-character string.

        A placeholder is returned when the total length is unknown (None).
        """
        if data_len is None:
            return '---.-%'
        return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))

    @staticmethod
    def calc_eta(start, now, total, current):
        """Return the estimated remaining time as 'MM:SS'.

        start and now are timestamps in seconds, total the expected size and
        current the amount received so far. '--:--' is returned when no
        estimate can be made or it does not fit in two minute digits.
        """
        if total is None:
            return '--:--'
        dif = now - start
        if current == 0 or dif < 0.001:  # One millisecond
            return '--:--'
        rate = float(current) / dif
        eta = int((float(total) - float(current)) / rate)
        (eta_mins, eta_secs) = divmod(eta, 60)
        if eta_mins > 99:
            return '--:--'
        return '%02d:%02d' % (eta_mins, eta_secs)

    @staticmethod
    def calc_speed(start, now, bytes):
        """Return the average speed since start as a 10-character string."""
        dif = now - start
        if bytes == 0 or dif < 0.001:  # One millisecond
            return '%10s' % '---b/s'
        return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))

    @staticmethod
    def best_block_size(elapsed_time, bytes):
        """Return the size to request for the next read.

        The size follows the rate observed for the last read (bytes received
        in elapsed_time seconds) but is kept between half and double the
        last block, and never above 4 MB.
        """
        new_min = max(bytes / 2.0, 1.0)
        new_max = min(max(bytes * 2.0, 1.0), 4194304)  # Do not surpass 4 MB
        if elapsed_time < 0.001:
            return int(new_max)
        rate = bytes / elapsed_time
        if rate > new_max:
            return int(new_max)
        if rate < new_min:
            return int(new_min)
        return int(rate)

    def set_params(self, params):
        """Sets parameters. Raises ValueError unless params is a dictionary."""
        if not isinstance(params, dict):
            raise ValueError('params: dictionary expected')
        self._params = params

    def get_params(self):
        """Get parameters."""
        return self._params

    def add_info_extractor(self, ie):
        """Add an InfoExtractor object to the end of the list."""
        self._ies.append(ie)
        ie.set_downloader(self)

    def to_stdout(self, message, skip_eol=False):
        """Print message to stdout if not in quiet mode."""
        if not self._params.get('quiet', False):
            sys.stdout.write('%s%s' % (message, '' if skip_eol else '\n'))
            # Progress lines are written without a newline, so flush
            # explicitly to make them visible.
            sys.stdout.flush()

    def to_stderr(self, message):
        """Print message to stderr."""
        sys.stderr.write('%s\n' % message)

    def fixed_template(self):
        """Checks if the output template is fixed.

        A template is fixed when it contains no %(name)s field, so every
        download would be written to the same file name.
        """
        return (re.search(u'(?u)%\\(.+?\\)s', self._params['outtmpl']) is None)

    def trouble(self, message=None):
        """Determine action to take when a download problem appears.

        Depending on if the downloader has been configured to ignore
        download errors or not, this method may throw an exception or
        not when errors are found, after printing the message. If it
        doesn't raise, it returns an error code suitable to be returned
        later as a program exit code to indicate error.
        """
        if message is not None:
            self.to_stderr(message)
        if not self._params.get('ignoreerrors', False):
            raise DownloadError(message)
        return 1

    def download(self, url_list):
        """Download a given list of URLs.

        Returns 0 on success or the error code reported by trouble() if
        any problem was ignored. Raises SameFileError when more than one
        file would be written to a fixed output name, and DownloadError
        (through trouble()) when errors are not being ignored.
        """
        retcode = 0
        if len(url_list) > 1 and self.fixed_template():
            raise SameFileError(self._params['outtmpl'])

        for url in url_list:
            suitable_found = False
            for ie in self._ies:
                if not ie.suitable(url):
                    continue
                # Suitable InfoExtractor found
                suitable_found = True
                all_results = ie.extract(url)
                # Extractors report a failed video as None
                results = [x for x in all_results if x is not None]
                if len(results) != len(all_results):
                    retcode = self.trouble()

                if len(results) > 1 and self.fixed_template():
                    raise SameFileError(self._params['outtmpl'])

                for result in results:
                    code = self._process_result(result)
                    if code:
                        retcode = code
                # Only the first suitable extractor handles a URL
                break
            if not suitable_found:
                retcode = self.trouble('ERROR: no suitable InfoExtractor: %s' % url)

        return retcode

    def _process_result(self, result):
        """Print and/or download one extracted video.

        Returns 0 on success, or the code returned by trouble() when a
        problem was found and errors are being ignored.
        """
        # Forced printings
        if self._params.get('forcetitle', False):
            print(result['title'])
        if self._params.get('forceurl', False):
            print(result['url'])

        # Do nothing else if in simulate mode
        if self._params.get('simulate', False):
            return 0

        try:
            filename = self._params['outtmpl'] % result
            self.to_stdout('[download] Destination: %s' % filename)
        except (ValueError, KeyError) as err:
            return self.trouble('ERROR: invalid output template: %s' % str(err))
        try:
            self.pmkdir(filename)
        except (OSError, IOError) as err:
            return self.trouble('ERROR: unable to create directories: %s' % str(err))
        try:
            outstream = open(filename, 'wb')
        except (OSError, IOError) as err:
            return self.trouble('ERROR: unable to open for writing: %s' % str(err))
        try:
            try:
                self._do_download(outstream, result['url'])
            finally:
                # Close the file on failure too, not only after success
                outstream.close()
        except (OSError, IOError) as err:
            return self.trouble('ERROR: unable to write video data: %s' % str(err))
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            return self.trouble('ERROR: unable to download video data: %s' % str(err))
        return 0

    def _do_download(self, stream, url):
        """Copy the resource at url into stream, reporting progress.

        Raises ValueError if the server announced a Content-length and a
        different number of bytes was received.
        """
        request = urllib2.Request(url, None, std_headers)
        data = urllib2.urlopen(request)
        try:
            # The header value is text; convert it once so that sizes are
            # compared as numbers. A missing or malformed header means the
            # total size is unknown.
            try:
                data_len = int(data.info().get('Content-length', None))
            except (TypeError, ValueError):
                data_len = None
            data_len_str = self.format_bytes(data_len)
            byte_counter = 0
            block_size = 1024
            start = time.time()
            while True:
                percent_str = self.calc_percent(byte_counter, data_len)
                eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
                speed_str = self.calc_speed(start, time.time(), byte_counter)
                self.to_stdout('\r[download] %s of %s at %s ETA %s' %
                               (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)

                before = time.time()
                data_block = data.read(block_size)
                after = time.time()
                data_block_len = len(data_block)
                if data_block_len == 0:
                    break
                byte_counter += data_block_len
                stream.write(data_block)
                block_size = self.best_block_size(after - before, data_block_len)
        finally:
            data.close()

        # Finish the progress line
        self.to_stdout('')
        if data_len is not None and byte_counter != data_len:
            raise ValueError('Content too short: %s/%s bytes' % (byte_counter, data_len))

class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information from the video (or videos) the URL refers to. This
    information includes the real video URL, the video title and simplified
    title, author and others. It is returned in a list of dictionaries when
    calling its extract() method. It is a list because a URL can refer to
    more than one video (think of playlists). The dictionaries must include
    the following fields:

    id:       Video identifier.
    url:      Final video URL.
    uploader: Nickname of the video uploader.
    title:    Literal title.
    stitle:   Simplified title.
    ext:      Video filename extension.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods, as well as the suitable() static method.
    Probably, they should also be instantiated and added to the main
    downloader.
    """

    # Class-level defaults so the attributes exist even if a subclass
    # constructor does not call this one.
    _ready = False
    _downloader = None

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @staticmethod
    def suitable(url):
        """Receives a URL and returns True if suitable for this IE."""
        return True

    def initialize(self):
        """Initializes an instance (authentication, etc).

        The real initialization runs only the first time this is called.
        """
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def to_stdout(self, message):
        """Print message to stdout if downloader is not in quiet mode."""
        if self._downloader is None or not self._downloader.get_params().get('quiet', False):
            print(message)

    def to_stderr(self, message):
        """Print message to stderr."""
        sys.stderr.write('%s\n' % message)

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses.

        The base implementation extracts nothing and returns an empty list,
        so that extract() always returns a list.
        """
        return []

class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    _LOGIN_URL = 'http://www.youtube.com/login?next=/'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/'
    _NETRC_MACHINE = 'youtube'

    @staticmethod
    def _htmlentity_transform(matchobj):
        """Return the character for one matched HTML entity.

        matchobj.group(1) is the entity without '&' and ';'. Named entities
        and decimal/hexadecimal character references are decoded; anything
        else is returned unchanged as '&...;' instead of raising.
        """
        entity = matchobj.group(1)
        # Known named entity
        if entity in htmlentitydefs.name2codepoint:
            return unichr(htmlentitydefs.name2codepoint[entity])
        # Numeric character reference such as &#39; or &#x27;
        mobj = re.match(u'(?u)#(x[0-9a-fA-F]+|[0-9]+)$', entity)
        if mobj is not None:
            number = mobj.group(1)
            try:
                if number.startswith(u'x'):
                    return unichr(int(number[1:], 16))
                return unichr(int(number))
            except (ValueError, OverflowError):
                # Code point not representable: keep the literal text
                pass
        return u'&%s;' % entity

    def _real_initialize(self):
        """Log in and confirm age, if credentials are available.

        Credentials come from the downloader parameters or from .netrc.
        Every failure is reported as a message on stderr and ends the
        initialization early; nothing is raised for them.
        """
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.get_params()

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    # authenticators() returns (login, account, password)
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self.to_stderr('WARNING: parsing .netrc: %s' % str(err))
                return

        # No authentication to be performed
        if username is None:
            return

        # Log in
        login_form = {
            'current_form': 'loginForm',
            'next': '/',
            'action_login': 'Log In',
            'username': username,
            'password': password,
        }
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
        try:
            self.to_stdout('[youtube] Logging in')
            login_results = urllib2.urlopen(request).read()
            # Being served the login form again means the login was rejected
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self.to_stderr('WARNING: unable to log in: bad username or password')
                return
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self.to_stderr('WARNING: unable to log in: %s' % str(err))
            return

        # Confirm age
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
        try:
            self.to_stdout('[youtube] Confirming age')
            urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self.to_stderr('ERROR: unable to confirm age: %s' % str(err))
            return

    def _real_extract(self, url):
        """Extract the information of the video at url.

        Returns a one-element list holding the information dictionary, or
        [None] (after printing a message to stderr) if any step fails.
        """
        # Extract video id from URL
        mobj = re.match(r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$', url)
        if mobj is None:
            self.to_stderr('ERROR: invalid URL: %s' % url)
            return [None]
        video_id = mobj.group(2)

        # Downloader parameters
        format_param = None
        if self._downloader is not None:
            params = self._downloader.get_params()
            format_param = params.get('format', None)

        # Extension
        video_extension = {'18': 'mp4', '17': '3gp'}.get(format_param, 'flv')

        # Normalize URL, including format
        normalized_url = 'http://www.youtube.com/watch?v=%s' % video_id
        if format_param is not None:
            normalized_url = '%s&fmt=%s' % (normalized_url, format_param)
        request = urllib2.Request(normalized_url, None, std_headers)
        try:
            self.to_stdout('[youtube] %s: Downloading video webpage' % video_id)
            video_webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self.to_stderr('ERROR: unable to download video webpage: %s' % str(err))
            return [None]
        self.to_stdout('[youtube] %s: Extracting video information' % video_id)

        # "t" param
        mobj = re.search(r', "t": "([^"]+)"', video_webpage)
        if mobj is None:
            self.to_stderr('ERROR: unable to extract "t" parameter')
            return [None]
        video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s' % (video_id, mobj.group(1))
        if format_param is not None:
            video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
        self.to_stdout('[youtube] %s: URL: %s' % (video_id, video_real_url))

        # Uploader
        mobj = re.search(r'More From: ([^<]*)<', video_webpage)
        if mobj is None:
            self.to_stderr('ERROR: unable to extract uploader nickname')
            return [None]
        video_uploader = mobj.group(1)

        # Title
        mobj = re.search(r'(?im)<title>YouTube - ([^<]*)</title>', video_webpage)
        if mobj is None:
            self.to_stderr('ERROR: unable to extract video title')
            return [None]
        video_title = mobj.group(1).decode('utf-8')
        # Match only well-formed entities so that a bare '&' followed later
        # by ';' in ordinary text is not taken for one.
        video_title = re.sub(u'(?u)&(#?[0-9A-Za-z]+);', self._htmlentity_transform, video_title)
        # The title may end up in a file name: keep it free of separators
        video_title = video_title.replace(os.sep, u'%')

        # Simplified title
        simple_title = re.sub(u'(?u)([^%s]+)' % simple_title_chars, u'_', video_title)
        simple_title = simple_title.strip(u'_')

        # Return information
        return [{
            'id': video_id,
            'url': video_real_url,
            'uploader': video_uploader,
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension,
        }]

def _build_option_parser():
    """Return the optparse parser describing the command-line options."""
    # Module needed only when running the main program
    import optparse

    # NOTE(review): the version string is not visible in this part of the
    # file; 'INTERNAL' is a placeholder -- confirm the intended value.
    parser = optparse.OptionParser(
        usage='Usage: %prog [options] url...',
        version='INTERNAL',
        conflict_handler='resolve',
    )
    parser.add_option('-h', '--help',
                      action='help', help='print this help text and exit')
    parser.add_option('-v', '--version',
                      action='version', help='print program version and exit')
    parser.add_option('-u', '--username',
                      dest='username', metavar='UN', help='account username')
    parser.add_option('-p', '--password',
                      dest='password', metavar='PW', help='account password')
    parser.add_option('-o', '--output',
                      dest='outtmpl', metavar='TPL', help='output filename template')
    parser.add_option('-q', '--quiet',
                      action='store_true', dest='quiet', help='activates quiet mode', default=False)
    parser.add_option('-s', '--simulate',
                      action='store_true', dest='simulate', help='do not download video', default=False)
    parser.add_option('-t', '--title',
                      action='store_true', dest='usetitle', help='use title in file name', default=False)
    parser.add_option('-l', '--literal',
                      action='store_true', dest='useliteral', help='use literal title in file name', default=False)
    parser.add_option('-n', '--netrc',
                      action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
    parser.add_option('-g', '--get-url',
                      action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
    parser.add_option('-e', '--get-title',
                      action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
    parser.add_option('-f', '--format',
                      dest='format', metavar='FMT', help='video format code')
    # The aliases must store into the same destination as -f: only
    # opts.format is handed to the downloader.
    parser.add_option('-b', '--best-quality',
                      action='store_const', dest='format', help='alias for -f 18', const='18')
    parser.add_option('-m', '--mobile-version',
                      action='store_const', dest='format', help='alias for -f 17', const='17')
    parser.add_option('-i', '--ignore-errors',
                      action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
    return parser


def _select_output_template(opts):
    """Return the output file name template implied by the parsed options."""
    if opts.outtmpl:
        return opts.outtmpl
    if opts.usetitle:
        return '%(stitle)s-%(id)s.%(ext)s'
    if opts.useliteral:
        return '%(title)s-%(id)s.%(ext)s'
    return '%(id)s.%(ext)s'


if __name__ == '__main__':
    try:
        # Module needed only when running the main program
        import getpass

        # General configuration
        # install_opener() replaces the previous opener, so both handlers
        # have to be given to a single one.
        urllib2.install_opener(urllib2.build_opener(
            urllib2.ProxyHandler(), urllib2.HTTPCookieProcessor()))
        socket.setdefaulttimeout(300)  # 5 minutes should be enough (famous last words)

        # Parse command line
        parser = _build_option_parser()
        (opts, args) = parser.parse_args()

        # Conflicting, missing and erroneous options
        if not args:
            sys.exit('ERROR: you must provide at least one URL')
        if opts.usenetrc and (opts.username is not None or opts.password is not None):
            sys.exit('ERROR: using .netrc conflicts with giving username/password')
        if opts.password is not None and opts.username is None:
            sys.exit('ERROR: account username missing')
        if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
            sys.exit('ERROR: using output template conflicts with using title or literal title')
        if opts.usetitle and opts.useliteral:
            sys.exit('ERROR: using title conflicts with using literal title')
        if opts.username is not None and opts.password is None:
            opts.password = getpass.getpass('Type account password and press return:')

        # Information extractors
        youtube_ie = YoutubeIE()

        # File downloader
        fd = FileDownloader({
            'usenetrc': opts.usenetrc,
            'username': opts.username,
            'password': opts.password,
            'quiet': (opts.quiet or opts.geturl or opts.gettitle),
            'forceurl': opts.geturl,
            'forcetitle': opts.gettitle,
            'simulate': (opts.simulate or opts.geturl or opts.gettitle),
            'format': opts.format,
            'outtmpl': _select_output_template(opts),
            'ignoreerrors': opts.ignoreerrors,
        })
        fd.add_info_extractor(youtube_ie)
        retcode = fd.download(args)
        sys.exit(retcode)

    except DownloadError:
        # The message was already printed when the error was reported
        sys.exit(1)
    except SameFileError:
        sys.exit('ERROR: fixed output name but more than one file to download')
    except KeyboardInterrupt:
        sys.exit('\nERROR: Interrupted by user')