2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # License: Public domain code
20 'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.1) Gecko/2008070208 Firefox/3.0.1',
21 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
22 'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
23 'Accept-Language': 'en-us,en;q=0.5',
26 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. It is raised with the same message
    that FileDownloader.trouble() was given (which may be None).
    """
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
45 class FileDownloader(object):
46 """File Downloader class.
48 File downloader objects are the ones responsible for downloading the
49 actual video file and writing it to disk if the user has requested
50 it, among some other tasks. In most cases there should be one per
51 program. As, given a video URL, the downloader doesn't know how to
52 extract all the needed information, task that InfoExtractors do, it
53 has to pass the URL to one of them.
55 For this, file downloader objects have a method that allows
56 InfoExtractors to be registered in a given order. When it is passed
57 a URL, the file downloader hands it to the first InfoExtractor it
58 finds that reports being able to handle it. The InfoExtractor returns
59 all the information to the FileDownloader and the latter downloads the
60 file or does whatever it's instructed to do.
62 File downloaders accept a lot of parameters. In order not to saturate
63 the object constructor with arguments, it receives a dictionary of
64 options instead. These options are available through the get_params()
65 method for the InfoExtractors to use. The FileDownloader also registers
66 itself as the downloader in charge for the InfoExtractors that are
67 added to it, so this is a "mutual registration".
71 username: Username for authentication purposes.
72 password: Password for authentication purposes.
73 usenetrc: Use netrc for authentication instead.
74 quiet: Do not print messages to stdout.
75 forceurl: Force printing final URL.
76 forcetitle: Force printing title.
77 simulate: Do not download the video files.
78 format: Video format code.
79 outtmpl: Template for output names.
80 ignoreerrors: Do not stop on download errors.
86 def __init__(self, params):
87 """Create a FileDownloader object with the given options."""
89 self.set_params(params)
93 """Create directory components in filename. Similar to Unix "mkdir -p"."""
94 components = filename.split(os.sep)
95 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
97 if not os.path.exists(dir):
# NOTE(review): this function is invoked as self.format_bytes(...) and
# FileDownloader.format_bytes(...) elsewhere in the file, so it is presumably
# registered as a static method -- confirm the decorator is in place.
def format_bytes(bytes):
    """Format a byte count as a short string such as '1.50k' or '3.00M'.

    bytes may be a number or a numeric string (for example the value of a
    Content-length header). None, meaning the size is unknown, yields 'N/A'.
    The value is scaled by powers of 1024 and suffixed with one character
    from 'bkMGTPEZY'.
    """
    if bytes is None:
        return 'N/A'
    suffixes = 'bkMGTPEZY'
    amount = float(bytes)
    if amount <= 0:
        # math.log() is undefined here; report the value unscaled.
        exponent = 0
    else:
        exponent = int(math.log(amount, 1024.0))
        # Values below 1 give a negative exponent and values past the last
        # suffix would overrun the table, so clamp to the valid index range.
        exponent = max(0, min(exponent, len(suffixes) - 1))
    converted = amount / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffixes[exponent])
def calc_percent(byte_counter, data_len):
    """Return download progress as a percentage string six characters wide.

    byte_counter is the number of bytes received so far and data_len the
    expected total (a number or numeric string). When the total is unknown
    (None) or zero, no percentage can be computed and a placeholder of the
    same width is returned instead.
    """
    # NOTE(review): placeholder text chosen to match the '%6s' column width;
    # confirm it is the wording wanted in the progress line.
    if data_len is None:
        return '---.-%'
    total = float(data_len)
    if total == 0:
        return '---.-%'
    ratio = float(byte_counter) / total
    return '%6s' % ('%3.1f%%' % (ratio * 100.0))
def calc_eta(start, now, total, current):
    """Estimate the remaining download time as an 'MM:SS' string.

    start and now are timestamps in seconds, total is the expected size
    (a number, a numeric string, or None when unknown) and current is the
    number of bytes received so far. The estimate assumes the average rate
    observed so far continues.
    """
    # NOTE(review): the placeholder returned when no estimate is possible was
    # chosen to match the MM:SS width -- confirm the expected text.
    if total is None:
        return '--:--'
    dif = now - start
    if current == 0 or dif < 0.001:  # One millisecond
        # Nothing received yet, or too little time elapsed to derive a rate.
        return '--:--'
    rate = float(current) / dif
    eta = int((float(total) - float(current)) / rate)
    (eta_mins, eta_secs) = divmod(eta, 60)
    return '%02d:%02d' % (eta_mins, eta_secs)
def calc_speed(start, now, bytes):
    """Return the average transfer speed as a string ten characters wide.

    start and now are timestamps in seconds and bytes is the amount
    received between them. When nothing has been received or less than a
    millisecond has elapsed, a '---b/s' placeholder is returned.
    """
    dif = now - start
    if bytes == 0 or dif < 0.001:  # One millisecond
        return '%10s' % '---b/s'
    per_second = float(bytes) / dif
    return '%10s' % ('%s/s' % FileDownloader.format_bytes(per_second))
140 def best_block_size(elapsed_time, bytes):
141 new_min = max(bytes / 2.0, 1.0)
142 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
143 if elapsed_time < 0.001:
145 rate = bytes / elapsed_time
def set_params(self, params):
    """Sets parameters.

    params must be a dictionary (dict subclasses are accepted); it is
    stored as-is, not copied. Raises ValueError for any other type.
    """
    if not isinstance(params, dict):
        raise ValueError('params: dictionary expected')
    self._params = params
def get_params(self):
    """Get parameters.

    Returns the dictionary previously stored by set_params(). Callers in
    this file use it directly, e.g. get_params().get('quiet', False).
    """
    # NOTE(review): the stored dictionary itself is returned, not a copy --
    # confirm no caller relies on getting an independent copy.
    return self._params
162 def add_info_extractor(self, ie):
163 """Add an InfoExtractor object to the end of the list."""
165 ie.set_downloader(self)
def to_stdout(self, message, skip_eol=False):
    """Print message to stdout if not in quiet mode.

    A newline is appended unless skip_eol is true. Nothing is written when
    the 'quiet' parameter is set.
    """
    if self._params.get('quiet', False):
        return
    if skip_eol:
        terminator = ''
    else:
        terminator = '\n'
    sys.stdout.write('%s%s' % (message, terminator))
    # Progress lines are written with skip_eol=True and a leading '\r', so
    # they never end in a newline; flush so they show up immediately.
    sys.stdout.flush()
def to_stderr(self, message):
    """Write message to stderr, followed by a newline."""
    line = '%s\n' % message
    sys.stderr.write(line)
def fixed_template(self):
    """Checks if the output template is fixed.

    Returns True when the 'outtmpl' parameter contains no %(name)s field,
    meaning every download would be written to the same file name.
    """
    template = self._params['outtmpl']
    # A plain raw string is used for the pattern: the 'ur' prefix is only
    # accepted by Python 2 and the pattern matches identically without it.
    field = re.search(r'(?u)%\(.+?\)s', template)
    return field is None
def trouble(self, message=None):
    """Determine action to take when a download problem appears.

    Depending on if the downloader has been configured to ignore
    download errors or not, this method may throw an exception or
    not when errors are found, after printing the message. If it
    doesn't raise, it returns an error code suitable to be returned
    later as a program exit code to indicate error.

    message, when given, is written to stderr first. DownloadError is
    raised with the same message unless the 'ignoreerrors' parameter is
    set.
    """
    if message is not None:
        self.to_stderr(message)
    if not self._params.get('ignoreerrors', False):
        raise DownloadError(message)
    # NOTE(review): 1 is used as the generic failure code -- confirm it
    # matches the exit codes the main program expects.
    return 1
196 def download(self, url_list):
197 """Download a given list of URLs."""
199 if len(url_list) > 1 and self.fixed_template():
200 raise SameFileError(self._params['outtmpl'])
203 suitable_found = False
205 if not ie.suitable(url):
207 # Suitable InfoExtractor found
208 suitable_found = True
209 all_results = ie.extract(url)
210 results = [x for x in all_results if x is not None]
211 if len(results) != len(all_results):
212 retcode = self.trouble()
214 if len(results) > 1 and self.fixed_template():
215 raise SameFileError(self._params['outtmpl'])
217 for result in results:
220 if self._params.get('forcetitle', False):
221 print result['title']
222 if self._params.get('forceurl', False):
225 # Do nothing else if in simulate mode
226 if self._params.get('simulate', False):
230 filename = self._params['outtmpl'] % result
231 except (ValueError, KeyError), err:
232 retcode = self.trouble('ERROR: invalid output template: %s' % str(err))
235 self.pmkdir(filename)
236 except (OSError, IOError), err:
237 retcode = self.trouble('ERROR: unable to create directories: %s' % str(err))
240 outstream = open(filename, 'wb')
241 except (OSError, IOError), err:
242 retcode = self.trouble('ERROR: unable to open for writing: %s' % str(err))
245 self._do_download(outstream, result['url'])
247 except (OSError, IOError), err:
248 retcode = self.trouble('ERROR: unable to write video data: %s' % str(err))
250 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
251 retcode = self.trouble('ERROR: unable to download video data: %s' % str(err))
254 if not suitable_found:
255 retcode = self.trouble('ERROR: no suitable InfoExtractor: %s' % url)
259 def _do_download(self, stream, url):
260 request = urllib2.Request(url, None, std_headers)
261 data = urllib2.urlopen(request)
262 data_len = data.info().get('Content-length', None)
263 data_len_str = self.format_bytes(data_len)
268 percent_str = self.calc_percent(byte_counter, data_len)
269 eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
270 speed_str = self.calc_speed(start, time.time(), byte_counter)
271 self.to_stdout('\r[download] %s of %s at %s ETA %s' %
272 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
275 data_block = data.read(block_size)
277 data_block_len = len(data_block)
278 if data_block_len == 0:
280 byte_counter += data_block_len
281 stream.write(data_block)
282 block_size = self.best_block_size(after - before, data_block_len)
285 if data_len is not None and str(byte_counter) != data_len:
286 raise ValueError('Content too short: %s/%s bytes' % (byte_counter, data_len))
288 class InfoExtractor(object):
289 """Information Extractor class.
291 Information extractors are the classes that, given a URL, extract
292 information from the video (or videos) the URL refers to. This
293 information includes the real video URL, the video title and simplified
294 title, author and others. It is returned in a list of dictionaries when
295 calling its extract() method. It is a list because a URL can refer to
296 more than one video (think of playlists). The dictionaries must include
297 the following fields:
299 id: Video identifier.
300 url: Final video URL.
301 uploader: Nickname of the video uploader.
302 title: Literal title.
303 stitle: Simplified title.
304 ext: Video filename extension.
306 Subclasses of this one should re-define the _real_initialize() and
307 _real_extract() methods, as well as the suitable() static method.
308 Probably, they should also be instantiated and added to the main
315 def __init__(self, downloader=None):
316 """Constructor. Receives an optional downloader."""
318 self.set_downloader(downloader)
322 """Receives a URL and returns True if suitable for this IE."""
325 def initialize(self):
326 """Initializes an instance (authentication, etc)."""
328 self._real_initialize()
331 def extract(self, url):
332 """Extracts URL information and returns it in list of dicts."""
334 return self._real_extract(url)
def set_downloader(self, downloader):
    """Remember downloader as the downloader in charge of this IE.

    None is accepted and means no downloader is attached.
    """
    self._downloader = downloader
def to_stdout(self, message):
    """Print message to stdout if downloader is not in quiet mode.

    With no downloader attached the message is always printed; otherwise
    the downloader's 'quiet' parameter decides.
    """
    downloader = self._downloader
    if downloader is not None and downloader.get_params().get('quiet', False):
        return
    # NOTE(review): written with sys.stdout.write to mirror to_stderr();
    # confirm a trailing newline is the intended terminator.
    sys.stdout.write('%s\n' % message)
def to_stderr(self, message):
    """Write message to stderr, followed by a newline."""
    line = '%s\n' % message
    sys.stderr.write(line)
349 def _real_initialize(self):
350 """Real initialization process. Redefine in subclasses."""
353 def _real_extract(self, url):
354 """Real extraction process. Redefine in subclasses."""
357 class YoutubeIE(InfoExtractor):
358 """Information extractor for youtube.com."""
360 _LOGIN_URL = 'http://www.youtube.com/login?next=/'
361 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/'
362 _NETRC_MACHINE = 'youtube'
364 def _real_initialize(self):
365 if self._downloader is None:
370 downloader_params = self._downloader.get_params()
372 # Attempt to use provided username and password or .netrc data
373 if downloader_params.get('username', None) is not None:
374 username = downloader_params['username']
375 password = downloader_params['password']
376 elif downloader_params.get('usenetrc', False):
378 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
383 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
384 except (IOError, netrc.NetrcParseError), err:
385 self.to_stderr('WARNING: parsing .netrc: %s' % str(err))
388 # No authentication to be performed
394 'current_form': 'loginForm',
396 'action_login': 'Log In',
397 'username': username,
398 'password': password,
400 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
402 self.to_stdout('[youtube] Logging in')
403 login_results = urllib2.urlopen(request).read()
404 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
405 self.to_stderr('WARNING: unable to log in: bad username or password')
407 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
408 self.to_stderr('WARNING: unable to log in: %s' % str(err))
414 'action_confirm': 'Confirm',
416 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
418 self.to_stdout('[youtube] Confirming age')
419 age_results = urllib2.urlopen(request).read()
420 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
421 self.to_stderr('ERROR: unable to confirm age: %s' % str(err))
424 def _real_extract(self, url):
425 # Extract video id from URL
426 mobj = re.match(r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$', url)
428 self.to_stderr('ERROR: invalid URL: %s' % url)
430 video_id = mobj.group(2)
432 # Downloader parameters
434 if self._downloader is not None:
435 params = self._downloader.get_params()
436 format_param = params.get('format', None)
439 video_extension = {'18': 'mp4'}.get(format_param, 'flv')
441 # Normalize URL, including format
442 normalized_url = 'http://www.youtube.com/watch?v=%s' % video_id
443 if format_param is not None:
444 normalized_url = '%s&fmt=%s' % (normalized_url, format_param)
445 request = urllib2.Request(normalized_url, None, std_headers)
447 self.to_stdout('[youtube] %s: Downloading video webpage' % video_id)
448 video_webpage = urllib2.urlopen(request).read()
449 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
450 self.to_stderr('ERROR: unable to download video webpage: %s' % str(err))
452 self.to_stdout('[youtube] %s: Extracting video information' % video_id)
455 mobj = re.search(r', "t": "([^"]+)"', video_webpage)
457 self.to_stderr('ERROR: unable to extract "t" parameter')
459 video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s' % (video_id, mobj.group(1))
460 if format_param is not None:
461 video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
462 self.to_stdout('[youtube] %s: URL: %s' % (video_id, video_real_url))
465 mobj = re.search(r'More From: ([^<]*)<', video_webpage)
467 self.to_stderr('ERROR: unable to extract uploader nickname')
469 video_uploader = mobj.group(1)
472 mobj = re.search(r'(?im)<title>YouTube - ([^<]*)</title>', video_webpage)
474 self.to_stderr('ERROR: unable to extract video title')
476 video_title = mobj.group(1).decode('utf-8')
477 video_title = re.sub(ur'(?u)&(.+?);', lambda x: unichr(htmlentitydefs.name2codepoint[x.group(1)]), video_title)
478 video_title = video_title.replace(os.sep, u'%')
481 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
482 simple_title = simple_title.strip(ur'_')
487 'url': video_real_url,
488 'uploader': video_uploader,
489 'title': video_title,
490 'stitle': simple_title,
491 'ext': video_extension,
494 if __name__ == '__main__':
496 # Modules needed only when running the main program
500 # General configuration
501 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
502 urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
503 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
506 parser = optparse.OptionParser(
507 usage='Usage: %prog [options] url...',
509 conflict_handler='resolve',
511 parser.add_option('-h', '--help',
512 action='help', help='print this help text and exit')
513 parser.add_option('-v', '--version',
514 action='version', help='print program version and exit')
515 parser.add_option('-u', '--username',
516 dest='username', metavar='UN', help='account username')
517 parser.add_option('-p', '--password',
518 dest='password', metavar='PW', help='account password')
519 parser.add_option('-o', '--output',
520 dest='outtmpl', metavar='TPL', help='output filename template')
521 parser.add_option('-q', '--quiet',
522 action='store_true', dest='quiet', help='activates quiet mode', default=False)
523 parser.add_option('-s', '--simulate',
524 action='store_true', dest='simulate', help='do not download video', default=False)
525 parser.add_option('-t', '--title',
526 action='store_true', dest='usetitle', help='use title in file name', default=False)
527 parser.add_option('-l', '--literal',
528 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
529 parser.add_option('-n', '--netrc',
530 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
531 parser.add_option('-g', '--get-url',
532 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
533 parser.add_option('-e', '--get-title',
534 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
535 parser.add_option('-f', '--format',
536 dest='format', metavar='FMT', help='video format code')
537 parser.add_option('-b', '--best-quality',
538 action='store_const', dest='video_format', help='alias for -f 18', const='18')
539 parser.add_option('-i', '--ignore-errors',
540 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
541 (opts, args) = parser.parse_args()
543 # Conflicting, missing and erroneous options
545 sys.exit('ERROR: you must provide at least one URL')
546 if opts.usenetrc and (opts.username is not None or opts.password is not None):
547 sys.exit('ERROR: using .netrc conflicts with giving username/password')
548 if opts.password is not None and opts.username is None:
549 sys.exit('ERROR: account username missing')
550 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
551 sys.exit('ERROR: using output template conflicts with using title or literal title')
552 if opts.usetitle and opts.useliteral:
553 sys.exit('ERROR: using title conflicts with using literal title')
554 if opts.username is not None and opts.password is None:
555 opts.password = getpass.getpass('Type account password and press return:')
557 # Information extractors
558 youtube_ie = YoutubeIE()
561 fd = FileDownloader({
562 'usenetrc': opts.usenetrc,
563 'username': opts.username,
564 'password': opts.password,
565 'quiet': (opts.quiet or opts.geturl or opts.gettitle),
566 'forceurl': opts.geturl,
567 'forcetitle': opts.gettitle,
568 'simulate': (opts.simulate or opts.geturl or opts.gettitle),
569 'format': opts.format,
570 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl)
571 or (opts.usetitle and '%(stitle)s-%(id)s.%(ext)s')
572 or (opts.useliteral and '%(title)s-%(id)s.%(ext)s')
573 or '%(id)s.%(ext)s'),
574 'ignoreerrors': opts.ignoreerrors,
576 fd.add_info_extractor(youtube_ie)
577 retcode = fd.download(args)
580 except DownloadError:
582 except SameFileError:
583 sys.exit('ERROR: fixed output name but more than one file to download')
584 except KeyboardInterrupt:
585 sys.exit('\nERROR: Interrupted by user')