2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # License: Public domain code
# Default HTTP headers attached to every request issued by this module.
# The User-Agent value must be the bare product string: the header name is
# supplied by the dictionary key, so repeating it inside the value would send
# a malformed "User-Agent: UserAgent: ..." header.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9) Gecko/2008052906 Firefox/3.0',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
    'Accept-Language': 'en-us,en;q=0.5',
}
# Characters that may appear in a simplified title (ASCII letters and digits).
# Built by concatenating onto a unicode literal instead of calling
# str.decode(), which yields the same unicode value without relying on a
# method that only byte strings have.
simple_title_chars = u'' + string.ascii_letters + string.digits
class FileDownloader(object):
    """File Downloader class.

    File downloader objects are the ones responsible for downloading the
    actual video file and writing it to disk if the user has requested
    it, among some other tasks. In most cases there should be one per
    program. As, given a video URL, the downloader doesn't know how to
    extract all the needed information, task that InfoExtractors do, it
    has to pass the URL to one of them.

    For this, file downloader objects have a method that allows
    InfoExtractors to be registered in a given order. When it is passed
    a URL, the file downloader hands it to the first InfoExtractor it
    finds that reports being able to handle it. The InfoExtractor returns
    all the information to the FileDownloader and the latter downloads the
    file or does whatever it's instructed to do.

    File downloaders accept a lot of parameters. In order not to saturate
    the object constructor with arguments, it receives a dictionary of
    options instead. These options are available through the get_params()
    method for the InfoExtractors to use. The FileDownloader also registers
    itself as the downloader in charge for the InfoExtractors that are
    added to it, so this is a "mutual registration".

    Available options:

    username:   Username for authentication purposes.
    password:   Password for authentication purposes.
    usenetrc:   Use netrc for authentication instead.
    quiet:      Do not print messages to stdout.
    format:     Video format code.
    outtmpl:    Template for output names.
    """

    # Unit suffixes indexed by the power of 1024 they stand for.
    _SIZE_SUFFIXES = 'bkMGTPEZY'

    def __init__(self, params):
        """Create a downloader configured by the params dictionary.

        Raises ValueError if params is not a dictionary.
        """
        # Created per instance so registered extractors are never shared
        # between two downloaders.
        self._ies = []
        self.set_params(params)

    @staticmethod
    def pmkdir(filename):
        """Create directory components in filename. Similar to Unix "mkdir -p".

        Only the directories leading to filename are created, never the
        final component. Raises OSError if a directory cannot be created.
        """
        directory = os.path.dirname(filename)
        # Nothing to do for bare file names. Going through dirname() also
        # avoids attempting to create the empty first component that
        # splitting an absolute path on os.sep produces.
        if not directory or os.path.isdir(directory):
            return
        try:
            os.makedirs(directory)
        except OSError:
            # Tolerate the directory appearing between the check and the call.
            if not os.path.isdir(directory):
                raise

    @staticmethod
    def format_bytes(bytes):
        """Return a byte count as a short human-readable string.

        bytes may be None (rendered as 'N/A'), a number, or a numeric
        string such as a Content-Length header value. Values below 1024
        are printed unchanged with a 'b' suffix; larger ones are scaled by
        powers of 1024 and printed with two decimals.
        """
        if bytes is None:
            return 'N/A'
        suffixes = FileDownloader._SIZE_SUFFIXES
        value = float(bytes)
        if value < 1.0:
            # log() is undefined for 0 and negative below 1; both would
            # otherwise fail or select a suffix from the wrong end.
            exponent = 0
        else:
            # Clamp so enormous values reuse the largest known suffix.
            exponent = min(int(math.log(value, 1024.0)), len(suffixes) - 1)
        suffix = suffixes[exponent]
        if exponent == 0:
            return '%s%s' % (bytes, suffix)
        converted = value / float(1024 ** exponent)
        return '%.2f%s' % (converted, suffix)

    @staticmethod
    def calc_percent(byte_counter, data_len):
        """Return the completed percentage as a 6-character string.

        A placeholder is returned when the total length is unknown (None)
        or not positive, since no ratio can be computed.
        """
        if data_len is None:
            return '---.-%'
        total = float(data_len)
        if total <= 0:
            return '---.-%'
        return '%6s' % ('%3.1f%%' % (float(byte_counter) / total * 100.0))

    @staticmethod
    def calc_eta(start, now, total, current):
        """Return the estimated remaining time as 'MM:SS'.

        start and now are timestamps in seconds; total and current are
        byte counts. '--:--' is returned when the total is unknown, no
        measurable progress has been made, or the estimate exceeds 99
        minutes (it would not fit the two-digit field).
        """
        if total is None:
            return '--:--'
        dif = now - start
        if current == 0 or dif < 0.001:  # One millisecond
            return '--:--'
        rate = float(current) / dif
        # Never report a negative ETA if more data than announced arrived.
        remaining = max(float(total) - float(current), 0.0)
        eta = int(remaining / rate)
        (eta_mins, eta_secs) = divmod(eta, 60)
        if eta_mins > 99:
            return '--:--'
        return '%02d:%02d' % (eta_mins, eta_secs)

    @staticmethod
    def calc_speed(start, now, bytes):
        """Return the average transfer speed as a 10-character string."""
        dif = now - start
        if bytes == 0 or dif < 0.001:  # One millisecond
            # Same width as the regular case so the progress line does not
            # shift between updates.
            return '%10s' % 'N/A b/s'
        return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))

    @staticmethod
    def best_block_size(elapsed_time, bytes):
        """Return the size in bytes to request for the next read.

        The result follows the rate observed for the last block (bytes
        read in elapsed_time seconds), limited to between half and double
        the last block size and never above 4 MB.
        """
        new_min = max(bytes / 2.0, 1.0)
        new_max = min(max(bytes * 2.0, 1.0), 4194304)  # Do not surpass 4 MB
        if elapsed_time < 0.001:
            return int(new_max)
        rate = bytes / elapsed_time
        if rate > new_max:
            return int(new_max)
        if rate < new_min:
            return int(new_min)
        return int(rate)

    def set_params(self, params):
        """Sets parameters.

        Raises ValueError if params is not a dictionary (subclasses of
        dict are accepted).
        """
        if not isinstance(params, dict):
            raise ValueError('params: dictionary expected')
        self._params = params

    def get_params(self):
        """Get parameters."""
        return self._params

    def add_info_extractor(self, ie):
        """Add an InfoExtractor object to the end of the list."""
        self._ies.append(ie)
        ie.set_downloader(self)

    def download(self, url_list):
        """Download a given list of URLs.

        Each URL is handled by the first registered InfoExtractor that
        reports it as suitable. Per-file failures are written to stderr and
        do not stop the remaining downloads. The process exits if several
        files would be written to one fixed output name.
        """
        for url in url_list:
            suitable_found = False
            for ie in self._ies:
                if not ie.suitable(url):
                    continue
                # Suitable InfoExtractor found
                suitable_found = True
                results = [x for x in ie.extract(url) if x is not None]

                if (len(url_list) > 1 or len(results) > 1) and re.search(r'%\(.+?\)s', self._params['outtmpl']) is None:
                    sys.exit('ERROR: fixed output name but more than one file to download')

                for result in results:
                    self._download_result(result)
                # Only the first suitable extractor handles a URL.
                break
            if not suitable_found:
                sys.stderr.write('ERROR: no suitable InfoExtractor: %s\n' % url)

    def _download_result(self, result):
        """Write the video described by one result dictionary to disk.

        Every failure is reported on stderr and makes the method return
        early; nothing is raised for the error types handled here.
        """
        try:
            filename = self._params['outtmpl'] % result
        except KeyError as err:
            sys.stderr.write('ERROR: invalid output template: %s\n' % str(err))
            return
        try:
            self.pmkdir(filename)
        except (OSError, IOError) as err:
            sys.stderr.write('ERROR: unable to create directories: %s\n' % str(err))
            return
        try:
            outstream = open(filename, 'wb')
        except (OSError, IOError) as err:
            sys.stderr.write('ERROR: unable to open for writing: %s\n' % str(err))
            return
        try:
            try:
                self._do_download(outstream, result['url'])
            finally:
                # Release the file handle whether or not the transfer worked.
                outstream.close()
        # Network errors are matched first: urllib2.URLError derives from
        # IOError, so the generic I/O handler below would otherwise claim
        # them and report a misleading "unable to write" message.
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            sys.stderr.write('ERROR: unable to download video data: %s\n' % str(err))
        except (OSError, IOError) as err:
            sys.stderr.write('ERROR: unable to write video data: %s\n' % str(err))
        except ValueError as err:
            # Raised by _do_download() when the transfer ends prematurely.
            sys.stderr.write('ERROR: unable to download video data: %s\n' % str(err))

    def _do_download(self, stream, url):
        """Fetch url and copy the response body into stream.

        Progress is written to stdout unless the 'quiet' option is set.
        Raises ValueError if the number of bytes received differs from the
        Content-Length announced by the server; network and I/O errors
        propagate to the caller.
        """
        request = urllib2.Request(url, None, std_headers)
        data = urllib2.urlopen(request)
        try:
            # The header arrives as text; normalise it once so the size can
            # be compared numerically. A missing or unparsable value means
            # the length is unknown.
            try:
                data_len = int(data.info().get('Content-length', None))
            except (TypeError, ValueError):
                data_len = None
            data_len_str = self.format_bytes(data_len)
            quiet = self._params.get('quiet', False)
            byte_counter = 0
            block_size = 1024
            start = time.time()
            while True:
                if not quiet:
                    percent_str = self.calc_percent(byte_counter, data_len)
                    eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
                    speed_str = self.calc_speed(start, time.time(), byte_counter)
                    sys.stdout.write('\r[download] %s of %s at %s ETA %s' %
                                     (percent_str, data_len_str, speed_str, eta_str))
                    sys.stdout.flush()

                before = time.time()
                data_block = data.read(block_size)
                after = time.time()
                data_block_len = len(data_block)
                if data_block_len == 0:
                    break
                byte_counter += data_block_len
                stream.write(data_block)
                # Adapt the next read to the throughput just observed.
                block_size = self.best_block_size(after - before, data_block_len)

            if not quiet:
                # Terminate the progress line, which is redrawn with '\r'.
                sys.stdout.write('\n')
        finally:
            data.close()

        if data_len is not None and byte_counter != data_len:
            raise ValueError('Content too short: %s/%s bytes' % (byte_counter, data_len))
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information from the video (or videos) the URL refers to. This
    information includes the real video URL, the video title and simplified
    title, author and others. It is returned in a list of dictionaries when
    calling its extract() method. It is a list because a URL can refer to
    more than one video (think of playlists). The dictionaries must include
    the following fields:

    id:         Video identifier.
    url:        Final video URL.
    uploader:   Nickname of the video uploader.
    title:      Literal title.
    stitle:     Simplified title.
    ext:        Video filename extension.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods, as well as the suitable() static method.
    Probably, they should also be instantiated and added to the main
    downloader.
    """

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        # Tracks whether _real_initialize() has already run.
        self._ready = False
        self.set_downloader(downloader)

    @staticmethod
    def suitable(url):
        """Receives a URL and returns True if suitable for this IE."""
        return True

    def initialize(self):
        """Initializes an instance (login, etc).

        The real initialization runs at most once per instance, however
        many times this method is called.
        """
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def to_stdout(self, message):
        """Print message to stdout unless the downloader is in quiet mode."""
        if self._downloader is None or not self._downloader.get_params().get('quiet', False):
            sys.stdout.write('%s\n' % message)

    def to_stderr(self, message):
        """Print message to stderr."""
        sys.stderr.write('%s\n' % message)

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses.

        The base implementation finds nothing. It returns an empty list
        rather than None so that callers iterating over the result of
        extract() keep working with an extractor that was not specialised.
        """
        return []
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    _LOGIN_URL = 'http://www.youtube.com/login?next=/'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/'
    _NETRC_MACHINE = 'youtube'

    @staticmethod
    def _decode_entity(match):
        """Return the character for one matched HTML entity.

        Handles named (&amp;), decimal (&#39;) and hexadecimal (&#x27;)
        references. Text that is not a known entity is returned unchanged
        instead of raising, so an unusual title cannot abort extraction.
        """
        entity = match.group(1)
        try:
            if entity[:2] in (u'#x', u'#X'):
                return unichr(int(entity[2:], 16))
            if entity[:1] == u'#':
                return unichr(int(entity[1:]))
            return unichr(htmlentitydefs.name2codepoint[entity])
        except (KeyError, ValueError, OverflowError):
            return match.group(0)

    def _real_initialize(self):
        """Log in and confirm age when credentials are available.

        Credentials come from the downloader's 'username'/'password'
        options or, if 'usenetrc' is set, from the .netrc entry for the
        'youtube' machine. Login problems only produce a warning; a failed
        age confirmation terminates the process.
        """
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.get_params()

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is None:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
                # authenticators() returns (login, account, password).
                username = info[0]
                password = info[2]
            except (IOError, netrc.NetrcParseError) as err:
                self.to_stderr('WARNING: parsing .netrc: %s' % str(err))
                return

        # No credentials available: proceed anonymously.
        if username is None:
            return

        # Log in
        # NOTE(review): the 'next' field mirrors the next=/ query of
        # _LOGIN_URL; confirm it against the form the site expects.
        login_form = {
            'current_form': 'loginForm',
            'next': '/',
            'action_login': 'Log In',
            'username': username,
            'password': password,
        }
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
        try:
            self.to_stdout('[youtube] Logging in')
            login_results = urllib2.urlopen(request).read()
            # Being served the login form again means the attempt failed.
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self.to_stderr('WARNING: Unable to log in: bad username or password')
                return
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self.to_stderr('WARNING: Unable to log in: %s' % str(err))
            return

        # Confirm age
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
        try:
            self.to_stdout('[youtube] Confirming age')
            # Only the request matters; the response body is not used.
            urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            sys.exit('ERROR: Unable to confirm age: %s' % str(err))

    def _real_extract(self, url):
        """Return a one-element list describing the video at url.

        The element is a dictionary with the keys 'id', 'url', 'uploader',
        'title', 'stitle' and 'ext', or None when the URL is invalid or a
        required field cannot be found in the page (an error is written to
        stderr in that case). The process exits if the video page cannot
        be downloaded.
        """
        # Extract video id from URL
        mobj = re.match(r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$', url)
        if mobj is None:
            self.to_stderr('ERROR: Invalid URL: %s' % url)
            return [None]
        video_id = mobj.group(2)

        # Downloader parameters
        format_param = None
        if self._downloader is not None:
            params = self._downloader.get_params()
            format_param = params.get('format', None)

        # Extension. The format code is compared as text so that both 18
        # and '18' select mp4.
        video_extension = {'18': 'mp4'}.get(str(format_param), 'flv')

        # Normalize URL, including format
        normalized_url = 'http://www.youtube.com/watch?v=%s' % video_id
        if format_param is not None:
            normalized_url = '%s&fmt=%s' % (normalized_url, format_param)
        request = urllib2.Request(normalized_url, None, std_headers)
        try:
            self.to_stdout('[youtube] %s: Downloading video webpage' % video_id)
            video_webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            sys.exit('ERROR: Unable to download video: %s' % str(err))
        self.to_stdout('[youtube] %s: Extracting video information' % video_id)

        # "t" param
        mobj = re.search(r', "t": "([^"]+)"', video_webpage)
        if mobj is None:
            self.to_stderr('ERROR: Unable to extract "t" parameter')
            return [None]
        video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s' % (video_id, mobj.group(1))
        if format_param is not None:
            video_real_url = '%s&fmt=%s' % (video_real_url, format_param)

        # Uploader
        mobj = re.search(r'More From: ([^<]*)<', video_webpage)
        if mobj is None:
            self.to_stderr('ERROR: Unable to extract uploader nickname')
            return [None]
        video_uploader = mobj.group(1)

        # Title
        mobj = re.search(r'(?im)<title>YouTube - ([^<]*)</title>', video_webpage)
        if mobj is None:
            self.to_stderr('ERROR: Unable to extract video title')
            return [None]
        video_title = mobj.group(1).decode('utf-8')
        # Only well-formed references are considered, so a stray '&' followed
        # later by ';' is left untouched.
        video_title = re.sub(u'&(#?[0-9A-Za-z]+);', self._decode_entity, video_title)

        # Simplified title: collapse every run of other characters into '_'
        simple_title = re.sub(u'([^%s]+)' % simple_title_chars, u'_', video_title)
        simple_title = simple_title.strip(u'_')

        # Return information
        return [{
            'id': video_id,
            'url': video_real_url,
            'uploader': video_uploader,
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension,
        }]
if __name__ == '__main__':
    try:
        # General configuration
        # install_opener() replaces the global opener, so proxy and cookie
        # support must be combined in a single opener; installing them one
        # after the other would keep only the last.
        urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), urllib2.HTTPCookieProcessor()))

        # Information extractors
        youtube_ie = YoutubeIE()

        # File downloader
        fd = FileDownloader({
            'usenetrc': False,
            'username': None,
            'password': None,
            'quiet': False,
            'format': None,
            'outtmpl': '%(id)s.%(ext)s',
        })
        fd.add_info_extractor(youtube_ie)
        fd.download([
            'http://www.youtube.com/watch?v=t7qdwI7TVe8',
            'http://www.youtube.com/watch?v=IJyn3pRcy_Q',
            'http://www.youtube.com/watch?v=DZRXe1wtC-M',
        ])

    except KeyboardInterrupt:
        sys.exit('\nERROR: Interrupted by user')