2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # License: Public domain code
# HTTP headers passed to every urllib2.Request built in this file. They
# describe a desktop Firefox 3.0 browser. The User-Agent value must be the
# bare product string: the header name is already supplied by the key.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9) Gecko/2008052906 Firefox/3.0',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
    'Accept-Language': 'en-us,en;q=0.5',
}

# Characters kept verbatim when building a simplified title: ASCII letters
# followed by digits, as a text (unicode) string.
simple_title_chars = u'%s%s' % (string.ascii_letters, string.digits)
class FileDownloader(object):
    """File Downloader class.

    File downloader objects are the ones responsible for downloading the
    actual video file and writing it to disk if the user has requested
    it, among some other tasks. In most cases there should be one per
    program. Given a video URL, the downloader doesn't know how to
    extract all the needed information (that is the task of the
    InfoExtractors), so it has to pass the URL to one of them.

    For this, file downloader objects have a method that allows
    InfoExtractors to be registered in a given order. When it is passed
    a URL, the file downloader hands it to the first InfoExtractor it
    finds that reports being able to handle it. The InfoExtractor returns
    all the information to the FileDownloader and the latter downloads the
    file or does whatever it's instructed to do.

    File downloaders accept a lot of parameters. In order not to saturate
    the object constructor with arguments, it receives a dictionary of
    options instead. These options are available through the get_params()
    method for the InfoExtractors to use. The FileDownloader also registers
    itself as the downloader in charge for the InfoExtractors that are
    added to it, so this is a "mutual registration".

    Available options:

    username:   Username for authentication purposes.
    password:   Password for authentication purposes.
    usenetrc:   Use netrc for authentication instead.
    quiet:      Do not print messages to stdout.
    format:     Video format code.
    outtmpl:    Template for output names.
    """

    def __init__(self, params):
        """Create a downloader configured by the params dictionary."""
        self._ies = []
        self.set_params(params)

    @staticmethod
    def pmkdir(filename):
        """Create directory components in filename. Similar to Unix "mkdir -p".

        Only the directories leading up to the last path component are
        created; the last component is taken to be the file itself.
        Raises OSError if a directory cannot be created.
        """
        # Working from the directory part (instead of joining the split
        # components) keeps absolute paths working: their first split
        # component is the empty string, which cannot be passed to mkdir.
        directory = os.path.dirname(filename)
        if directory and not os.path.exists(directory):
            os.makedirs(directory)

    @staticmethod
    def format_bytes(bytes):
        """Return a human-readable size such as '1.50k', or 'N/A' for None.

        Accepts numbers or numeric strings (header values arrive as text).
        """
        if bytes is None:
            return 'N/A'
        suffixes = 'bkMGTPEZY'
        amount = float(bytes)
        if amount <= 0.0:
            exponent = 0
        else:
            exponent = int(math.log(amount, 1024.0))
            # Clamp so that tiny fractions do not index from the end of the
            # suffix table and huge values do not run past it.
            exponent = max(0, min(exponent, len(suffixes) - 1))
        converted = amount / float(1024 ** exponent)
        return '%.2f%s' % (converted, suffixes[exponent])

    @staticmethod
    def calc_percent(byte_counter, data_len):
        """Return the progress as a 6-character percentage string.

        A placeholder is returned when the total length is unknown (None)
        or zero.
        """
        if data_len is None:
            return '---.-%'
        total = float(data_len)
        if total == 0.0:
            return '---.-%'
        return '%6s' % ('%3.1f%%' % (float(byte_counter) / total * 100.0))

    @staticmethod
    def calc_eta(start, now, total, current):
        """Return the estimated remaining time as 'MM:SS', or '--:--'.

        The placeholder is used when the total is unknown, when nothing
        measurable has been transferred yet, or when the estimate exceeds
        99 minutes.
        """
        if total is None:
            return '--:--'
        dif = now - start
        if current == 0 or dif < 0.001:  # One millisecond
            return '--:--'
        rate = float(current) / dif
        # Never report a negative time if more data than announced arrived.
        eta = max(int((float(total) - float(current)) / rate), 0)
        eta_mins, eta_secs = divmod(eta, 60)
        if eta_mins > 99:
            return '--:--'
        return '%02d:%02d' % (eta_mins, eta_secs)

    @staticmethod
    def calc_speed(start, now, bytes):
        """Return the average transfer speed as a 10-character string."""
        dif = now - start
        if bytes == 0 or dif < 0.001:  # One millisecond
            return '%10s' % '---b/s'
        return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))

    @staticmethod
    def best_block_size(elapsed_time, bytes):
        """Return the size of the next read given how the last one went.

        The result is the observed rate (bytes per second), limited to
        between half and double the size of the last block and never
        above 4 MB.
        """
        new_min = max(bytes / 2.0, 1.0)
        new_max = min(max(bytes * 2.0, 1.0), 4194304)  # Do not surpass 4 MB
        if elapsed_time < 0.001:
            return int(new_max)
        rate = bytes / elapsed_time
        if rate > new_max:
            return int(new_max)
        if rate < new_min:
            return int(new_min)
        return int(rate)

    def set_params(self, params):
        """Sets parameters. Raises ValueError unless params is a dictionary."""
        if not isinstance(params, dict):
            raise ValueError('params: dictionary expected')
        self._params = params

    def get_params(self):
        """Get parameters."""
        return self._params

    def add_info_extractor(self, ie):
        """Add an InfoExtractor object to the end of the list."""
        self._ies.append(ie)
        ie.set_downloader(self)

    def to_stdout(self, message, skip_eol=False):
        """Print message to stdout if not in quiet mode."""
        if not self._params.get('quiet', False):
            sys.stdout.write('%s%s' % (message, '' if skip_eol else '\n'))
            # Progress lines have no newline, so flush to make them visible.
            sys.stdout.flush()

    def download(self, url_list):
        """Download a given list of URLs.

        Each URL is given to the first registered InfoExtractor that
        reports it as suitable. Per-video problems are written to stderr
        and the next video is tried. The program exits if a fixed output
        name would be shared by more than one file.
        """
        for url in url_list:
            suitable_found = False
            for ie in self._ies:
                if not ie.suitable(url):
                    continue
                # Suitable InfoExtractor found
                suitable_found = True
                results = [x for x in ie.extract(url) if x is not None]

                if (len(url_list) > 1 or len(results) > 1) and re.search(r'%\(.+?\)s', self._params['outtmpl']) is None:
                    sys.exit('ERROR: fixed output name but more than one file to download')

                for result in results:
                    self._save_result(result)
                break
            if not suitable_found:
                sys.stderr.write('ERROR: no suitable InfoExtractor: %s\n' % url)

    def _save_result(self, result):
        """Download one extracted video into the file named by the template."""
        try:
            filename = self._params['outtmpl'] % result
        except KeyError as err:
            sys.stderr.write('ERROR: invalid output template: %s\n' % str(err))
            return
        try:
            self.pmkdir(filename)
        except (OSError, IOError) as err:
            sys.stderr.write('ERROR: unable to create directories: %s\n' % str(err))
            return
        try:
            outstream = open(filename, 'wb')
        except (OSError, IOError) as err:
            sys.stderr.write('ERROR: unable to open for writing: %s\n' % str(err))
            return
        try:
            try:
                self._do_download(outstream, result['url'])
            finally:
                # Close the file on failure too, not only on success.
                outstream.close()
        # Network errors are checked first: urllib2.URLError and socket.error
        # derive from IOError, so the generic clause would otherwise claim
        # them and report a write failure.
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            sys.stderr.write('ERROR: unable to download video data: %s\n' % str(err))
        except (OSError, IOError) as err:
            sys.stderr.write('ERROR: unable to write video data: %s\n' % str(err))

    def _do_download(self, stream, url):
        """Copy the body of url into stream while printing progress.

        Raises ValueError if the number of bytes received differs from
        the announced Content-length.
        """
        request = urllib2.Request(url, None, std_headers)
        data = urllib2.urlopen(request)
        try:
            # The header is text; convert it once so that all later
            # arithmetic and comparisons are numeric. A missing or
            # malformed value means the length is unknown.
            try:
                data_len = int(data.info().get('Content-length', None))
            except (TypeError, ValueError):
                data_len = None
            data_len_str = self.format_bytes(data_len)
            byte_counter = 0
            block_size = 1024
            start = time.time()
            while True:
                percent_str = self.calc_percent(byte_counter, data_len)
                eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
                speed_str = self.calc_speed(start, time.time(), byte_counter)
                self.to_stdout('\r[download] %s of %s at %s ETA %s' %
                               (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)

                before = time.time()
                data_block = data.read(block_size)
                after = time.time()
                data_block_len = len(data_block)
                if data_block_len == 0:
                    break
                byte_counter += data_block_len
                stream.write(data_block)
                block_size = self.best_block_size(after - before, data_block_len)

            # Finish the progress line.
            self.to_stdout('')
            if data_len is not None and byte_counter != data_len:
                raise ValueError('Content too short: %s/%s bytes' % (byte_counter, data_len))
        finally:
            data.close()
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information from the video (or videos) the URL refers to. This
    information includes the real video URL, the video title and simplified
    title, author and others. It is returned in a list of dictionaries when
    calling its extract() method. It is a list because a URL can refer to
    more than one video (think of playlists). The dictionaries must include
    the following fields:

    id:         Video identifier.
    url:        Final video URL.
    uploader:   Nickname of the video uploader.
    title:      Literal title.
    stitle:     Simplified title.
    ext:        Video filename extension.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods, as well as the suitable() static method.
    Probably, they should also be instantiated and added to the main
    downloader.
    """

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        # Set once _real_initialize() has run, so it runs only once.
        self._ready = False
        self.set_downloader(downloader)

    @staticmethod
    def suitable(url):
        """Receives a URL and returns True if suitable for this IE."""
        return True

    def initialize(self):
        """Initializes an instance (login, etc)."""
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def to_stdout(self, message):
        """Print message unless the downloader is in quiet mode."""
        if self._downloader is None or not self._downloader.get_params().get('quiet', False):
            print(message)

    def to_stderr(self, message):
        """Write message and a newline to stderr."""
        sys.stderr.write('%s\n' % message)

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses.

        The base implementation extracts nothing. It returns an empty
        list so that extract() always gives callers something iterable.
        """
        return []
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    _LOGIN_URL = 'http://www.youtube.com/login?next=/'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/'
    _NETRC_MACHINE = 'youtube'

    def _real_initialize(self):
        """Log in and confirm age if credentials are available.

        Credentials come from the downloader parameters or, when
        'usenetrc' is set, from the .netrc entry for _NETRC_MACHINE.
        Login problems only produce a warning; a failure to confirm age
        exits the program.
        """
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.get_params()

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params.get('password', None)
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is None:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
                # authenticators() gives (login, account, password)
                username = info[0]
                password = info[2]
            except (IOError, netrc.NetrcParseError) as err:
                self.to_stderr('WARNING: parsing .netrc: %s' % str(err))
                return

        if username is None:
            return

        # Log in
        login_form = {
            'current_form': 'loginForm',
            'next': '/',
            'action_login': 'Log In',
            'username': username,
            'password': password,
        }
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
        try:
            self.to_stdout('[youtube] Logging in')
            login_results = urllib2.urlopen(request).read()
            # Being served the login form again means the login was rejected.
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self.to_stderr('WARNING: Unable to log in: bad username or password')
                return
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self.to_stderr('WARNING: Unable to log in: %s' % str(err))
            return

        # Confirm age
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
        try:
            self.to_stdout('[youtube] Confirming age')
            urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            sys.exit('ERROR: Unable to confirm age: %s' % str(err))

    @staticmethod
    def _decode_entity(match):
        """Return the character for one HTML entity match.

        Handles named, decimal and hexadecimal entities. Anything that
        cannot be decoded is returned unchanged.
        """
        entity = match.group(1)
        try:
            if entity[:2] in (u'#x', u'#X'):
                return unichr(int(entity[2:], 16))
            if entity[:1] == u'#':
                return unichr(int(entity[1:]))
            return unichr(htmlentitydefs.name2codepoint[entity])
        except (KeyError, ValueError, OverflowError):
            return match.group(0)

    def _real_extract(self, url):
        """Return a one-element list with the video information for url.

        The element is None when the URL or the watch page cannot be
        understood (an error is written to stderr). The program exits if
        the watch page cannot be downloaded.
        """
        # Extract video id from URL
        mobj = re.match(r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$', url)
        if mobj is None:
            self.to_stderr('ERROR: Invalid URL: %s' % url)
            return [None]
        video_id = mobj.group(2)

        # Downloader parameters
        format_param = None
        if self._downloader is not None:
            params = self._downloader.get_params()
            format_param = params.get('format', None)

        # Extension. The format code may be given as a number or as text.
        video_extension = {'18': 'mp4'}.get(str(format_param), 'flv')

        # Normalize URL, including format
        normalized_url = 'http://www.youtube.com/watch?v=%s' % video_id
        if format_param is not None:
            normalized_url = '%s&fmt=%s' % (normalized_url, format_param)
        request = urllib2.Request(normalized_url, None, std_headers)
        try:
            self.to_stdout('[youtube] %s: Downloading video webpage' % video_id)
            video_webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            sys.exit('ERROR: Unable to download video: %s' % str(err))
        self.to_stdout('[youtube] %s: Extracting video information' % video_id)

        # "t" param
        mobj = re.search(r', "t": "([^"]+)"', video_webpage)
        if mobj is None:
            self.to_stderr('ERROR: Unable to extract "t" parameter')
            return [None]
        video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s' % (video_id, mobj.group(1))
        if format_param is not None:
            video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
        self.to_stdout('[youtube] %s: URL: %s' % (video_id, video_real_url))

        # Uploader
        mobj = re.search(r'More From: ([^<]*)<', video_webpage)
        if mobj is None:
            self.to_stderr('ERROR: Unable to extract uploader nickname')
            return [None]
        video_uploader = mobj.group(1)

        # Title
        mobj = re.search(r'(?im)<title>YouTube - ([^<]*)</title>', video_webpage)
        if mobj is None:
            self.to_stderr('ERROR: Unable to extract video title')
            return [None]
        video_title = mobj.group(1).decode('utf-8')
        video_title = re.sub(r'&(#?\w+);', self._decode_entity, video_title)

        # Simplified title: runs of other characters collapse to one "_"
        simple_title = re.sub(u'([^%s]+)' % simple_title_chars, u'_', video_title)
        simple_title = simple_title.strip(u'_')

        # Return information
        return [{
            'id': video_id,
            'url': video_real_url,
            'uploader': video_uploader,
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension,
        }]
if __name__ == '__main__':
    try:
        # General configuration: one opener with both proxy and cookie
        # support. Installing two openers in a row would only keep the
        # second one.
        urllib2.install_opener(urllib2.build_opener(
            urllib2.ProxyHandler(), urllib2.HTTPCookieProcessor()))

        # Information extractors
        youtube_ie = YoutubeIE()

        # File downloader
        fd = FileDownloader({
            'usenetrc': False,
            'username': None,
            'password': None,
            'quiet': False,
            'format': None,
            'outtmpl': '%(id)s.%(ext)s'
            })
        fd.add_info_extractor(youtube_ie)
        fd.download([
            'http://www.youtube.com/watch?v=t7qdwI7TVe8',
            'http://www.youtube.com/watch?v=IJyn3pRcy_Q',
            'http://www.youtube.com/watch?v=DZRXe1wtC-M',
            ])

    except KeyboardInterrupt:
        sys.exit('\nERROR: Interrupted by user')