2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # License: Public domain code
20 'User-Agent': 'UserAgent: Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9) Gecko/2008052906 Firefox/3.0',
21 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
22 'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
23 'Accept-Language': 'en-us,en;q=0.5',
26 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
28 class FileDownloader(object):
29 """File Downloader class.
31 File downloader objects are the ones responsible for downloading the
32 actual video file and writing it to disk if the user has requested
33 it, among some other tasks. In most cases there should be one per
34 program. As, given a video URL, the downloader doesn't know how to
35 extract all the needed information, task that InfoExtractors do, it
36 has to pass the URL to one of them.
38 For this, file downloader objects have a method that allows
39 InfoExtractors to be registered in a given order. When it is passed
40 a URL, the file downloader hands it to the first InfoExtractor it
41 finds that reports being able to handle it. The InfoExtractor returns
42 all the information to the FileDownloader and the latter downloads the
43 file or does whatever it's instructed to do.
45 File downloaders accept a lot of parameters. In order not to saturate
46 the object constructor with arguments, it receives a dictionary of
47 options instead. These options are available through the get_params()
48 method for the InfoExtractors to use. The FileDownloader also registers
49 itself as the downloader in charge for the InfoExtractors that are
50 added to it, so this is a "mutual registration".
54 username: Username for authentication purposes.
55 password: Password for authentication purposes.
56 usenetrc: Use netrc for authentication instead.
57 quiet: Do not print messages to stdout.
58 simulate: Do not download the video files.
59 format: Video format code.
60 outtmpl: Template for output names.
66 def __init__(self, params):
68 self.set_params(params)
72 """Create directory components in filename. Similar to Unix "mkdir -p"."""
73 components = filename.split(os.sep)
74 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
76 if not os.path.exists(dir):
80 def format_bytes(bytes):
86 exponent = long(math.log(float(bytes), 1024.0))
87 suffix = 'bkMGTPEZY'[exponent]
88 converted = float(bytes) / float(1024**exponent)
89 return '%.2f%s' % (converted, suffix)
92 def calc_percent(byte_counter, data_len):
95 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
98 def calc_eta(start, now, total, current):
102 if current == 0 or dif < 0.001: # One millisecond
104 rate = float(current) / dif
105 eta = long((float(total) - float(current)) / rate)
106 (eta_mins, eta_secs) = divmod(eta, 60)
109 return '%02d:%02d' % (eta_mins, eta_secs)
112 def calc_speed(start, now, bytes):
114 if bytes == 0 or dif < 0.001: # One millisecond
115 return '%10s' % '---b/s'
116 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
119 def best_block_size(elapsed_time, bytes):
120 new_min = max(bytes / 2.0, 1.0)
121 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
122 if elapsed_time < 0.001:
124 rate = bytes / elapsed_time
def set_params(self, params):
    """Store the option dictionary used by this downloader.

    params -- dictionary mapping option names to their values.
              Subclasses of dict are accepted as well.

    Raises ValueError when params is not a dictionary.
    """
    # isinstance() instead of an exact type() comparison, so that dict
    # subclasses (ordered dictionaries, defaultdicts, ...) are not
    # rejected even though they behave like a plain dictionary.
    if not isinstance(params, dict):
        raise ValueError('params: dictionary expected')
    self._params = params
137 def get_params(self):
138 """Get parameters."""
141 def add_info_extractor(self, ie):
142 """Add an InfoExtractor object to the end of the list."""
144 ie.set_downloader(self)
146 def to_stdout(self, message, skip_eol=False):
147 """Print message to stdout if not in quiet mode."""
148 if not self._params.get('quiet', False):
149 sys.stdout.write('%s%s' % (message, ['\n', ''][skip_eol]))
def to_stderr(self, message):
    """Print message to stderr, followed by a newline.

    message -- value to print; it is rendered with '%s' formatting.
    """
    # The operand is wrapped in a one-element tuple on purpose: with a
    # bare operand, a tuple message would be unpacked as several format
    # arguments and raise TypeError instead of being printed.
    sys.stderr.write('%s\n' % (message,))
156 def download(self, url_list):
157 """Download a given list of URLs."""
159 suitable_found = False
161 if not ie.suitable(url):
163 # Suitable InfoExtractor found
164 suitable_found = True
165 results = [x for x in ie.extract(url) if x is not None]
167 if (len(url_list) > 1 or len(results) > 1) and re.search(r'%\(.+?\)s', self._params['outtmpl']) is None:
168 sys.exit('ERROR: fixed output name but more than one file to download')
170 if self._params.get('simulate', False):
173 for result in results:
175 filename = self._params['outtmpl'] % result
176 except (KeyError), err:
177 self.to_stderr('ERROR: invalid output template: %s' % str(err))
180 self.pmkdir(filename)
181 except (OSError, IOError), err:
182 self.to_stderr('ERROR: unable to create directories: %s' % str(err))
185 outstream = open(filename, 'wb')
186 except (OSError, IOError), err:
187 self.to_stderr('ERROR: unable to open for writing: %s' % str(err))
190 self._do_download(outstream, result['url'])
192 except (OSError, IOError), err:
193 self.to_stderr('ERROR: unable to write video data: %s' % str(err))
195 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
196 self.to_stderr('ERROR: unable to download video data: %s' % str(err))
199 if not suitable_found:
200 self.to_stderr('ERROR: no suitable InfoExtractor: %s' % url)
202 def _do_download(self, stream, url):
203 request = urllib2.Request(url, None, std_headers)
204 data = urllib2.urlopen(request)
205 data_len = data.info().get('Content-length', None)
206 data_len_str = self.format_bytes(data_len)
211 percent_str = self.calc_percent(byte_counter, data_len)
212 eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
213 speed_str = self.calc_speed(start, time.time(), byte_counter)
214 self.to_stdout('\r[download] %s of %s at %s ETA %s' %
215 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
218 data_block = data.read(block_size)
220 data_block_len = len(data_block)
221 if data_block_len == 0:
223 byte_counter += data_block_len
224 stream.write(data_block)
225 block_size = self.best_block_size(after - before, data_block_len)
228 if data_len is not None and str(byte_counter) != data_len:
229 raise ValueError('Content too short: %s/%s bytes' % (byte_counter, data_len))
231 class InfoExtractor(object):
232 """Information Extractor class.
234 Information extractors are the classes that, given a URL, extract
235 information from the video (or videos) the URL refers to. This
236 information includes the real video URL, the video title and simplified
237 title, author and others. It is returned in a list of dictionaries when
238 calling its extract() method. It is a list because a URL can refer to
239 more than one video (think of playlists). The dictionaries must include
240 the following fields:
242 id: Video identifier.
243 url: Final video URL.
244 uploader: Nickname of the video uploader.
245 title: Literal title.
246 stitle: Simplified title.
247 ext: Video filename extension.
249 Subclasses of this one should re-define the _real_initialize() and
250 _real_extract() methods, as well as the suitable() static method.
251 Probably, they should also be instantiated and added to the main
258 def __init__(self, downloader=None):
259 """Constructor. Receives an optional downloader."""
261 self.set_downloader(downloader)
265 """Receives a URL and returns True if suitable for this IE."""
268 def initialize(self):
269 """Initializes an instance (login, etc)."""
271 self._real_initialize()
274 def extract(self, url):
275 """Extracts URL information and returns it in list of dicts."""
277 return self._real_extract(url)
def set_downloader(self, downloader):
    """Attach a downloader object to this information extractor.

    downloader -- object to remember; it is stored unchanged in the
                  _downloader attribute, replacing any previous value.
    """
    self._downloader = downloader
283 def to_stdout(self, message):
284 if self._downloader is None or not self._downloader.get_params().get('quiet', False):
def to_stderr(self, message):
    """Print message to stderr, followed by a newline.

    message -- value to print; it is rendered with '%s' formatting.
    """
    # The operand is wrapped in a one-element tuple on purpose: with a
    # bare operand, a tuple message would be unpacked as several format
    # arguments and raise TypeError instead of being printed.
    sys.stderr.write('%s\n' % (message,))
290 def _real_initialize(self):
291 """Real initialization process. Redefine in subclasses."""
294 def _real_extract(self, url):
295 """Real extraction process. Redefine in subclasses."""
298 class YoutubeIE(InfoExtractor):
299 """Information extractor for youtube.com."""
301 _LOGIN_URL = 'http://www.youtube.com/login?next=/'
302 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/'
303 _NETRC_MACHINE = 'youtube'
305 def _real_initialize(self):
306 if self._downloader is None:
311 downloader_params = self._downloader.get_params()
313 # Attempt to use provided username and password or .netrc data
314 if downloader_params.get('username', None) is not None:
315 username = downloader_params['username']
316 password = downloader_params['password']
317 elif downloader_params.get('usenetrc', False):
319 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
324 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
325 except (IOError, netrc.NetrcParseError), err:
326 self.to_stderr('WARNING: parsing .netrc: %s' % str(err))
334 'current_form': 'loginForm',
336 'action_login': 'Log In',
337 'username': username,
338 'password': password,
340 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
342 self.to_stdout('[youtube] Logging in')
343 login_results = urllib2.urlopen(request).read()
344 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
345 self.to_stderr('WARNING: Unable to log in: bad username or password')
347 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
348 self.to_stderr('WARNING: Unable to log in: %s' % str(err))
354 'action_confirm': 'Confirm',
356 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
358 self.to_stdout('[youtube] Confirming age')
359 age_results = urllib2.urlopen(request).read()
360 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
361 sys.exit('ERROR: Unable to confirm age: %s' % str(err))
363 def _real_extract(self, url):
364 # Extract video id from URL
365 mobj = re.match(r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$', url)
367 self.to_stderr('ERROR: Invalid URL: %s' % url)
369 video_id = mobj.group(2)
371 # Downloader parameters
373 if self._downloader is not None:
374 params = self._downloader.get_params()
375 format_param = params.get('format', None)
378 video_extension = {18: 'mp4'}.get(format_param, 'flv')
380 # Normalize URL, including format
381 normalized_url = 'http://www.youtube.com/watch?v=%s' % video_id
382 if format_param is not None:
383 normalized_url = '%s&fmt=%s' % (normalized_url, format_param)
384 request = urllib2.Request(normalized_url, None, std_headers)
386 self.to_stdout('[youtube] %s: Downloading video webpage' % video_id)
387 video_webpage = urllib2.urlopen(request).read()
388 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
389 sys.exit('ERROR: Unable to download video: %s' % str(err))
390 self.to_stdout('[youtube] %s: Extracting video information' % video_id)
393 mobj = re.search(r', "t": "([^"]+)"', video_webpage)
395 self.to_stderr('ERROR: Unable to extract "t" parameter')
397 video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s' % (video_id, mobj.group(1))
398 if format_param is not None:
399 video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
400 self.to_stdout('[youtube] %s: URL: %s' % (video_id, video_real_url))
403 mobj = re.search(r'More From: ([^<]*)<', video_webpage)
405 self.to_stderr('ERROR: Unable to extract uploader nickname')
407 video_uploader = mobj.group(1)
410 mobj = re.search(r'(?im)<title>YouTube - ([^<]*)</title>', video_webpage)
412 self.to_stderr('ERROR: Unable to extract video title')
414 video_title = mobj.group(1).decode('utf-8')
415 video_title = re.sub(u'&(.+?);', lambda x: unichr(htmlentitydefs.name2codepoint[x.group(1)]), video_title)
418 simple_title = re.sub(u'([^%s]+)' % simple_title_chars, u'_', video_title)
419 simple_title = simple_title.strip(u'_')
424 'url': video_real_url,
425 'uploader': video_uploader,
426 'title': video_title,
427 'stitle': simple_title,
428 'ext': video_extension,
431 if __name__ == '__main__':
433 # General configuration
434 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
435 urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
437 # Information extractors
438 youtube_ie = YoutubeIE()
441 fd = FileDownloader({
448 'outtmpl': '%(id)s.%(ext)s'
450 fd.add_info_extractor(youtube_ie)
452 'http://www.youtube.com/watch?v=t7qdwI7TVe8',
453 'http://www.youtube.com/watch?v=IJyn3pRcy_Q',
454 'http://www.youtube.com/watch?v=DZRXe1wtC-M',
457 except KeyboardInterrupt:
458 sys.exit('\nERROR: Interrupted by user')