gitweb @ CieloNegro.org — repository youtube-dl.git, blob view of the file "youtube-dl"
Commit message: "Put Danny Colligan as an author in the script itself"
Path: [youtube-dl.git] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # License: Public domain code
6 import htmlentitydefs
7 import httplib
8 import locale
9 import math
10 import netrc
11 import os
12 import os.path
13 import re
14 import socket
15 import string
16 import sys
17 import time
18 import urllib
19 import urllib2
20
# Default HTTP headers sent with every request; they imitate a regular
# Firefox 3 browser so that sites serve their normal web pages.
std_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.5) Gecko/2008120122 Firefox/3.0.5',
        'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
        'Accept-Language': 'en-us,en;q=0.5',
}
27
28 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
29
class DownloadError(Exception):
        """Download Error exception.

        Raised by FileDownloader objects when a download problem occurs and
        they have not been configured to continue on errors; carries the
        relevant error message.
        """
        pass
38
class SameFileError(Exception):
        """Same File exception.

        Raised by FileDownloader objects when they detect that several
        downloads would end up writing to the very same file on disk.
        """
        pass
46
class PostProcessingError(Exception):
        """Post Processing exception.

        Raised from a PostProcessor's .run() method to signal that the
        postprocessing task failed.
        """
        pass
54
55 class FileDownloader(object):
56         """File Downloader class.
57
58         File downloader objects are the ones responsible of downloading the
59         actual video file and writing it to disk if the user has requested
60         it, among some other tasks. In most cases there should be one per
61         program. As, given a video URL, the downloader doesn't know how to
62         extract all the needed information, task that InfoExtractors do, it
63         has to pass the URL to one of them.
64
65         For this, file downloader objects have a method that allows
66         InfoExtractors to be registered in a given order. When it is passed
67         a URL, the file downloader handles it to the first InfoExtractor it
68         finds that reports being able to handle it. The InfoExtractor returns
69         all the information to the FileDownloader and the latter downloads the
70         file or does whatever it's instructed to do.
71
72         File downloaders accept a lot of parameters. In order not to saturate
73         the object constructor with arguments, it receives a dictionary of
74         options instead. These options are available through the get_params()
75         method for the InfoExtractors to use. The FileDownloader also registers
76         itself as the downloader in charge for the InfoExtractors that are
77         added to it, so this is a "mutual registration".
78
79         Available options:
80
81         username:       Username for authentication purposes.
82         password:       Password for authentication purposes.
83         usenetrc:       Use netrc for authentication instead.
84         quiet:          Do not print messages to stdout.
85         forceurl:       Force printing final URL.
86         forcetitle:     Force printing title.
87         simulate:       Do not download the video files.
88         format:         Video format code.
89         outtmpl:        Template for output names.
90         ignoreerrors:   Do not stop on download errors.
91         ratelimit:      Download speed limit, in bytes/sec.
92         """
93
94         _params = None
95         _ies = []
96         _pps = []
97
98         def __init__(self, params):
99                 """Create a FileDownloader object with the given options."""
100                 self._ies = []
101                 self._pps = []
102                 self.set_params(params)
103         
104         @staticmethod
105         def pmkdir(filename):
106                 """Create directory components in filename. Similar to Unix "mkdir -p"."""
107                 components = filename.split(os.sep)
108                 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
109                 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
110                 for dir in aggregate:
111                         if not os.path.exists(dir):
112                                 os.mkdir(dir)
113         
114         @staticmethod
115         def format_bytes(bytes):
116                 if bytes is None:
117                         return 'N/A'
118                 if bytes == 0:
119                         exponent = 0
120                 else:
121                         exponent = long(math.log(float(bytes), 1024.0))
122                 suffix = 'bkMGTPEZY'[exponent]
123                 converted = float(bytes) / float(1024**exponent)
124                 return '%.2f%s' % (converted, suffix)
125
126         @staticmethod
127         def calc_percent(byte_counter, data_len):
128                 if data_len is None:
129                         return '---.-%'
130                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
131
132         @staticmethod
133         def calc_eta(start, now, total, current):
134                 if total is None:
135                         return '--:--'
136                 dif = now - start
137                 if current == 0 or dif < 0.001: # One millisecond
138                         return '--:--'
139                 rate = float(current) / dif
140                 eta = long((float(total) - float(current)) / rate)
141                 (eta_mins, eta_secs) = divmod(eta, 60)
142                 if eta_mins > 99:
143                         return '--:--'
144                 return '%02d:%02d' % (eta_mins, eta_secs)
145
146         @staticmethod
147         def calc_speed(start, now, bytes):
148                 dif = now - start
149                 if bytes == 0 or dif < 0.001: # One millisecond
150                         return '%10s' % '---b/s'
151                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
152
153         @staticmethod
154         def best_block_size(elapsed_time, bytes):
155                 new_min = max(bytes / 2.0, 1.0)
156                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
157                 if elapsed_time < 0.001:
158                         return int(new_max)
159                 rate = bytes / elapsed_time
160                 if rate > new_max:
161                         return int(new_max)
162                 if rate < new_min:
163                         return int(new_min)
164                 return int(rate)
165
166         @staticmethod
167         def parse_bytes(bytestr):
168                 """Parse a string indicating a byte quantity into a long integer."""
169                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
170                 if matchobj is None:
171                         return None
172                 number = float(matchobj.group(1))
173                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
174                 return long(round(number * multiplier))
175
176         def set_params(self, params):
177                 """Sets parameters."""
178                 if type(params) != dict:
179                         raise ValueError('params: dictionary expected')
180                 self._params = params
181         
182         def get_params(self):
183                 """Get parameters."""
184                 return self._params
185
186         def add_info_extractor(self, ie):
187                 """Add an InfoExtractor object to the end of the list."""
188                 self._ies.append(ie)
189                 ie.set_downloader(self)
190         
191         def add_post_processor(self, pp):
192                 """Add a PostProcessor object to the end of the chain."""
193                 self._pps.append(pp)
194                 pp.set_downloader(self)
195         
196         def to_stdout(self, message, skip_eol=False):
197                 """Print message to stdout if not in quiet mode."""
198                 if not self._params.get('quiet', False):
199                         print u'%s%s' % (message, [u'\n', u''][skip_eol]),
200                         sys.stdout.flush()
201         
202         def to_stderr(self, message):
203                 """Print message to stderr."""
204                 print >>sys.stderr, message
205         
206         def fixed_template(self):
207                 """Checks if the output template is fixed."""
208                 return (re.search(ur'(?u)%\(.+?\)s', self._params['outtmpl']) is None)
209
210         def trouble(self, message=None):
211                 """Determine action to take when a download problem appears.
212
213                 Depending on if the downloader has been configured to ignore
214                 download errors or not, this method may throw an exception or
215                 not when errors are found, after printing the message. If it
216                 doesn't raise, it returns an error code suitable to be returned
217                 later as a program exit code to indicate error.
218                 """
219                 if message is not None:
220                         self.to_stderr(message)
221                 if not self._params.get('ignoreerrors', False):
222                         raise DownloadError(message)
223                 return 1
224
225         def slow_down(self, start_time, byte_counter):
226                 """Sleep if the download speed is over the rate limit."""
227                 rate_limit = self._params.get('ratelimit', None)
228                 if rate_limit is None or byte_counter == 0:
229                         return
230                 now = time.time()
231                 elapsed = now - start_time
232                 if elapsed <= 0.0:
233                         return
234                 speed = float(byte_counter) / elapsed
235                 if speed > rate_limit:
236                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
237
238         def report_destination(self, filename):
239                 """Report destination filename."""
240                 self.to_stdout(u'[download] Destination: %s' % filename)
241         
242         def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
243                 """Report download progress."""
244                 self.to_stdout(u'\r[download] %s of %s at %s ETA %s' %
245                                 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
246         
247         def report_finish(self):
248                 """Report download finished."""
249                 self.to_stdout(u'')
250
251         def download(self, url_list):
252                 """Download a given list of URLs."""
253                 retcode = 0
254                 if len(url_list) > 1 and self.fixed_template():
255                         raise SameFileError(self._params['outtmpl'])
256
257                 for url in url_list:
258                         suitable_found = False
259                         for ie in self._ies:
260                                 if not ie.suitable(url):
261                                         continue
262                                 # Suitable InfoExtractor found
263                                 suitable_found = True
264                                 all_results = ie.extract(url)
265                                 results = [x for x in all_results if x is not None]
266                                 if len(results) != len(all_results):
267                                         retcode = self.trouble()
268
269                                 if len(results) > 1 and self.fixed_template():
270                                         raise SameFileError(self._params['outtmpl'])
271
272                                 for result in results:
273                                         # Forced printings
274                                         if self._params.get('forcetitle', False):
275                                                 print result['title']
276                                         if self._params.get('forceurl', False):
277                                                 print result['url']
278                                                 
279                                         # Do nothing else if in simulate mode
280                                         if self._params.get('simulate', False):
281                                                 continue
282
283                                         try:
284                                                 filename = self._params['outtmpl'] % result
285                                                 self.report_destination(filename)
286                                         except (ValueError, KeyError), err:
287                                                 retcode = self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
288                                                 continue
289                                         try:
290                                                 self.pmkdir(filename)
291                                         except (OSError, IOError), err:
292                                                 retcode = self.trouble('ERROR: unable to create directories: %s' % str(err))
293                                                 continue
294                                         try:
295                                                 outstream = open(filename, 'wb')
296                                         except (OSError, IOError), err:
297                                                 retcode = self.trouble('ERROR: unable to open for writing: %s' % str(err))
298                                                 continue
299                                         try:
300                                                 self._do_download(outstream, result['url'])
301                                                 outstream.close()
302                                         except (OSError, IOError), err:
303                                                 retcode = self.trouble('ERROR: unable to write video data: %s' % str(err))
304                                                 continue
305                                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
306                                                 retcode = self.trouble('ERROR: unable to download video data: %s' % str(err))
307                                                 continue
308                                         try:
309                                                 self.post_process(filename, result)
310                                         except (PostProcessingError), err:
311                                                 retcode = self.trouble('ERROR: postprocessing: %s' % str(err))
312                                                 continue
313
314                                 break
315                         if not suitable_found:
316                                 retcode = self.trouble('ERROR: no suitable InfoExtractor: %s' % url)
317
318                 return retcode
319
320         def post_process(self, filename, ie_info):
321                 """Run the postprocessing chain on the given file."""
322                 info = dict(ie_info)
323                 info['filepath'] = filename
324                 for pp in self._pps:
325                         info = pp.run(info)
326                         if info is None:
327                                 break
328         
329         def _do_download(self, stream, url):
330                 request = urllib2.Request(url, None, std_headers)
331                 data = urllib2.urlopen(request)
332                 data_len = data.info().get('Content-length', None)
333                 data_len_str = self.format_bytes(data_len)
334                 byte_counter = 0
335                 block_size = 1024
336                 start = time.time()
337                 while True:
338                         # Progress message
339                         percent_str = self.calc_percent(byte_counter, data_len)
340                         eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
341                         speed_str = self.calc_speed(start, time.time(), byte_counter)
342                         self.report_progress(percent_str, data_len_str, speed_str, eta_str)
343
344                         # Download and write
345                         before = time.time()
346                         data_block = data.read(block_size)
347                         after = time.time()
348                         data_block_len = len(data_block)
349                         if data_block_len == 0:
350                                 break
351                         byte_counter += data_block_len
352                         stream.write(data_block)
353                         block_size = self.best_block_size(after - before, data_block_len)
354
355                         # Apply rate limit
356                         self.slow_down(start, byte_counter)
357
358                 self.report_finish()
359                 if data_len is not None and str(byte_counter) != data_len:
360                         raise ValueError('Content too short: %s/%s bytes' % (byte_counter, data_len))
361
class InfoExtractor(object):
        """Information Extractor class.

        Information extractors are the classes that, given a URL, extract
        information from the video (or videos) the URL refers to. This
        information includes the real video URL, the video title and simplified
        title, author and others. It is returned in a list of dictionaries when
        calling its extract() method. It is a list because a URL can refer to
        more than one video (think of playlists). The dictionaries must include
        the following fields:

        id:             Video identifier.
        url:            Final video URL.
        uploader:       Nickname of the video uploader.
        title:          Literal title.
        stitle:         Simplified title.
        ext:            Video filename extension.

        Subclasses of this one should re-define the _real_initialize() and
        _real_extract() methods, as well as the suitable() static method.
        Probably, they should also be instantiated and added to the main
        downloader.
        """

        # Class-level defaults; rebound per instance in __init__ and
        # set_downloader.
        _ready = False          # True once _real_initialize() has run
        _downloader = None      # FileDownloader in charge of this IE, if any

        def __init__(self, downloader=None):
                """Constructor. Receives an optional downloader."""
                self._ready = False
                self.set_downloader(downloader)

        @staticmethod
        def suitable(url):
                """Receives a URL and returns True if suitable for this IE."""
                # The base class accepts nothing; subclasses override this.
                return False

        def initialize(self):
                """Initializes an instance (authentication, etc)."""
                # Lazy one-shot initialization: _real_initialize() runs at
                # most once per instance.
                if not self._ready:
                        self._real_initialize()
                        self._ready = True

        def extract(self, url):
                """Extracts URL information and returns it in list of dicts."""
                self.initialize()
                return self._real_extract(url)

        def set_downloader(self, downloader):
                """Sets the downloader for this IE."""
                self._downloader = downloader

        def to_stdout(self, message):
                """Print message to stdout if downloader is not in quiet mode."""
                # With no downloader attached there is no 'quiet' option, so
                # the message is always printed.
                if self._downloader is None or not self._downloader.get_params().get('quiet', False):
                        print message

        def to_stderr(self, message):
                """Print message to stderr."""
                print >>sys.stderr, message

        def _real_initialize(self):
                """Real initialization process. Redefine in subclasses."""
                pass

        def _real_extract(self, url):
                """Real extraction process. Redefine in subclasses."""
                pass
430
431 class YoutubeIE(InfoExtractor):
432         """Information extractor for youtube.com."""
433
434         _VALID_URL = r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
435         _LANG_URL = r'http://uk.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
436         _LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
437         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
438         _NETRC_MACHINE = 'youtube'
439
440         @staticmethod
441         def suitable(url):
442                 return (re.match(YoutubeIE._VALID_URL, url) is not None)
443
444         def report_lang(self):
445                 """Report attempt to set language."""
446                 self.to_stdout(u'[youtube] Setting language')
447
448         def report_login(self):
449                 """Report attempt to log in."""
450                 self.to_stdout(u'[youtube] Logging in')
451         
452         def report_age_confirmation(self):
453                 """Report attempt to confirm age."""
454                 self.to_stdout(u'[youtube] Confirming age')
455         
456         def report_webpage_download(self, video_id):
457                 """Report attempt to download webpage."""
458                 self.to_stdout(u'[youtube] %s: Downloading video webpage' % video_id)
459         
460         def report_information_extraction(self, video_id):
461                 """Report attempt to extract video information."""
462                 self.to_stdout(u'[youtube] %s: Extracting video information' % video_id)
463         
464         def report_video_url(self, video_id, video_real_url):
465                 """Report extracted video URL."""
466                 self.to_stdout(u'[youtube] %s: URL: %s' % (video_id, video_real_url))
467
468         def _real_initialize(self):
469                 if self._downloader is None:
470                         return
471
472                 username = None
473                 password = None
474                 downloader_params = self._downloader.get_params()
475
476                 # Attempt to use provided username and password or .netrc data
477                 if downloader_params.get('username', None) is not None:
478                         username = downloader_params['username']
479                         password = downloader_params['password']
480                 elif downloader_params.get('usenetrc', False):
481                         try:
482                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
483                                 if info is not None:
484                                         username = info[0]
485                                         password = info[2]
486                                 else:
487                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
488                         except (IOError, netrc.NetrcParseError), err:
489                                 self.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
490                                 return
491
492                 # No authentication to be performed
493                 if username is None:
494                         return
495
496                 # Set language
497                 request = urllib2.Request(self._LOGIN_URL, None, std_headers)
498                 try:
499                         self.report_lang()
500                         urllib2.urlopen(request).read()
501                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
502                         self.to_stderr(u'WARNING: unable to set language: %s' % str(err))
503                         return
504
505                 # Log in
506                 login_form = {
507                                 'current_form': 'loginForm',
508                                 'next':         '/',
509                                 'action_login': 'Log In',
510                                 'username':     username,
511                                 'password':     password,
512                                 }
513                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
514                 try:
515                         self.report_login()
516                         login_results = urllib2.urlopen(request).read()
517                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
518                                 self.to_stderr(u'WARNING: unable to log in: bad username or password')
519                                 return
520                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
521                         self.to_stderr(u'WARNING: unable to log in: %s' % str(err))
522                         return
523         
524                 # Confirm age
525                 age_form = {
526                                 'next_url':             '/',
527                                 'action_confirm':       'Confirm',
528                                 }
529                 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
530                 try:
531                         self.report_age_confirmation()
532                         age_results = urllib2.urlopen(request).read()
533                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
534                         self.to_stderr(u'ERROR: unable to confirm age: %s' % str(err))
535                         return
536
537         def _real_extract(self, url):
538                 # Extract video id from URL
539                 mobj = re.match(self._VALID_URL, url)
540                 if mobj is None:
541                         self.to_stderr(u'ERROR: invalid URL: %s' % url)
542                         return [None]
543                 video_id = mobj.group(2)
544
545                 # Downloader parameters
546                 format_param = None
547                 if self._downloader is not None:
548                         params = self._downloader.get_params()
549                         format_param = params.get('format', None)
550
551                 # Extension
552                 video_extension = {'18': 'mp4', '17': '3gp'}.get(format_param, 'flv')
553
554                 # Normalize URL, including format
555                 normalized_url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en' % video_id
556                 if format_param is not None:
557                         normalized_url = '%s&fmt=%s' % (normalized_url, format_param)
558                 request = urllib2.Request(normalized_url, None, std_headers)
559                 try:
560                         self.report_webpage_download(video_id)
561                         video_webpage = urllib2.urlopen(request).read()
562                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
563                         self.to_stderr(u'ERROR: unable to download video webpage: %s' % str(err))
564                         return [None]
565                 self.report_information_extraction(video_id)
566                 
567                 # "t" param
568                 mobj = re.search(r', "t": "([^"]+)"', video_webpage)
569                 if mobj is None:
570                         self.to_stderr(u'ERROR: unable to extract "t" parameter')
571                         return [None]
572                 video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s' % (video_id, mobj.group(1))
573                 if format_param is not None:
574                         video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
575                 self.report_video_url(video_id, video_real_url)
576
577                 # uploader
578                 mobj = re.search(r"var watchUsername = '([^']+)';", video_webpage)
579                 if mobj is None:
580                         self.to_stderr(u'ERROR: unable to extract uploader nickname')
581                         return [None]
582                 video_uploader = mobj.group(1)
583
584                 # title
585                 mobj = re.search(r'(?im)<title>YouTube - ([^<]*)</title>', video_webpage)
586                 if mobj is None:
587                         self.to_stderr(u'ERROR: unable to extract video title')
588                         return [None]
589                 video_title = mobj.group(1).decode('utf-8')
590                 video_title = re.sub(ur'(?u)&(.+?);', lambda x: unichr(htmlentitydefs.name2codepoint[x.group(1)]), video_title)
591                 video_title = video_title.replace(os.sep, u'%')
592
593                 # simplified title
594                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
595                 simple_title = simple_title.strip(ur'_')
596
597                 # Return information
598                 return [{
599                         'id':           video_id.decode('utf-8'),
600                         'url':          video_real_url.decode('utf-8'),
601                         'uploader':     video_uploader.decode('utf-8'),
602                         'title':        video_title,
603                         'stitle':       simple_title,
604                         'ext':          video_extension.decode('utf-8'),
605                         }]
606
607 class MetacafeIE(InfoExtractor):
608         """Information Extractor for metacafe.com."""
609
610         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
611         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
612         _youtube_ie = None
613
        def __init__(self, youtube_ie, downloader=None):
                """Constructor. Keeps the YoutubeIE used to delegate YouTube-hosted videos."""
                InfoExtractor.__init__(self, downloader)
                self._youtube_ie = youtube_ie
617
        @staticmethod
        def suitable(url):
                """Return True for metacafe.com watch URLs (see _VALID_URL)."""
                return (re.match(MetacafeIE._VALID_URL, url) is not None)
621
        def report_disclaimer(self):
                """Report disclaimer retrieval."""
                self.to_stdout(u'[metacafe] Retrieving disclaimer')
625
        def report_age_confirmation(self):
                """Report attempt to confirm age."""
                self.to_stdout(u'[metacafe] Confirming age')
629         
        def report_download_webpage(self, video_id):
                """Report webpage download."""
                self.to_stdout(u'[metacafe] %s: Downloading webpage' % video_id)
633         
        def report_extraction(self, video_id):
                """Report information extraction."""
                self.to_stdout(u'[metacafe] %s: Extracting information' % video_id)
637
        def _real_initialize(self):
                """Fetch the family-filter disclaimer page and confirm age.

                Both steps only set server-side state (cookies, presumably —
                TODO confirm urllib2 retains them across requests here); the
                response bodies are read and discarded. Failures are reported
                and abort initialization without raising.
                """
                # Retrieve disclaimer
                request = urllib2.Request(self._DISCLAIMER, None, std_headers)
                try:
                        self.report_disclaimer()
                        disclaimer = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self.to_stderr(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
                        return

                # Confirm age by posting the family-filter form.
                disclaimer_form = {
                        'filters': '0',
                        'submit': "Continue - I'm over 18",
                        }
                request = urllib2.Request('http://www.metacafe.com/', urllib.urlencode(disclaimer_form), std_headers)
                try:
                        self.report_age_confirmation()
                        disclaimer = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self.to_stderr(u'ERROR: unable to confirm age: %s' % str(err))
                        return
660         
661         def _real_extract(self, url):
662                 # Extract id and simplified title from URL
663                 mobj = re.match(self._VALID_URL, url)
664                 if mobj is None:
665                         self.to_stderr(u'ERROR: invalid URL: %s' % url)
666                         return [None]
667
668                 video_id = mobj.group(1)
669
670                 # Check if video comes from YouTube
671                 mobj2 = re.match(r'^yt-(.*)$', video_id)
672                 if mobj2 is not None:
673                         return self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
674
675                 simple_title = mobj.group(2).decode('utf-8')
676                 video_extension = 'flv'
677
678                 # Retrieve video webpage to extract further information
679                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
680                 try:
681                         self.report_download_webpage(video_id)
682                         webpage = urllib2.urlopen(request).read()
683                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
684                         self.to_stderr(u'ERROR: unable retrieve video webpage: %s' % str(err))
685                         return [None]
686
687                 # Extract URL, uploader and title from webpage
688                 self.report_extraction(video_id)
689                 mobj = re.search(r'(?m)"mediaURL":"(http.*?\.flv)"', webpage)
690                 if mobj is None:
691                         self.to_stderr(u'ERROR: unable to extract media URL')
692                         return [None]
693                 mediaURL = mobj.group(1).replace('\\', '')
694
695                 mobj = re.search(r'(?m)"gdaKey":"(.*?)"', webpage)
696                 if mobj is None:
697                         self.to_stderr(u'ERROR: unable to extract gdaKey')
698                         return [None]
699                 gdaKey = mobj.group(1)
700
701                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
702
703                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
704                 if mobj is None:
705                         self.to_stderr(u'ERROR: unable to extract title')
706                         return [None]
707                 video_title = mobj.group(1).decode('utf-8')
708
709                 mobj = re.search(r'(?m)<li id="ChnlUsr">.*?Submitter:<br />(.*?)</li>', webpage)
710                 if mobj is None:
711                         self.to_stderr(u'ERROR: unable to extract uploader nickname')
712                         return [None]
713                 video_uploader = re.sub(r'<.*?>', '', mobj.group(1))
714
715                 # Return information
716                 return [{
717                         'id':           video_id.decode('utf-8'),
718                         'url':          video_url.decode('utf-8'),
719                         'uploader':     video_uploader.decode('utf-8'),
720                         'title':        video_title,
721                         'stitle':       simple_title,
722                         'ext':          video_extension.decode('utf-8'),
723                         }]
724
725
726 class YoutubeSearchIE(InfoExtractor):
727         """Information Extractor for YouTube search queries."""
728         _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
729         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
730         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
731         _MORE_PAGES_INDICATOR = r'>Next</a>'
732         _youtube_ie = None
733
734         def __init__(self, youtube_ie, downloader=None): 
735                 InfoExtractor.__init__(self, downloader)
736                 self._youtube_ie = youtube_ie
737         
738         @staticmethod
739         def suitable(url):
740                 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
741
742         def report_download_page(self, query, pagenum):
743                 """Report attempt to download playlist page with given number."""
744                 self.to_stdout(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
745
746         def _real_initialize(self):
747                 self._youtube_ie.initialize()
748         
749         def _real_extract(self, query):
750                 mobj = re.match(self._VALID_QUERY, query)
751                 if mobj is None:
752                         self.to_stderr(u'ERROR: invalid search query "%s"' % query)
753                         return [None]
754
755                 prefix, query = query.split(':')
756                 prefix = prefix[8:]
757                 if prefix == '': 
758                         return self._download_n_results(query, 1)
759                 elif prefix == 'all': 
760                         return self._download_n_results(query, -1)
761                 else: 
762                         try:
763                                 n = int(prefix)
764                                 if n <= 0:
765                                         self.to_stderr(u'ERROR: invalid download number %s for query "%s"' % (n, query))
766                                         return [None]
767                                 return self._download_n_results(query, n)
768                         except ValueError: # parsing prefix as int fails
769                                 return self._download_n_results(query, 1)
770
771         def _download_n_results(self, query, n):
772                 """Downloads a specified number of results for a query"""
773
774                 video_ids = []
775                 already_seen = set()
776                 pagenum = 1
777
778                 while True:
779                         self.report_download_page(query, pagenum)
780                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
781                         request = urllib2.Request(result_url, None, std_headers)
782                         try:
783                                 page = urllib2.urlopen(request).read()
784                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
785                                 self.to_stderr(u'ERROR: unable to download webpage: %s' % str(err))
786                                 return [None]
787
788                         # Extract video identifiers
789                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
790                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
791                                 if video_id not in already_seen:
792                                         video_ids.append(video_id)
793                                         already_seen.add(video_id)
794                                         if len(video_ids) == n:
795                                                 # Specified n videos reached
796                                                 information = []
797                                                 for id in video_ids:
798                                                         information.extend(self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id))
799                                                 return information
800
801                         if self._MORE_PAGES_INDICATOR not in page:
802                                 information = []
803                                 for id in video_ids:
804                                         information.extend(self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id))
805                                 return information
806
807                         pagenum = pagenum + 1
808
809 class YoutubePlaylistIE(InfoExtractor):
810         """Information Extractor for YouTube playlists."""
811
812         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/view_play_list\?p=(.+)'
813         _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
814         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
815         _MORE_PAGES_INDICATOR = r'/view_play_list?p=%s&amp;page=%s'
816         _youtube_ie = None
817
818         def __init__(self, youtube_ie, downloader=None):
819                 InfoExtractor.__init__(self, downloader)
820                 self._youtube_ie = youtube_ie
821         
822         @staticmethod
823         def suitable(url):
824                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
825
826         def report_download_page(self, playlist_id, pagenum):
827                 """Report attempt to download playlist page with given number."""
828                 self.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
829
830         def _real_initialize(self):
831                 self._youtube_ie.initialize()
832         
833         def _real_extract(self, url):
834                 # Extract playlist id
835                 mobj = re.match(self._VALID_URL, url)
836                 if mobj is None:
837                         self.to_stderr(u'ERROR: invalid url: %s' % url)
838                         return [None]
839
840                 # Download playlist pages
841                 playlist_id = mobj.group(1)
842                 video_ids = []
843                 pagenum = 1
844
845                 while True:
846                         self.report_download_page(playlist_id, pagenum)
847                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
848                         try:
849                                 page = urllib2.urlopen(request).read()
850                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
851                                 self.to_stderr(u'ERROR: unable to download webpage: %s' % str(err))
852                                 return [None]
853
854                         # Extract video identifiers
855                         ids_in_page = []
856                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
857                                 if mobj.group(1) not in ids_in_page:
858                                         ids_in_page.append(mobj.group(1))
859                         video_ids.extend(ids_in_page)
860
861                         if (self._MORE_PAGES_INDICATOR % (playlist_id, pagenum + 1)) not in page:
862                                 break
863                         pagenum = pagenum + 1
864
865                 information = []
866                 for id in video_ids:
867                         information.extend(self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id))
868                 return information
869
870 class PostProcessor(object):
871         """Post Processor class.
872
873         PostProcessor objects can be added to downloaders with their
874         add_post_processor() method. When the downloader has finished a
875         successful download, it will take its internal chain of PostProcessors
876         and start calling the run() method on each one of them, first with
877         an initial argument and then with the returned value of the previous
878         PostProcessor.
879
880         The chain will be stopped if one of them ever returns None or the end
881         of the chain is reached.
882
883         PostProcessor objects follow a "mutual registration" process similar
884         to InfoExtractor objects.
885         """
886
887         _downloader = None
888
889         def __init__(self, downloader=None):
890                 self._downloader = downloader
891
892         def to_stdout(self, message):
893                 """Print message to stdout if downloader is not in quiet mode."""
894                 if self._downloader is None or not self._downloader.get_params().get('quiet', False):
895                         print message
896         
897         def to_stderr(self, message):
898                 """Print message to stderr."""
899                 print >>sys.stderr, message
900
901         def set_downloader(self, downloader):
902                 """Sets the downloader for this PP."""
903                 self._downloader = downloader
904         
905         def run(self, information):
906                 """Run the PostProcessor.
907
908                 The "information" argument is a dictionary like the ones
909                 returned by InfoExtractors. The only difference is that this
910                 one has an extra field called "filepath" that points to the
911                 downloaded file.
912
913                 When this method returns None, the postprocessing chain is
914                 stopped. However, this method may return an information
915                 dictionary that will be passed to the next postprocessing
916                 object in the chain. It can be the one it received after
917                 changing some fields.
918
919                 In addition, this method may raise a PostProcessingError
920                 exception that will be taken into account by the downloader
921                 it was called from.
922                 """
923                 return information # by default, do nothing
924         
### MAIN PROGRAM ###
if __name__ == '__main__':
	try:
		# Modules needed only when running the main program
		import getpass
		import optparse

		# General configuration.
		# Fixed: previously two openers were installed back-to-back, so the
		# one carrying the explicit ProxyHandler was immediately discarded.
		# Build a single opener with both handlers instead.
		urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), urllib2.HTTPCookieProcessor()))
		socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

		# Parse command line
		parser = optparse.OptionParser(
				usage='Usage: %prog [options] url...',
				version='2009.01.31',
				conflict_handler='resolve',
				)
		parser.add_option('-h', '--help',
				action='help', help='print this help text and exit')
		parser.add_option('-v', '--version',
				action='version', help='print program version and exit')
		parser.add_option('-u', '--username',
				dest='username', metavar='UN', help='account username')
		parser.add_option('-p', '--password',
				dest='password', metavar='PW', help='account password')
		parser.add_option('-o', '--output',
				dest='outtmpl', metavar='TPL', help='output filename template')
		parser.add_option('-q', '--quiet',
				action='store_true', dest='quiet', help='activates quiet mode', default=False)
		parser.add_option('-s', '--simulate',
				action='store_true', dest='simulate', help='do not download video', default=False)
		parser.add_option('-t', '--title',
				action='store_true', dest='usetitle', help='use title in file name', default=False)
		parser.add_option('-l', '--literal',
				action='store_true', dest='useliteral', help='use literal title in file name', default=False)
		parser.add_option('-n', '--netrc',
				action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
		parser.add_option('-g', '--get-url',
				action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
		parser.add_option('-e', '--get-title',
				action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
		parser.add_option('-f', '--format',
				dest='format', metavar='FMT', help='video format code')
		parser.add_option('-b', '--best-quality',
				action='store_const', dest='format', help='alias for -f 18', const='18')
		parser.add_option('-m', '--mobile-version',
				action='store_const', dest='format', help='alias for -f 17', const='17')
		parser.add_option('-i', '--ignore-errors',
				action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
		parser.add_option('-r', '--rate-limit',
				dest='ratelimit', metavar='L', help='download rate limit (e.g. 50k or 44.6m)')
		parser.add_option('-a', '--batch-file',
				dest='batchfile', metavar='F', help='file containing URLs to download')
		(opts, args) = parser.parse_args()

		# Batch file verification: URLs from the batch file come before
		# those on the command line.
		batchurls = []
		if opts.batchfile is not None:
			try:
				batchurls = [line.strip() for line in open(opts.batchfile, 'r')]
			except IOError:
				sys.exit(u'ERROR: batch file could not be read')
		all_urls = batchurls + args

		# Conflicting, missing and erroneous options
		if len(all_urls) < 1:
			sys.exit(u'ERROR: you must provide at least one URL')
		if opts.usenetrc and (opts.username is not None or opts.password is not None):
			sys.exit(u'ERROR: using .netrc conflicts with giving username/password')
		if opts.password is not None and opts.username is None:
			sys.exit(u'ERROR: account username missing')
		if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
			sys.exit(u'ERROR: using output template conflicts with using title or literal title')
		if opts.usetitle and opts.useliteral:
			sys.exit(u'ERROR: using title conflicts with using literal title')
		if opts.username is not None and opts.password is None:
			opts.password = getpass.getpass(u'Type account password and press return:')
		if opts.ratelimit is not None:
			numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
			if numeric_limit is None:
				sys.exit(u'ERROR: invalid rate limit specified')
			opts.ratelimit = numeric_limit

		# Information extractors
		youtube_ie = YoutubeIE()
		metacafe_ie = MetacafeIE(youtube_ie)
		youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
		youtube_search_ie = YoutubeSearchIE(youtube_ie)

		# File downloader. The output template is decoded with the locale
		# charset so non-ASCII -o arguments work; -g/-e imply quiet+simulate.
		charset = locale.getdefaultlocale()[1]
		if charset is None:
			charset = 'ascii'
		fd = FileDownloader({
			'usenetrc': opts.usenetrc,
			'username': opts.username,
			'password': opts.password,
			'quiet': (opts.quiet or opts.geturl or opts.gettitle),
			'forceurl': opts.geturl,
			'forcetitle': opts.gettitle,
			'simulate': (opts.simulate or opts.geturl or opts.gettitle),
			'format': opts.format,
			'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(charset))
				or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
				or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
				or u'%(id)s.%(ext)s'),
			'ignoreerrors': opts.ignoreerrors,
			'ratelimit': opts.ratelimit,
			})
		# More specific extractors first: search and playlist URLs must be
		# tried before the plain YouTube extractor.
		fd.add_info_extractor(youtube_search_ie)
		fd.add_info_extractor(youtube_pl_ie)
		fd.add_info_extractor(metacafe_ie)
		fd.add_info_extractor(youtube_ie)
		retcode = fd.download(all_urls)
		sys.exit(retcode)

	except DownloadError:
		sys.exit(1)
	except SameFileError:
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except KeyboardInterrupt:
		sys.exit(u'\nERROR: Interrupted by user')