2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # License: Public domain code
22 # parse_qs was moved from the cgi module to the urlparse module recently.
24 from urlparse import parse_qs
26 from cgi import parse_qs
# NOTE(review): the `std_headers = {` opener (and closer) are elided in this
# excerpt; these entries are the default HTTP headers sent with every request.
	'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.1.2) Gecko/20090729 Firefox/3.5.2',
	'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
	'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
	'Accept-Language': 'en-us,en;q=0.5',
35 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
def preferredencoding():
	"""Get preferred encoding.

	Returns the best encoding scheme for the system, based on
	locale.getpreferredencoding() and some further tweaks.
	"""
	def yield_preferredencoding():
		pref = locale.getpreferredencoding()
		# [generator body elided in this excerpt — presumably yields `pref`,
		# with a fallback when the locale lookup is unusable; TODO confirm]
	# Python 2 generator protocol: take the first (only) yielded value.
	return yield_preferredencoding().next()
class DownloadError(Exception):
	"""Download Error exception.

	This exception may be thrown by FileDownloader objects if they are not
	configured to continue on errors. They will contain the appropriate
	error message.
	"""
class SameFileError(Exception):
	"""Same File exception.

	This exception will be thrown by FileDownloader objects if they detect
	multiple files would have to be downloaded to the same file on disk.
	"""
class PostProcessingError(Exception):
	"""Post Processing exception.

	This exception may be raised by PostProcessor's .run() method to
	indicate an error in the postprocessing task.
	"""
class UnavailableFormatError(Exception):
	"""Unavailable Format exception.

	This exception will be thrown when a video is requested
	in a format that is not available for that video.
	"""
class ContentTooShortError(Exception):
	"""Content Too Short exception.

	This exception may be raised by FileDownloader objects when a file they
	download is too small for what the server announced first, indicating
	the connection was probably interrupted.
	"""
97 def __init__(self, downloaded, expected):
98 self.downloaded = downloaded
99 self.expected = expected
class FileDownloader(object):
	"""File Downloader class.

	File downloader objects are the ones responsible of downloading the
	actual video file and writing it to disk if the user has requested
	it, among some other tasks. In most cases there should be one per
	program. As, given a video URL, the downloader doesn't know how to
	extract all the needed information, task that InfoExtractors do, it
	has to pass the URL to one of them.

	For this, file downloader objects have a method that allows
	InfoExtractors to be registered in a given order. When it is passed
	a URL, the file downloader handles it to the first InfoExtractor it
	finds that reports being able to handle it. The InfoExtractor extracts
	all the information about the video or videos the URL refers to, and
	asks the FileDownloader to process the video information, possibly
	downloading the video.

	File downloaders accept a lot of parameters. In order not to saturate
	the object constructor with arguments, it receives a dictionary of
	options instead. These options are available through the params
	attribute for the InfoExtractors to use. The FileDownloader also
	registers itself as the downloader in charge for the InfoExtractors
	that are added to it, so this is a "mutual registration".

	Available options:

	username: Username for authentication purposes.
	password: Password for authentication purposes.
	usenetrc: Use netrc for authentication instead.
	quiet: Do not print messages to stdout.
	forceurl: Force printing final URL.
	forcetitle: Force printing title.
	simulate: Do not download the video files.
	format: Video format code.
	outtmpl: Template for output names.
	ignoreerrors: Do not stop on download errors.
	ratelimit: Download speed limit, in bytes/sec.
	nooverwrites: Prevent overwriting files.
	continuedl: Try to continue downloads if possible.
	"""

	# Process exit status reported by download(); set to 1 by trouble()
	# when an error is ignored.
	_download_retcode = None

	def __init__(self, params):
		"""Create a FileDownloader object with the given options."""
		# [initialization of the extractor/post-processor lists is elided in
		# this excerpt — TODO confirm]
		self._download_retcode = 0
	def pmkdir(filename):
		"""Create directory components in filename. Similar to Unix "mkdir -p"."""
		components = filename.split(os.sep)
		# Build the list of cumulative path prefixes, excluding the leaf name.
		aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
		aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
		for dir in aggregate:
			if not os.path.exists(dir):
				# [mkdir call elided in this excerpt]
	def format_bytes(bytes):
		"""Format a byte count as a human-readable string, e.g. '1.25M'."""
		# [None-handling elided in this excerpt]
		if type(bytes) is str:
			# [string-input conversion elided in this excerpt]
		# Pick the 1024-based magnitude and its one-letter suffix.
		exponent = long(math.log(bytes, 1024.0))
		suffix = 'bkMGTPEZY'[exponent]
		converted = float(bytes) / float(1024**exponent)
		return '%.2f%s' % (converted, suffix)
	def calc_percent(byte_counter, data_len):
		"""Return download progress as a right-aligned percentage string, e.g. ' 42.0%'."""
		# [guard for unknown data_len elided in this excerpt]
		return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
	def calc_eta(start, now, total, current):
		"""Estimate remaining download time as an 'MM:SS' string."""
		# [elapsed-time computation (`dif`) elided in this excerpt]
		if current == 0 or dif < 0.001: # One millisecond
			# [unknown-ETA return elided in this excerpt]
		rate = float(current) / dif
		eta = long((float(total) - float(current)) / rate)
		(eta_mins, eta_secs) = divmod(eta, 60)
		# [overflow guard for very long ETAs elided in this excerpt]
		return '%02d:%02d' % (eta_mins, eta_secs)
	def calc_speed(start, now, bytes):
		"""Format the average download speed since `start` as a padded string."""
		# [elapsed-time computation (`dif`) elided in this excerpt]
		if bytes == 0 or dif < 0.001: # One millisecond
			return '%10s' % '---b/s'
		return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
	def best_block_size(elapsed_time, bytes):
		"""Choose the next read size, adapting to the observed throughput."""
		new_min = max(bytes / 2.0, 1.0)
		new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
		if elapsed_time < 0.001:
			# [fast-path return elided in this excerpt]
		rate = bytes / elapsed_time
		# [clamping of `rate` between new_min and new_max is elided in this
		# excerpt — TODO confirm]
	def parse_bytes(bytestr):
		"""Parse a string indicating a byte quantity into a long integer."""
		matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
		# [no-match handling elided in this excerpt]
		number = float(matchobj.group(1))
		# Suffix position in 'bkmgtpezy' gives the 1024-exponent ('' matches 'b' -> 0).
		multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
		return long(round(number * multiplier))
		# NOTE(review): the `def verify_url(...)` line is elided in this excerpt;
		# only the body below survives.
		"""Verify a URL is valid and data could be downloaded. Return real data URL."""
		request = urllib2.Request(url, None, std_headers)
		data = urllib2.urlopen(request)
		# [extraction of the final (post-redirect) URL and cleanup elided]
	def add_info_extractor(self, ie):
		"""Add an InfoExtractor object to the end of the list."""
		# [append to the internal extractor list elided in this excerpt]
		# Mutual registration: the extractor reports through this downloader.
		ie.set_downloader(self)
	def add_post_processor(self, pp):
		"""Add a PostProcessor object to the end of the chain."""
		# [append to the internal post-processor chain elided in this excerpt]
		pp.set_downloader(self)
249 def to_stdout(self, message, skip_eol=False):
250 """Print message to stdout if not in quiet mode."""
251 if not self.params.get('quiet', False):
252 print (u'%s%s' % (message, [u'\n', u''][skip_eol])).encode(preferredencoding()),
255 def to_stderr(self, message):
256 """Print message to stderr."""
257 print >>sys.stderr, message.encode(preferredencoding())
259 def fixed_template(self):
260 """Checks if the output template is fixed."""
261 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
	def trouble(self, message=None):
		"""Determine action to take when a download problem appears.

		Depending on if the downloader has been configured to ignore
		download errors or not, this method may throw an exception or
		not when errors are found, after printing the message.
		"""
		if message is not None:
			self.to_stderr(message)
		if not self.params.get('ignoreerrors', False):
			raise DownloadError(message)
		# Error was ignored: remember a non-zero status for download()'s return.
		self._download_retcode = 1
	def slow_down(self, start_time, byte_counter):
		"""Sleep if the download speed is over the rate limit."""
		rate_limit = self.params.get('ratelimit', None)
		if rate_limit is None or byte_counter == 0:
			# [early return elided in this excerpt]
		# [capture of the current time (`now`) elided in this excerpt]
		elapsed = now - start_time
		speed = float(byte_counter) / elapsed
		if speed > rate_limit:
			# Sleep just long enough to bring the average speed back under the cap.
			time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
289 def report_destination(self, filename):
290 """Report destination filename."""
291 self.to_stdout(u'[download] Destination: %s' % filename)
293 def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
294 """Report download progress."""
295 self.to_stdout(u'\r[download] %s of %s at %s ETA %s' %
296 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
298 def report_resuming_byte(self, resume_len):
299 """Report attemtp to resume at given byte."""
300 self.to_stdout(u'[download] Resuming download at byte %s' % resume_len)
302 def report_file_already_downloaded(self, file_name):
303 """Report file has already been fully downloaded."""
304 self.to_stdout(u'[download] %s has already been downloaded' % file_name)
306 def report_unable_to_resume(self):
307 """Report it was impossible to resume download."""
308 self.to_stdout(u'[download] Unable to resume')
	def report_finish(self):
		"""Report download finished."""
		# [final output (closing the progress line) elided in this excerpt]
	def process_info(self, info_dict):
		"""Process a single dictionary returned by an InfoExtractor."""
		# Do nothing else if in simulate mode
		if self.params.get('simulate', False):
			# Verify the URL is downloadable before printing anything.
			# [a `try:` line is elided in this excerpt]
			info_dict['url'] = self.verify_url(info_dict['url'].encode('utf-8')).decode('utf-8')
			except (OSError, IOError, urllib2.URLError, httplib.HTTPException, socket.error), err:
				raise UnavailableFormatError

			# Forced printings: emit title/URL and skip the download.
			if self.params.get('forcetitle', False):
				print info_dict['title'].encode(preferredencoding())
			if self.params.get('forceurl', False):
				print info_dict['url'].encode(preferredencoding())
			# [early return elided in this excerpt]

		# Build the output filename from the template.
		# [a `try:` line is elided in this excerpt]
		template_dict = dict(info_dict)
		# %(epoch)s expands to the current Unix timestamp.
		template_dict['epoch'] = unicode(long(time.time()))
		filename = self.params['outtmpl'] % template_dict
		except (ValueError, KeyError), err:
			self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
		if self.params.get('nooverwrites', False) and os.path.exists(filename):
			self.to_stderr(u'WARNING: file exists: %s; skipping' % filename)
			# [early return elided in this excerpt]

		# [a `try:` line is elided in this excerpt]
		self.pmkdir(filename)
		except (OSError, IOError), err:
			self.trouble('ERROR: unable to create directories: %s' % str(err))
			# [return elided in this excerpt]

		# [a `try:` line is elided in this excerpt]
		success = self._do_download(filename, info_dict['url'].encode('utf-8'))
		except (OSError, IOError), err:
			# Treated as a format problem so callers can retry another format.
			raise UnavailableFormatError
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self.trouble('ERROR: unable to download video data: %s' % str(err))
			# [return elided in this excerpt]
		except (ContentTooShortError, ), err:
			self.trouble('ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
			# [return elided in this excerpt]

		# Run the post-processing chain on the downloaded file.
		# [`if success:` / `try:` lines elided in this excerpt — TODO confirm]
		self.post_process(filename, info_dict)
		except (PostProcessingError), err:
			self.trouble('ERROR: postprocessing: %s' % str(err))
	def download(self, url_list):
		"""Download a given list of URLs."""
		# A fixed (field-less) template cannot name more than one output file.
		if len(url_list) > 1 and self.fixed_template():
			raise SameFileError(self.params['outtmpl'])

		# [the `for url in url_list:` / `for ie in ...:` loop headers are
		# elided in this excerpt]
		suitable_found = False
		# Go to next InfoExtractor if not suitable
		if not ie.suitable(url):
			# [continue elided in this excerpt]

		# Suitable InfoExtractor found
		suitable_found = True

		# Extract information from URL and process it
		# [ie.extract(url) call elided in this excerpt]

		# Suitable InfoExtractor had been found; go to next URL
		# [break elided in this excerpt]

		if not suitable_found:
			self.trouble('ERROR: no suitable InfoExtractor: %s' % url)

		return self._download_retcode
	def post_process(self, filename, ie_info):
		"""Run the postprocessing chain on the given file."""
		# [copy of ie_info into `info` elided in this excerpt]
		info['filepath'] = filename
		# [loop over the registered PostProcessors elided in this excerpt]
	def _download_with_rtmpdump(self, filename, url):
		"""Download an RTMP stream by shelling out to the external rtmpdump tool."""
		self.report_destination(filename)

		# Check for rtmpdump first
		# [a `try:` line is elided in this excerpt]
		subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
		except (OSError, IOError):
			self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
			# [return elided in this excerpt]

		# Download using rtmpdump. rtmpdump returns exit code 2 when
		# the connection was interrumpted and resuming appears to be
		# possible. This is part of rtmpdump's normal usage, AFAIK.
		# '-e' (resume) is appended only when the continuedl option is set.
		retval = subprocess.call(['rtmpdump', '-q', '-r', url, '-o', filename] + [[], ['-e']][self.params.get('continuedl', False)])
		# [a retry-loop header (presumably `while retval == 2:`) is elided in
		# this excerpt — TODO confirm]
		self.to_stdout(u'\r[rtmpdump] %s bytes' % os.path.getsize(filename), skip_eol=True)
		time.sleep(2.0) # This seems to be needed
		retval = subprocess.call(['rtmpdump', '-q', '-e', '-r', url, '-o', filename])
		# [a success check (presumably `if retval == 0:`) is elided — TODO confirm]
		self.to_stdout(u'\r[rtmpdump] %s bytes' % os.path.getsize(filename))
		# [success return elided in this excerpt]
		self.trouble('ERROR: rtmpdump exited with code %d' % retval)
		# [failure return elided in this excerpt]
	def _do_download(self, filename, url):
		"""Download `url` into `filename` over HTTP, resuming when possible."""
		# Attempt to download using rtmpdump
		if url.startswith('rtmp'):
			return self._download_with_rtmpdump(filename, url)

		# Two requests: `basic_request` never carries a Range header; `request`
		# may get one added below for resuming.
		basic_request = urllib2.Request(url, None, std_headers)
		request = urllib2.Request(url, None, std_headers)

		# Establish possible resume length
		if os.path.isfile(filename):
			resume_len = os.path.getsize(filename)
		# [else branch (resume_len = 0) elided in this excerpt]

		# Request parameters in case of being able to resume
		if self.params.get('continuedl', False) and resume_len != 0:
			self.report_resuming_byte(resume_len)
			request.add_header('Range','bytes=%d-' % resume_len)
			# [switch of the file open mode to append is elided — TODO confirm]

		# Establish connection
		# [a `try:` line is elided in this excerpt]
		data = urllib2.urlopen(request)
		except (urllib2.HTTPError, ), err:
			if err.code != 416: # 416 is 'Requested range not satisfiable'
				# [re-raise elided in this excerpt]
			# Unable to resume: retry without the Range header.
			data = urllib2.urlopen(basic_request)
			content_length = data.info()['Content-Length']
			if content_length is not None and long(content_length) == resume_len:
				# Because the file had already been fully downloaded
				self.report_file_already_downloaded(filename)
				# [return elided in this excerpt]
			# Because the server didn't let us
			self.report_unable_to_resume()
			# [reset of the resume state is elided — TODO confirm]

		data_len = data.info().get('Content-length', None)
		data_len_str = self.format_bytes(data_len)
		# [counter/stream initialization and the read-loop header are elided
		# in this excerpt]
		data_block = data.read(block_size)
		# [timing capture (`before`/`after`) elided in this excerpt]
		data_block_len = len(data_block)
		if data_block_len == 0:
			# [loop exit elided in this excerpt]
		byte_counter += data_block_len

		# Open file just in time
		# [`try:` (and presumably a first-iteration check) elided in this excerpt]
		stream = open(filename, open_mode)
		self.report_destination(filename)
		except (OSError, IOError), err:
			self.trouble('ERROR: unable to open for writing: %s' % str(err))
			# [return elided in this excerpt]
		stream.write(data_block)
		# Adapt the next read size to the throughput just observed.
		block_size = self.best_block_size(after - before, data_block_len)

		# Progress message
		percent_str = self.calc_percent(byte_counter, data_len)
		eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
		speed_str = self.calc_speed(start, time.time(), byte_counter)
		self.report_progress(percent_str, data_len_str, speed_str, eta_str)

		# Apply rate limit
		self.slow_down(start, byte_counter)

		# [end of the read loop elided in this excerpt]
		# Note: data_len is the raw header string, so compare against str().
		if data_len is not None and str(byte_counter) != data_len:
			raise ContentTooShortError(byte_counter, long(data_len))
		# [success return elided in this excerpt]
class InfoExtractor(object):
	"""Information Extractor class.

	Information extractors are the classes that, given a URL, extract
	information from the video (or videos) the URL refers to. This
	information includes the real video URL, the video title and simplified
	title, author and others. The information is stored in a dictionary
	which is then passed to the FileDownloader. The FileDownloader
	processes this information possibly downloading the video to the file
	system, among other possible outcomes. The dictionaries must include
	the following fields:

	id: Video identifier.
	url: Final video URL.
	uploader: Nickname of the video uploader.
	title: Literal title.
	stitle: Simplified title.
	ext: Video filename extension.

	Subclasses of this one should re-define the _real_initialize() and
	_real_extract() methods, as well as the suitable() static method.
	Probably, they should also be instantiated and added to the main
	downloader.
	"""
	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		# [a readiness-flag assignment is elided in this excerpt — TODO confirm]
		self.set_downloader(downloader)

	# NOTE(review): the `def suitable(url):` staticmethod line is elided in
	# this excerpt; only its docstring survives below.
		"""Receives a URL and returns True if suitable for this IE."""
	def initialize(self):
		"""Initializes an instance (authentication, etc)."""
		# [a run-once guard around the call is elided in this excerpt — TODO confirm]
		self._real_initialize()
	def extract(self, url):
		"""Extracts URL information and returns it in list of dicts."""
		# [a call ensuring initialization is elided in this excerpt — TODO confirm]
		return self._real_extract(url)
555 def set_downloader(self, downloader):
556 """Sets the downloader for this IE."""
557 self._downloader = downloader
	def _real_initialize(self):
		"""Real initialization process. Redefine in subclasses."""
		# [default no-op body elided in this excerpt]

	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses."""
		# [default no-op body elided in this excerpt]
class YoutubeIE(InfoExtractor):
	"""Information extractor for youtube.com."""

	# Group 1: optional URL prefix (/v/ or watch?v= forms); group 2: video id.
	_VALID_URL = r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
	_LANG_URL = r'http://uk.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
	_LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
	_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
	_NETRC_MACHINE = 'youtube'
	_available_formats = ['37', '22', '35', '18', '5', '17', '13', None] # listed in order of priority for -b flag
	_video_extensions = {
		# [format-code -> extension entries and the closing brace are elided
		# in this excerpt]

	# NOTE(review): the `def suitable(url):` staticmethod line is elided in
	# this excerpt; only its body survives below.
		return (re.match(YoutubeIE._VALID_URL, url) is not None)
	def htmlentity_transform(matchobj):
		"""Transforms an HTML entity to a Unicode character."""
		entity = matchobj.group(1)

		# Known non-numeric HTML entity
		if entity in htmlentitydefs.name2codepoint:
			return unichr(htmlentitydefs.name2codepoint[entity])

		# Numeric character reference, decimal (#160) or hex (#x30C4).
		mobj = re.match(ur'(?u)#(x?\d+)', entity)
		# [an `if mobj is not None:` guard is elided in this excerpt]
		numstr = mobj.group(1)
		if numstr.startswith(u'x'):
			# ['x1F' -> '0x1F' so long(numstr, 16) parses; `base = 16` elided]
			numstr = u'0%s' % numstr
		# [else branch (`base = 10`) elided in this excerpt]
		return unichr(long(numstr, base))

		# Unknown entity in name, return its literal representation
		return (u'&%s;' % entity)
611 def report_lang(self):
612 """Report attempt to set language."""
613 self._downloader.to_stdout(u'[youtube] Setting language')
615 def report_login(self):
616 """Report attempt to log in."""
617 self._downloader.to_stdout(u'[youtube] Logging in')
619 def report_age_confirmation(self):
620 """Report attempt to confirm age."""
621 self._downloader.to_stdout(u'[youtube] Confirming age')
623 def report_video_info_webpage_download(self, video_id):
624 """Report attempt to download video info webpage."""
625 self._downloader.to_stdout(u'[youtube] %s: Downloading video info webpage' % video_id)
627 def report_information_extraction(self, video_id):
628 """Report attempt to extract video information."""
629 self._downloader.to_stdout(u'[youtube] %s: Extracting video information' % video_id)
631 def report_unavailable_format(self, video_id, format):
632 """Report extracted video URL."""
633 self._downloader.to_stdout(u'[youtube] %s: Format %s not available' % (video_id, format))
635 def report_rtmp_download(self):
636 """Indicate the download will use the RTMP protocol."""
637 self._downloader.to_stdout(u'[youtube] RTMP download detected')
	def _real_initialize(self):
		"""Set language, then optionally log in and confirm age on YouTube."""
		if self._downloader is None:
			# [early return elided in this excerpt]

		# [username/password defaults elided in this excerpt]
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			username = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			# [a `try:` line is elided in this excerpt]
			info = netrc.netrc().authenticators(self._NETRC_MACHINE)
			# [unpacking of `info` into username/password elided]
			raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
				# [return elided in this excerpt]

		# Set language so scraped pages come back in a predictable locale.
		request = urllib2.Request(self._LANG_URL, None, std_headers)
		# [`try:` / report_lang() lines elided in this excerpt]
		urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
			# [return elided in this excerpt]

		# No authentication to be performed
		# [a no-credentials early return is elided — TODO confirm]

		# NOTE(review): the `login_form = {` opener is elided in this excerpt.
		'current_form': 'loginForm',
		'action_login': 'Log In',
		'username': username,
		'password': password,
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
		# [`try:` / report_login() lines elided in this excerpt]
		login_results = urllib2.urlopen(request).read()
		if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
			# The login form came back: credentials were rejected.
			self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
			# [return elided in this excerpt]
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
			# [return elided in this excerpt]

		# NOTE(review): the `age_form = {` opener is elided in this excerpt.
		'action_confirm': 'Confirm',
		request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
		# [a `try:` line is elided in this excerpt]
		self.report_age_confirmation()
		age_results = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			# [return elided in this excerpt]
	def _real_extract(self, url):
		"""Fetch video info for a YouTube URL and hand it to the downloader."""
		# Extract video id from URL
		mobj = re.match(self._VALID_URL, url)
		# [an `if mobj is None:` guard line is elided in this excerpt]
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			# [return elided in this excerpt]
		video_id = mobj.group(2)

		# Downloader parameters
		# [quality_index/format_param defaults elided in this excerpt]
		if self._downloader is not None:
			params = self._downloader.params
			format_param = params.get('format', None)
			if format_param == '0':
				# -b flag: start from the best-quality format and fall back
				# through _available_formats on failure.
				format_param = self._available_formats[quality_index]

		# [extension-selection comment elided]
		video_extension = self._video_extensions.get(format_param, 'flv')

		# [a format-retry loop header is elided in this excerpt — TODO confirm]
		video_info_url = 'http://www.youtube.com/get_video_info?&video_id=%s&el=detailpage&ps=default&eurl=&gl=US&hl=en' % video_id
		request = urllib2.Request(video_info_url, None, std_headers)
		# [a `try:` line is elided in this excerpt]
		self.report_video_info_webpage_download(video_id)
		video_info_webpage = urllib2.urlopen(request).read()
		video_info = parse_qs(video_info_webpage)
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
			# [return elided in this excerpt]
		self.report_information_extraction(video_id)

		# The 'token' ("t") parameter is required to build the real video URL.
		if 'token' not in video_info:
			# Attempt to see if YouTube has issued an error message
			if 'reason' not in video_info:
				self._downloader.trouble(u'ERROR: unable to extract "t" parameter for unknown reason')
				# Dump the raw server reply so users can attach it to bug reports.
				stream = open('reportme-ydl-%s.dat' % time.time(), 'wb')
				stream.write(video_info_webpage)
				# [stream.close() / return elided in this excerpt]
			reason = urllib.unquote_plus(video_info['reason'][0])
			self._downloader.trouble(u'ERROR: YouTube said: %s' % reason.decode('utf-8'))
			# [return elided in this excerpt]
		token = urllib.unquote_plus(video_info['token'][0])
		video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s&eurl=&el=detailpage&ps=default&gl=US&hl=en' % (video_id, token)
		if format_param is not None:
			video_real_url = '%s&fmt=%s' % (video_real_url, format_param)

		# Check possible RTMP download
		if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
			self.report_rtmp_download()
			video_real_url = video_info['conn'][0]

		# Uploader nickname.
		if 'author' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			# [return elided in this excerpt]
		video_uploader = urllib.unquote_plus(video_info['author'][0])

		# Title: unquote, decode, expand HTML entities, strip path separators.
		if 'title' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract video title')
			# [return elided in this excerpt]
		video_title = urllib.unquote_plus(video_info['title'][0])
		video_title = video_title.decode('utf-8')
		video_title = re.sub(ur'(?u)&(.+?);', self.htmlentity_transform, video_title)
		video_title = video_title.replace(os.sep, u'%')

		# Simplified title: collapse disallowed character runs to underscores.
		simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
		simple_title = simple_title.strip(ur'_')

		# [a `try:` line is elided in this excerpt]
		# Process video information
		self._downloader.process_info({
			'id': video_id.decode('utf-8'),
			'url': video_real_url.decode('utf-8'),
			'uploader': video_uploader.decode('utf-8'),
			'title': video_title,
			'stitle': simple_title,
			'ext': video_extension.decode('utf-8'),
		# [closing of the dict/call and a return are elided in this excerpt]
		except UnavailableFormatError, err:
			# Requested format failed: fall back through _available_formats.
			if quality_index == len(self._available_formats) - 1:
				# I don't ever expect this to happen
				self._downloader.trouble(u'ERROR: no known formats available for video')
				# [return elided in this excerpt]
			# [else branch / index increment elided in this excerpt]
			self.report_unavailable_format(video_id, format_param)
			format_param = self._available_formats[quality_index]
			# [loop continuation elided in this excerpt]
		self._downloader.trouble('ERROR: format not available for video')
class MetacafeIE(InfoExtractor):
	"""Information Extractor for metacafe.com."""

	# Group 1: video id (may be 'yt-<id>' for YouTube-hosted videos);
	# group 2: URL-embedded simplified title.
	_VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
	_DISCLAIMER = 'http://www.metacafe.com/family_filter/'
	_FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
821 def __init__(self, youtube_ie, downloader=None):
822 InfoExtractor.__init__(self, downloader)
823 self._youtube_ie = youtube_ie
827 return (re.match(MetacafeIE._VALID_URL, url) is not None)
829 def report_disclaimer(self):
830 """Report disclaimer retrieval."""
831 self._downloader.to_stdout(u'[metacafe] Retrieving disclaimer')
833 def report_age_confirmation(self):
834 """Report attempt to confirm age."""
835 self._downloader.to_stdout(u'[metacafe] Confirming age')
837 def report_download_webpage(self, video_id):
838 """Report webpage download."""
839 self._downloader.to_stdout(u'[metacafe] %s: Downloading webpage' % video_id)
841 def report_extraction(self, video_id):
842 """Report information extraction."""
843 self._downloader.to_stdout(u'[metacafe] %s: Extracting information' % video_id)
	def _real_initialize(self):
		"""Visit the disclaimer page, then POST the over-18 family filter form."""
		# Retrieve disclaimer
		request = urllib2.Request(self._DISCLAIMER, None, std_headers)
		# [a `try:` line is elided in this excerpt]
		self.report_disclaimer()
		disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
			# [return elided in this excerpt]

		# NOTE(review): the `disclaimer_form = {` opener is elided in this excerpt.
		'submit': "Continue - I'm over 18",
		request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
		# [a `try:` line is elided in this excerpt]
		self.report_age_confirmation()
		disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			# [return elided in this excerpt]
	def _real_extract(self, url):
		"""Scrape a metacafe watch page and hand the video info to the downloader."""
		# Extract id and simplified title from URL
		mobj = re.match(self._VALID_URL, url)
		# [an `if mobj is None:` guard is elided in this excerpt]
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			# [return elided in this excerpt]

		video_id = mobj.group(1)

		# Check if video comes from YouTube
		mobj2 = re.match(r'^yt-(.*)$', video_id)
		if mobj2 is not None:
			# Delegate the whole extraction to the YouTube extractor.
			self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
			# [return elided in this excerpt]

		simple_title = mobj.group(2).decode('utf-8')
		video_extension = 'flv'

		# Retrieve video webpage to extract further information
		request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
		# [a `try:` line is elided in this excerpt]
		self.report_download_webpage(video_id)
		webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
			# [return elided in this excerpt]

		# Extract URL, uploader and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
		# [an `if mobj is None:` guard is elided in this excerpt]
			self._downloader.trouble(u'ERROR: unable to extract media URL')
			# [return elided in this excerpt]
		mediaURL = urllib.unquote(mobj.group(1))

		# Historical gdaKey handling, kept disabled in the original:
		#mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
		# [more disabled gdaKey lines elided in this excerpt]
		#	self._downloader.trouble(u'ERROR: unable to extract gdaKey')
		#gdaKey = mobj.group(1)
		#video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
		# [the live assignment of video_url is elided — presumably
		# `video_url = mediaURL`; TODO confirm]

		mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
		# [guard elided in this excerpt]
			self._downloader.trouble(u'ERROR: unable to extract title')
			# [return elided in this excerpt]
		video_title = mobj.group(1).decode('utf-8')

		mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
		# [guard elided in this excerpt]
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			# [return elided in this excerpt]
		video_uploader = mobj.group(1)

		# [a `try:` line is elided in this excerpt]
		# Process video information
		self._downloader.process_info({
			'id': video_id.decode('utf-8'),
			'url': video_url.decode('utf-8'),
			'uploader': video_uploader.decode('utf-8'),
			'title': video_title,
			'stitle': simple_title,
			'ext': video_extension.decode('utf-8'),
		# [closing of the dict/call elided in this excerpt]
		except UnavailableFormatError:
			self._downloader.trouble(u'ERROR: format not available for video')
class YoutubeSearchIE(InfoExtractor):
	"""Information Extractor for YouTube search queries."""
	# Query syntax: 'ytsearch:Q' (1 result), 'ytsearchN:Q', 'ytsearchall:Q'.
	_VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
	_TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
	_VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
	_MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
	# Hard cap on how many results 'all' (or a large N) may request.
	_max_youtube_results = 1000
948 def __init__(self, youtube_ie, downloader=None):
949 InfoExtractor.__init__(self, downloader)
950 self._youtube_ie = youtube_ie
954 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
956 def report_download_page(self, query, pagenum):
957 """Report attempt to download playlist page with given number."""
958 self._downloader.to_stdout(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
	def _real_initialize(self):
		# Initialization is delegated to the wrapped YouTube extractor.
		self._youtube_ie.initialize()
	def _real_extract(self, query):
		"""Parse the ytsearch prefix and download the requested number of results."""
		mobj = re.match(self._VALID_QUERY, query)
		# [an `if mobj is None:` guard is elided in this excerpt]
			self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
			# [return elided in this excerpt]

		prefix, query = query.split(':')
		# [an empty-prefix branch is elided in this excerpt — single result]
		self._download_n_results(query, 1)
		# [return elided in this excerpt]
		elif prefix == 'all':
			self._download_n_results(query, self._max_youtube_results)
			# [return elided in this excerpt]
		# [the else branch with a `try: n = long(prefix)` is elided; the lines
		# below handle n <= 0, n too large, and the normal case]
			self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
			# [return elided in this excerpt]
		elif n > self._max_youtube_results:
			self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
			n = self._max_youtube_results
		self._download_n_results(query, n)
		# [return elided in this excerpt]
		except ValueError: # parsing prefix as integer fails
			self._download_n_results(query, 1)
			# [return elided in this excerpt]
	def _download_n_results(self, query, n):
		"""Downloads a specified number of results for a query"""
		# [initialization of video_ids, already_seen, pagenum and the page
		# loop header are elided in this excerpt]

		self.report_download_page(query, pagenum)
		result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
		request = urllib2.Request(result_url, None, std_headers)
		# [a `try:` line is elided in this excerpt]
		page = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
			# [return elided in this excerpt]

		# Extract video identifiers
		for mobj in re.finditer(self._VIDEO_INDICATOR, page):
			# Slice the matched href: 'href="/watch?v=ID"' -> ID ([:-1] drops
			# the trailing quote).
			video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
			if video_id not in already_seen:
				video_ids.append(video_id)
				already_seen.add(video_id)
			if len(video_ids) == n:
				# Specified n videos reached
				for id in video_ids:
					self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
				# [return elided in this excerpt]

		if re.search(self._MORE_PAGES_INDICATOR, page) is None:
			# Last results page: download everything collected and stop.
			for id in video_ids:
				self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
			# [return elided in this excerpt]

		pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
	"""Information Extractor for YouTube playlists."""

	_VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:view_play_list|my_playlists)\?.*?p=([^&]+).*'
	_TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
	_VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
	# Used as a %-formatted substring test against the page text, not as a regex.
	_MORE_PAGES_INDICATOR = r'/view_play_list?p=%s&page=%s'
1037 def __init__(self, youtube_ie, downloader=None):
1038 InfoExtractor.__init__(self, downloader)
1039 self._youtube_ie = youtube_ie
1043 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
1045 def report_download_page(self, playlist_id, pagenum):
1046 """Report attempt to download playlist page with given number."""
1047 self._downloader.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
	def _real_initialize(self):
		# Initialization is delegated to the wrapped YouTube extractor.
		self._youtube_ie.initialize()
	def _real_extract(self, url):
		"""Collect all video ids from the playlist pages and extract each one."""
		# Extract playlist id
		mobj = re.match(self._VALID_URL, url)
		# [an `if mobj is None:` guard is elided in this excerpt]
			self._downloader.trouble(u'ERROR: invalid url: %s' % url)
			# [return elided in this excerpt]

		# Download playlist pages
		playlist_id = mobj.group(1)
		# [initialization of video_ids/pagenum and the page loop header are
		# elided in this excerpt]
		self.report_download_page(playlist_id, pagenum)
		request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
		# [a `try:` line is elided in this excerpt]
		page = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
			# [return elided in this excerpt]

		# Extract video identifiers
		# [initialization of ids_in_page elided in this excerpt]
		for mobj in re.finditer(self._VIDEO_INDICATOR, page):
			if mobj.group(1) not in ids_in_page:
				ids_in_page.append(mobj.group(1))
		video_ids.extend(ids_in_page)

		# Stop when no link to the next page appears in the page text.
		if (self._MORE_PAGES_INDICATOR % (playlist_id.upper(), pagenum + 1)) not in page:
			# [loop exit elided in this excerpt]
		pagenum = pagenum + 1

		for id in video_ids:
			self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1088 class YoutubeUserIE(InfoExtractor):
1089 """Information Extractor for YouTube users."""
1091 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)'
1092 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1093 _VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this.
1096 def __init__(self, youtube_ie, downloader=None):
1097 InfoExtractor.__init__(self, downloader)
1098 self._youtube_ie = youtube_ie
1102 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
1104 def report_download_page(self, username):
1105 """Report attempt to download user page."""
1106 self._downloader.to_stdout(u'[youtube] user %s: Downloading page ' % (username))
1108 def _real_initialize(self):
1109 self._youtube_ie.initialize()
1111 def _real_extract(self, url):
1113 mobj = re.match(self._VALID_URL, url)
1115 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1118 # Download user page
1119 username = mobj.group(1)
1123 self.report_download_page(username)
1124 request = urllib2.Request(self._TEMPLATE_URL % (username), None, std_headers)
1126 page = urllib2.urlopen(request).read()
1127 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1128 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1131 # Extract video identifiers
1134 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1135 if mobj.group(1) not in ids_in_page:
1136 ids_in_page.append(mobj.group(1))
1137 video_ids.extend(ids_in_page)
1139 for id in video_ids:
1140 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
class PostProcessor(object):
	"""Base class for elements of a downloader's post-processing chain.

	PostProcessor objects are registered on a downloader via its
	add_post_processor() method. Once a download finishes successfully,
	the downloader walks its chain of PostProcessors, feeding the first
	one an initial information dictionary and each subsequent one the
	value returned by its predecessor.

	The chain stops as soon as a PostProcessor returns None, or when the
	last element has run.

	Like InfoExtractor objects, PostProcessors take part in a "mutual
	registration" scheme with their downloader.
	"""

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Attach the given downloader as the one this PP belongs to."""
		self._downloader = downloader

	def run(self, information):
		"""Execute this post-processing step.

		The "information" argument is a dictionary shaped like the ones
		produced by InfoExtractors, extended with a "filepath" field
		pointing at the downloaded file.

		Returning None halts the post-processing chain; returning an
		information dictionary (possibly the received one with some
		fields changed) passes it along to the next PP in the chain.
		A PostProcessingError may also be raised to signal failure to
		the downloader.
		"""
		# Base-class behavior: forward the dictionary unchanged.
		return information
### MAIN PROGRAM ###
if __name__ == '__main__':
	try:
		# Modules needed only when running the main program
		import getpass
		import optparse

		# Function to update the program file with the latest version from bitbucket.org
		def update_self(downloader, filename):
			# Note: downloader only used for options
			if not os.access (filename, os.W_OK):
				sys.exit('ERROR: no write permissions on %s' % filename)

			downloader.to_stdout('Updating to latest stable version...')
			latest_url = 'http://bitbucket.org/rg3/youtube-dl/raw/tip/LATEST_VERSION'
			latest_version = urllib.urlopen(latest_url).read().strip()
			prog_url = 'http://bitbucket.org/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
			newcontent = urllib.urlopen(prog_url).read()
			stream = open(filename, 'w')
			try:
				stream.write(newcontent)
			finally:
				# Make sure the program file is closed even if write() fails
				stream.close()
			downloader.to_stdout('Updated to version %s' % latest_version)

		# General configuration
		urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
		urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
		socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

		# Parse command line
		parser = optparse.OptionParser(
			usage='Usage: %prog [options] url...',
			version='2010.01.05',
			conflict_handler='resolve',
			)

		parser.add_option('-h', '--help',
				action='help', help='print this help text and exit')
		parser.add_option('-v', '--version',
				action='version', help='print program version and exit')
		parser.add_option('-U', '--update',
				action='store_true', dest='update_self', help='update this program to latest stable version')
		parser.add_option('-i', '--ignore-errors',
				action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
		parser.add_option('-r', '--rate-limit',
				dest='ratelimit', metavar='L', help='download rate limit (e.g. 50k or 44.6m)')

		authentication = optparse.OptionGroup(parser, 'Authentication Options')
		authentication.add_option('-u', '--username',
				dest='username', metavar='UN', help='account username')
		authentication.add_option('-p', '--password',
				dest='password', metavar='PW', help='account password')
		authentication.add_option('-n', '--netrc',
				action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
		parser.add_option_group(authentication)

		video_format = optparse.OptionGroup(parser, 'Video Format Options')
		video_format.add_option('-f', '--format',
				action='store', dest='format', metavar='FMT', help='video format code')
		video_format.add_option('-b', '--best-quality',
				action='store_const', dest='format', help='download the best quality video possible', const='0')
		video_format.add_option('-m', '--mobile-version',
				action='store_const', dest='format', help='alias for -f 17', const='17')
		video_format.add_option('-d', '--high-def',
				action='store_const', dest='format', help='alias for -f 22', const='22')
		parser.add_option_group(video_format)

		verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
		verbosity.add_option('-q', '--quiet',
				action='store_true', dest='quiet', help='activates quiet mode', default=False)
		verbosity.add_option('-s', '--simulate',
				action='store_true', dest='simulate', help='do not download video', default=False)
		verbosity.add_option('-g', '--get-url',
				action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
		verbosity.add_option('-e', '--get-title',
				action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
		parser.add_option_group(verbosity)

		filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
		filesystem.add_option('-t', '--title',
				action='store_true', dest='usetitle', help='use title in file name', default=False)
		filesystem.add_option('-l', '--literal',
				action='store_true', dest='useliteral', help='use literal title in file name', default=False)
		filesystem.add_option('-o', '--output',
				dest='outtmpl', metavar='TPL', help='output filename template')
		filesystem.add_option('-a', '--batch-file',
				dest='batchfile', metavar='F', help='file containing URLs to download')
		filesystem.add_option('-w', '--no-overwrites',
				action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
		filesystem.add_option('-c', '--continue',
				action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
		parser.add_option_group(filesystem)

		(opts, args) = parser.parse_args()

		# Batch file verification
		batchurls = []
		if opts.batchfile is not None:
			try:
				batchfd = open(opts.batchfile, 'r')
				try:
					batchurls = [x.strip() for x in batchfd.readlines()]
					batchurls = [x for x in batchurls if len(x) > 0]
				finally:
					# Close the batch file instead of leaking the handle
					batchfd.close()
			except IOError:
				sys.exit(u'ERROR: batch file could not be read')
		all_urls = batchurls + args

		# Conflicting, missing and erroneous options
		if opts.usenetrc and (opts.username is not None or opts.password is not None):
			parser.error(u'using .netrc conflicts with giving username/password')
		if opts.password is not None and opts.username is None:
			parser.error(u'account username missing')
		if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
			parser.error(u'using output template conflicts with using title or literal title')
		if opts.usetitle and opts.useliteral:
			parser.error(u'using title conflicts with using literal title')
		if opts.username is not None and opts.password is None:
			opts.password = getpass.getpass(u'Type account password and press return:')
		if opts.ratelimit is not None:
			numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
			if numeric_limit is None:
				parser.error(u'invalid rate limit specified')
			opts.ratelimit = numeric_limit

		# Information extractors
		youtube_ie = YoutubeIE()
		metacafe_ie = MetacafeIE(youtube_ie)
		youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
		youtube_user_ie = YoutubeUserIE(youtube_ie)
		youtube_search_ie = YoutubeSearchIE(youtube_ie)

		# File downloader
		fd = FileDownloader({
			'usenetrc': opts.usenetrc,
			'username': opts.username,
			'password': opts.password,
			'quiet': (opts.quiet or opts.geturl or opts.gettitle),
			'forceurl': opts.geturl,
			'forcetitle': opts.gettitle,
			'simulate': (opts.simulate or opts.geturl or opts.gettitle),
			'format': opts.format,
			'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
				or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
				or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
				or u'%(id)s.%(ext)s'),
			'ignoreerrors': opts.ignoreerrors,
			'ratelimit': opts.ratelimit,
			'nooverwrites': opts.nooverwrites,
			'continuedl': opts.continue_dl,
			})
		# More specific extractors first: the generic YoutubeIE goes last
		fd.add_info_extractor(youtube_search_ie)
		fd.add_info_extractor(youtube_pl_ie)
		fd.add_info_extractor(youtube_user_ie)
		fd.add_info_extractor(metacafe_ie)
		fd.add_info_extractor(youtube_ie)

		# Update version
		if opts.update_self:
			update_self(fd, sys.argv[0])

		# Maybe do nothing
		if len(all_urls) < 1:
			if not opts.update_self:
				parser.error(u'you must provide at least one URL')
			else:
				sys.exit()

		retcode = fd.download(all_urls)
		# Propagate the download outcome through the process exit status
		sys.exit(retcode)

	except DownloadError:
		sys.exit(1)
	except SameFileError:
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except KeyboardInterrupt:
		sys.exit(u'\nERROR: Interrupted by user')