2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # Author: Benjamin Johnson
6 # License: Public domain code
25 # parse_qs was moved from the cgi module to the urlparse module recently.
27 from urlparse import parse_qs
29 from cgi import parse_qs
32 'User-Agent': 'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.11) Gecko/20101019 Firefox/3.6.11',
33 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
34 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
35 'Accept-Language': 'en-us,en;q=0.5',
# Characters considered safe in simplified titles: ASCII letters and digits,
# decoded to unicode so they can be interpolated into unicode regexes.
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    def yield_preferredencoding():
        # Probe the locale's encoding once; if it is missing or unusable
        # fall back to UTF-8, then yield the cached answer forever.
        try:
            pref = locale.getpreferredencoding()
            u'TEST'.encode(pref)
        except:
            pref = 'UTF-8'
        while True:
            yield pref
    # Fixed: the excerpt's inner function had no yield, so calling
    # .next() on its return value could not work.
    return yield_preferredencoding().next()
56 def htmlentity_transform(matchobj):
57 """Transforms an HTML entity to a Unicode character.
59 This function receives a match object and is intended to be used with
60 the re.sub() function.
62 entity = matchobj.group(1)
64 # Known non-numeric HTML entity
65 if entity in htmlentitydefs.name2codepoint:
66 return unichr(htmlentitydefs.name2codepoint[entity])
69 mobj = re.match(ur'(?u)#(x?\d+)', entity)
71 numstr = mobj.group(1)
72 if numstr.startswith(u'x'):
74 numstr = u'0%s' % numstr
77 return unichr(long(numstr, base))
79 # Unknown entity in name, return its literal representation
80 return (u'&%s;' % entity)
82 def sanitize_title(utitle):
83 """Sanitizes a video title so it could be used as part of a filename."""
84 utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
85 return utitle.replace(unicode(os.sep), u'%')
87 def sanitize_open(filename, open_mode):
88 """Try to open the given filename, and slightly tweak it if this fails.
90 Attempts to open the given filename. If this fails, it tries to change
91 the filename slightly, step by step, until it's either able to open it
92 or it fails and raises a final exception, like the standard open()
95 It returns the tuple (stream, definitive_file_name).
99 if sys.platform == 'win32':
101 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
102 return (sys.stdout, filename)
103 stream = open(filename, open_mode)
104 return (stream, filename)
105 except (IOError, OSError), err:
106 # In case of error, try to remove win32 forbidden chars
107 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
109 # An exception here should be caught in the caller
110 stream = open(filename, open_mode)
111 return (stream, filename)
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    def __init__(self, downloaded, expected):
        self.downloaded = downloaded  # bytes actually received
        self.expected = expected      # bytes announced by the server
class FileDownloader(object):
    """File Downloader class.

    File downloader objects are the ones responsible of downloading the
    actual video file and writing it to disk if the user has requested
    it, among some other tasks. In most cases there should be one per
    program. As, given a video URL, the downloader doesn't know how to
    extract all the needed information, task that InfoExtractors do, it
    has to pass the URL to one of them.

    For this, file downloader objects have a method that allows
    InfoExtractors to be registered in a given order. When it is passed
    a URL, the file downloader handles it to the first InfoExtractor it
    finds that reports being able to handle it. The InfoExtractor extracts
    all the information about the video or videos the URL refers to, and
    asks the FileDownloader to process the video information, possibly
    downloading the video.

    File downloaders accept a lot of parameters. In order not to saturate
    the object constructor with arguments, it receives a dictionary of
    options instead. These options are available through the params
    attribute for the InfoExtractors to use. The FileDownloader also
    registers itself as the downloader in charge for the InfoExtractors
    that are added to it, so this is a "mutual registration".

    Available options:

    username: Username for authentication purposes.
    password: Password for authentication purposes.
    usenetrc: Use netrc for authentication instead.
    quiet: Do not print messages to stdout.
    forceurl: Force printing final URL.
    forcetitle: Force printing title.
    forcethumbnail: Force printing thumbnail URL.
    forcedescription: Force printing description.
    simulate: Do not download the video files.
    format: Video format code.
    format_limit: Highest quality format to try.
    outtmpl: Template for output names.
    ignoreerrors: Do not stop on download errors.
    ratelimit: Download speed limit, in bytes/sec.
    nooverwrites: Prevent overwriting files.
    retries: Number of times to retry for HTTP error 5xx
    continuedl: Try to continue downloads if possible.
    noprogress: Do not print the progress bar.
    playliststart: Playlist item to start at.
    playlistend: Playlist item to end at.
    logtostderr: Log messages to stderr instead of stdout.
    """

    # Class-level defaults; real values are set per instance in __init__.
    _download_retcode = None  # exit code download() will eventually return
    _num_downloads = None     # ordinal used for the %(autonumber)s template
def __init__(self, params):
    """Create a FileDownloader object with the given options."""
    # Fixed: self._ies/self._pps/self.params are read by other methods
    # (add_info_extractor, to_screen, download, ...) but were never
    # initialized in the excerpt.
    self._ies = []   # registered InfoExtractors, in priority order
    self._pps = []   # registered PostProcessors, run as a chain
    self._download_retcode = 0
    self._num_downloads = 0
    # logtostderr routes all screen messages to stderr instead of stdout.
    self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
    self.params = params
def pmkdir(filename):
    """Create directory components in filename. Similar to Unix "mkdir -p"."""
    components = filename.split(os.sep)
    # Build every ancestor path: "a/", "a/b/", ... (the last component is
    # the file itself and is intentionally excluded).
    aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
    aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
    for dir in aggregate:
        if not os.path.exists(dir):
            # Fixed: the excerpt checked for existence but never
            # actually created the missing directory.
            os.mkdir(dir)
def format_bytes(bytes):
    """Return a human-readable size string such as '1.23M'.

    Accepts None (returns 'N/A'), numeric strings, ints and floats.
    """
    # Fixed: the excerpt had a dangling 'if type(bytes) is str:' with no
    # body, and no guards for None or zero (math.log(0) raises).
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = long(math.log(bytes, 1024.0))
    suffix = 'bkMGTPEZY'[exponent]
    converted = float(bytes) / float(1024**exponent)
    return '%.2f%s' % (converted, suffix)
def calc_percent(byte_counter, data_len):
    """Return a fixed-width percentage string, or '---.-%' if the total is unknown."""
    # Fixed: data_len comes from the Content-length header and may be
    # None (see _do_download), which would make the division raise.
    if data_len is None:
        return '---.-%'
    return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
def calc_eta(start, now, total, current):
    """Return an 'MM:SS' ETA string, or '--:--' when it cannot be estimated."""
    # Fixed: 'dif' was used but never computed in the excerpt, and the
    # unknown-total / zero-progress guards were missing.
    if total is None:
        return '--:--'
    dif = now - start
    if current == 0 or dif < 0.001: # One millisecond
        return '--:--'
    rate = float(current) / dif
    eta = long((float(total) - float(current)) / rate)
    (eta_mins, eta_secs) = divmod(eta, 60)
    if eta_mins > 99:
        # More than 99 minutes does not fit the MM:SS field.
        return '--:--'
    return '%02d:%02d' % (eta_mins, eta_secs)
def calc_speed(start, now, bytes):
    """Return a fixed-width 'N.NNx/s' speed string for bytes moved since start."""
    # Fixed: 'dif' was used but never computed in the excerpt.
    dif = now - start
    if bytes == 0 or dif < 0.001: # One millisecond
        return '%10s' % '---b/s'
    return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
def best_block_size(elapsed_time, bytes):
    """Pick the next read size from the last block's throughput.

    Moves towards the measured rate but never more than doubles or
    halves the previous size, clamped to [1 byte, 4 MB].
    """
    # Fixed: the excerpt computed 'rate' and then fell off the end,
    # returning None (which would break the next data.read call).
    new_min = max(bytes / 2.0, 1.0)
    new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
    if elapsed_time < 0.001:
        return long(new_max)
    rate = bytes / elapsed_time
    if rate > new_max:
        return long(new_max)
    if rate < new_min:
        return long(new_min)
    return long(rate)
def parse_bytes(bytestr):
    """Parse a string indicating a byte quantity into a long integer.

    Returns None when the string is not a valid quantity.
    """
    matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
    # Fixed: missing guard -- .group() on a failed match would raise
    # AttributeError instead of signalling bad input.
    if matchobj is None:
        return None
    number = float(matchobj.group(1))
    # An empty suffix group lands on 'b' (index 0) -> multiplier 1.
    multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
    return long(round(number * multiplier))
def add_info_extractor(self, ie):
    """Add an InfoExtractor object to the end of the list."""
    # Fixed: the excerpt never appended, contradicting the docstring and
    # leaving download() with nothing to iterate.
    self._ies.append(ie)
    # Mutual registration: the IE needs a back-reference to hand results
    # back to this downloader.
    ie.set_downloader(self)
def add_post_processor(self, pp):
    """Add a PostProcessor object to the end of the chain."""
    # Fixed: the excerpt never appended, so the postprocessing chain
    # would always be empty.
    self._pps.append(pp)
    # Mutual registration, same pattern as add_info_extractor().
    pp.set_downloader(self)
312 def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
313 """Print message to stdout if not in quiet mode."""
315 if not self.params.get('quiet', False):
316 terminator = [u'\n', u''][skip_eol]
317 print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
318 self._screen_file.flush()
319 except (UnicodeEncodeError), err:
320 if not ignore_encoding_errors:
def to_stderr(self, message):
    """Print message to stderr."""
    # Encode with the system's preferred encoding before writing.
    encoded = message.encode(preferredencoding())
    print >>sys.stderr, encoded
327 def fixed_template(self):
328 """Checks if the output template is fixed."""
329 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
def trouble(self, message=None):
    """Determine action to take when a download problem appears.

    Depending on if the downloader has been configured to ignore
    download errors or not, this method may throw an exception or
    not when errors are found, after printing the message.
    """
    if message is not None:
        self.to_stderr(message)
    if not self.params.get('ignoreerrors', False):
        raise DownloadError(message)
    # Only reached in ignore-errors mode: remember the failure so
    # download() can return a non-zero exit code.
    self._download_retcode = 1
def slow_down(self, start_time, byte_counter):
    """Sleep if the download speed is over the rate limit."""
    rate_limit = self.params.get('ratelimit', None)
    if rate_limit is None or byte_counter == 0:
        return
    # Fixed: 'now' was used but never computed in the excerpt, and the
    # zero-elapsed guard was missing (division by zero).
    now = time.time()
    elapsed = now - start_time
    if elapsed <= 0.0:
        return
    speed = float(byte_counter) / elapsed
    if speed > rate_limit:
        # Sleep just long enough for the average speed to drop back
        # under the configured limit.
        time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
def report_destination(self, filename):
    """Report destination filename."""
    # Encoding errors are ignored: an unprintable filename should not
    # abort the download.
    message = u'[download] Destination: %s' % filename
    self.to_screen(message, ignore_encoding_errors=True)
def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
    """Report download progress."""
    # Fixed: the noprogress branch had no suite in the excerpt; it must
    # suppress the progress line entirely.
    if self.params.get('noprogress', False):
        return
    # '\r' + skip_eol keeps rewriting a single progress line in place.
    self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
            (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
def report_resuming_byte(self, resume_len):
    """Report attempt to resume at given byte."""
    message = u'[download] Resuming download at byte %s' % resume_len
    self.to_screen(message)
def report_retry(self, count, retries):
    """Report retry in case of HTTP error 5xx"""
    message = u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries)
    self.to_screen(message)
376 def report_file_already_downloaded(self, file_name):
377 """Report file has already been fully downloaded."""
379 self.to_screen(u'[download] %s has already been downloaded' % file_name)
380 except (UnicodeEncodeError), err:
381 self.to_screen(u'[download] The file has already been downloaded')
def report_unable_to_resume(self):
    """Report it was impossible to resume download."""
    message = u'[download] Unable to resume'
    self.to_screen(message)
def report_finish(self):
    """Report download finished."""
    if self.params.get('noprogress', False):
        self.to_screen(u'[download] Download completed')
    else:
        # Fixed: missing branch -- terminate the in-place progress line
        # (left open by report_progress's skip_eol) with a newline.
        self.to_screen(u'')
def increment_downloads(self):
    """Increment the ordinal that assigns a number to each file."""
    # Feeds the %(autonumber)s output-template field.
    self._num_downloads = self._num_downloads + 1
def process_info(self, info_dict):
    """Process a single dictionary returned by an InfoExtractor."""
    # NOTE(review): this excerpt is missing several lines ('try:'
    # headers, 'return's); comments below flag the gaps rather than
    # guess at them.
    # Do nothing else if in simulate mode
    if self.params.get('simulate', False):
        # Forced printings: lets youtube-dl act as a metadata backend.
        if self.params.get('forcetitle', False):
            print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
        if self.params.get('forceurl', False):
            print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
        if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
            print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
        if self.params.get('forcedescription', False) and 'description' in info_dict:
            print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
        # NOTE(review): a 'return' ending simulate mode appears to be
        # elided here -- confirm against the full source.

    # Build the output filename from the outtmpl template.
    template_dict = dict(info_dict)
    template_dict['epoch'] = unicode(long(time.time()))
    template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
    filename = self.params['outtmpl'] % template_dict
    # NOTE(review): the matching 'try:' for this handler is elided.
    except (ValueError, KeyError), err:
        self.trouble(u'ERROR: invalid system charset or erroneous output template')
    if self.params.get('nooverwrites', False) and os.path.exists(filename):
        self.to_stderr(u'WARNING: file exists and will be skipped')

    self.pmkdir(filename)
    # NOTE(review): matching 'try:' elided.
    except (OSError, IOError), err:
        self.trouble(u'ERROR: unable to create directories: %s' % str(err))

    success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
    # NOTE(review): matching 'try:' elided.
    except (OSError, IOError), err:
        raise UnavailableVideoError
    except (urllib2.URLError, httplib.HTTPException, socket.error), err:
        self.trouble(u'ERROR: unable to download video data: %s' % str(err))
    except (ContentTooShortError, ), err:
        self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))

    # NOTE(review): a 'success' guard around postprocessing appears to
    # be elided -- confirm.
    self.post_process(filename, info_dict)
    # NOTE(review): matching 'try:' elided.
    except (PostProcessingError), err:
        self.trouble(u'ERROR: postprocessing: %s' % str(err))
def download(self, url_list):
    """Download a given list of URLs."""
    # Several URLs with a placeholder-free template would all be written
    # to the same file on disk.
    if len(url_list) > 1 and self.fixed_template():
        raise SameFileError(self.params['outtmpl'])

    # NOTE(review): the 'for url in url_list:' / 'for ie in self._ies:'
    # loop headers are elided from this excerpt.
    suitable_found = False
    # Go to next InfoExtractor if not suitable
    if not ie.suitable(url):
        # NOTE(review): 'continue' elided.

    # Suitable InfoExtractor found
    suitable_found = True

    # Extract information from URL and process it
    # NOTE(review): the ie.extract(url) call is elided.

    # Suitable InfoExtractor had been found; go to next URL

    if not suitable_found:
        self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)

    # 0 on success, 1 if any download failed in ignore-errors mode.
    return self._download_retcode
def post_process(self, filename, ie_info):
    """Run the postprocessing chain on the given file.

    Each PostProcessor receives the info dict (with 'filepath' added);
    a None result stops the chain.
    """
    # Fixed: the excerpt assigned into 'info' without ever creating it,
    # and never iterated the registered postprocessors.
    info = dict(ie_info)
    info['filepath'] = filename
    for pp in self._pps:
        info = pp.run(info)
        if info is None:
            break
def _download_with_rtmpdump(self, filename, url, player_url):
    """Download an rtmp:// url to filename by driving an external rtmpdump."""
    # NOTE(review): several lines ('try:' headers, returns, loop break)
    # are elided from this excerpt; comments flag the gaps.
    self.report_destination(filename)

    # Check for rtmpdump first
    subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
    # NOTE(review): matching 'try:' elided.
    except (OSError, IOError):
        self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')

    # Download using rtmpdump. rtmpdump returns exit code 2 when
    # the connection was interrupted and resuming appears to be
    # possible. This is part of rtmpdump's normal usage, AFAIK.
    basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', filename]
    retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
    while retval == 2 or retval == 1:
        prevsize = os.path.getsize(filename)
        self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
        time.sleep(5.0) # This seems to be needed
        retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
        cursize = os.path.getsize(filename)
        if prevsize == cursize and retval == 1:
            # NOTE(review): loop-exit statement elided (no progress was
            # made, so resuming is pointless).
    self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(filename))
    # NOTE(review): a success 'return True' and the guard putting this
    # error on the failure path only are elided.
    self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
def _do_download(self, filename, url, player_url):
    """Download url to filename over HTTP, resuming and retrying per params.

    NOTE(review): many lines ('try:' headers, counter/variable
    initializations such as count, block_size, start, before/after,
    open_mode, returns) are elided from this excerpt; comments flag the
    gaps rather than guess at them.
    """
    # Attempt to download using rtmpdump
    if url.startswith('rtmp'):
        return self._download_with_rtmpdump(filename, url, player_url)

    # basic_request deliberately carries no Range header: it is reused
    # below to probe the full content length when a resume attempt is
    # rejected with 416.
    basic_request = urllib2.Request(url, None, std_headers)
    request = urllib2.Request(url, None, std_headers)

    # Establish possible resume length
    if os.path.isfile(filename):
        resume_len = os.path.getsize(filename)
    # NOTE(review): 'else: resume_len = 0' elided.

    # Request parameters in case of being able to resume
    if self.params.get('continuedl', False) and resume_len != 0:
        self.report_resuming_byte(resume_len)
        request.add_header('Range','bytes=%d-' % resume_len)

    retries = self.params.get('retries', 0)
    # NOTE(review): initialization of 'count' elided.
    while count <= retries:
        # Establish connection
        data = urllib2.urlopen(request)
        # NOTE(review): 'try:' and the success 'break' are elided.
        except (urllib2.HTTPError, ), err:
            if (err.code < 500 or err.code >= 600) and err.code != 416:
                # Unexpected HTTP error
                # NOTE(review): re-raise elided.
            elif err.code == 416:
                # Unable to resume (requested range not satisfiable)
                # Open the connection again without the range header
                data = urllib2.urlopen(basic_request)
                content_length = data.info()['Content-Length']
                # NOTE(review): inner 'try:' elided.
                except (urllib2.HTTPError, ), err:
                    if err.code < 500 or err.code >= 600:
                        # NOTE(review): re-raise elided.
                # Examine the reported length
                if (content_length is not None and
                    (resume_len - 100 < long(content_length) < resume_len + 100)):
                    # The file had already been fully downloaded.
                    # Explanation to the above condition: in issue #175 it was revealed that
                    # YouTube sometimes adds or removes a few bytes from the end of the file,
                    # changing the file size slightly and causing problems for some users. So
                    # I decided to implement a suggested change and consider the file
                    # completely downloaded if the file size differs less than 100 bytes from
                    # the one in the hard drive.
                    self.report_file_already_downloaded(filename)
                # NOTE(review): success return and 'else:' elided.
                # The length does not match, we start the download over
                self.report_unable_to_resume()
        # NOTE(review): retry-counter increment elided.
        self.report_retry(count, retries)

    # Reached only when every attempt failed.
    self.trouble(u'ERROR: giving up after %s retries' % retries)

    data_len = data.info().get('Content-length', None)
    data_len_str = self.format_bytes(data_len)
    # NOTE(review): the main read/write loop header and the counter /
    # timestamp bookkeeping around it are elided.
    data_block = data.read(block_size)
    data_block_len = len(data_block)
    if data_block_len == 0:
        # NOTE(review): end-of-stream 'break' elided.
    byte_counter += data_block_len

    # Open file just in time
    # NOTE(review): 'try:' and the open-once guard are elided.
    (stream, filename) = sanitize_open(filename, open_mode)
    self.report_destination(filename)
    except (OSError, IOError), err:
        self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
    stream.write(data_block)
    # NOTE(review): 'try:' elided.
    except (IOError, OSError), err:
        self.trouble(u'\nERROR: unable to write data: %s' % str(err))
    # Adapt the next read size to the measured throughput.
    block_size = self.best_block_size(after - before, data_block_len)

    # Progress message
    percent_str = self.calc_percent(byte_counter, data_len)
    eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
    speed_str = self.calc_speed(start, time.time(), byte_counter)
    self.report_progress(percent_str, data_len_str, speed_str, eta_str)

    # Apply rate limit
    self.slow_down(start, byte_counter)

    # Sanity check: the served byte count must match what was announced.
    if data_len is not None and str(byte_counter) != data_len:
        raise ContentTooShortError(byte_counter, long(data_len))
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information from the video (or videos) the URL refers to. This
    information includes the real video URL, the video title and simplified
    title, author and others. The information is stored in a dictionary
    which is then passed to the FileDownloader. The FileDownloader
    processes this information possibly downloading the video to the file
    system, among other possible outcomes. The dictionaries must include
    the following fields:

    id: Video identifier.
    url: Final video URL.
    uploader: Nickname of the video uploader.
    title: Literal title.
    stitle: Simplified title.
    ext: Video filename extension.
    format: Video format.
    player_url: SWF Player URL (may be None).

    The following fields are optional. Their primary purpose is to allow
    youtube-dl to serve as the backend for a video search function, such
    as the one in youtube2mp3. They are only used when their respective
    forced printing functions are called:

    thumbnail: Full URL to a video thumbnail image.
    description: One-line video description.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods, as well as the suitable() static method.
    Probably, they should also be instantiated and added to the main
    downloader.
    """
def __init__(self, downloader=None):
    """Constructor. Receives an optional downloader."""
    # NOTE(review): an initialization line (apparently a readiness flag)
    # is elided from this excerpt before the call below -- confirm.
    self.set_downloader(downloader)

# NOTE(review): the 'def suitable(url):' header (and its decorator) are
# elided from this excerpt; the next line is that method's docstring.
    """Receives a URL and returns True if suitable for this IE."""
def initialize(self):
    """Initializes an instance (authentication, etc)."""
    # NOTE(review): lines are elided around this call in the excerpt
    # (likely a guard so initialization runs only once) -- confirm.
    self._real_initialize()
def extract(self, url):
    """Extracts URL information and returns it in list of dicts."""
    # NOTE(review): a preceding initialize() call appears to be elided
    # from this excerpt -- confirm.
    return self._real_extract(url)
def set_downloader(self, downloader):
    """Sets the downloader for this IE."""
    # Back-reference used by subclasses for reporting and option lookup.
    self._downloader = downloader
def _real_initialize(self):
    """Real initialization process. Redefine in subclasses."""
    # NOTE(review): the no-op body ('pass') is elided from this excerpt.
def _real_extract(self, url):
    """Real extraction process. Redefine in subclasses."""
    # NOTE(review): the no-op body ('pass') is elided from this excerpt.
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Matches youtu.be short links, youtube.com watch pages (including
    # popup/.php//v/ variants) and bare video IDs; group 2 is the ID.
    _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/(?:(?:v/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))))?([0-9A-Za-z_-]+)(?(1).+)?$'
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13']
    # Maps format code -> file extension ('flv' is the default elsewhere).
    # NOTE(review): most entries of this dict (and its closing brace) are
    # elided from this excerpt.
    _video_extensions = {
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever

    # NOTE(review): the 'def suitable(url):' header (and decorator and
    # docstring) are elided; the next line is that method's body.
        return (re.match(YoutubeIE._VALID_URL, url) is not None)
def report_lang(self):
    """Report attempt to set language."""
    message = u'[youtube] Setting language'
    self._downloader.to_screen(message)
def report_login(self):
    """Report attempt to log in."""
    message = u'[youtube] Logging in'
    self._downloader.to_screen(message)
def report_age_confirmation(self):
    """Report attempt to confirm age."""
    message = u'[youtube] Confirming age'
    self._downloader.to_screen(message)
def report_video_webpage_download(self, video_id):
    """Report attempt to download video webpage."""
    message = u'[youtube] %s: Downloading video webpage' % video_id
    self._downloader.to_screen(message)
def report_video_info_webpage_download(self, video_id):
    """Report attempt to download video info webpage."""
    message = u'[youtube] %s: Downloading video info webpage' % video_id
    self._downloader.to_screen(message)
def report_information_extraction(self, video_id):
    """Report attempt to extract video information."""
    message = u'[youtube] %s: Extracting video information' % video_id
    self._downloader.to_screen(message)
def report_unavailable_format(self, video_id, format):
    """Report that the requested video format is not available."""
    message = u'[youtube] %s: Format %s not available' % (video_id, format)
    self._downloader.to_screen(message)
def report_rtmp_download(self):
    """Indicate the download will use the RTMP protocol."""
    message = u'[youtube] RTMP download detected'
    self._downloader.to_screen(message)
def _real_initialize(self):
    """Set language, optionally log in, and confirm age with YouTube.

    NOTE(review): this excerpt is missing many lines ('try:' headers,
    'return's, variable initializations, dict literal delimiters);
    comments below flag the gaps rather than guess at them.
    """
    if self._downloader is None:
        # NOTE(review): early 'return' elided.

    downloader_params = self._downloader.params

    # Attempt to use provided username and password or .netrc data
    if downloader_params.get('username', None) is not None:
        username = downloader_params['username']
        password = downloader_params['password']
    elif downloader_params.get('usenetrc', False):
        # NOTE(review): 'try:' and the credential unpacking from 'info'
        # are elided.
        info = netrc.netrc().authenticators(self._NETRC_MACHINE)
        raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
        except (IOError, netrc.NetrcParseError), err:
            self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))

    # Set language (hl=en) so later scraping sees predictable markup.
    request = urllib2.Request(self._LANG_URL, None, std_headers)
    # NOTE(review): 'try:' and the report_lang() call are elided.
    urllib2.urlopen(request).read()
    except (urllib2.URLError, httplib.HTTPException, socket.error), err:
        self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))

    # No authentication to be performed
    # NOTE(review): the username guard and the 'login_form = {' header
    # and closing '}' are elided; these are the form fields:
        'current_form': 'loginForm',
        'action_login': 'Log In',
        'username': username,
        'password': password,
    request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
    # NOTE(review): 'try:' and report_login() elided.
    login_results = urllib2.urlopen(request).read()
    if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
        # The login form being served back means the POST did not log us in.
        self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
        # NOTE(review): 'return' elided.
    except (urllib2.URLError, httplib.HTTPException, socket.error), err:
        self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))

    # Confirm age
    # NOTE(review): the 'age_form = {' header and closing '}' are elided:
        'action_confirm': 'Confirm',
    request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
    # NOTE(review): 'try:' elided.
    self.report_age_confirmation()
    age_results = urllib2.urlopen(request).read()
    except (urllib2.URLError, httplib.HTTPException, socket.error), err:
        self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
def _real_extract(self, url):
    """Extract and process video information from a YouTube watch URL.

    NOTE(review): this excerpt is missing many lines ('try:' headers,
    'if mobj is None' guards, 'return's, 'else:' branches); comments
    below flag the gaps rather than guess at them.
    """
    # Extract video id from URL
    mobj = re.match(self._VALID_URL, url)
    # NOTE(review): 'if mobj is None:' guard and 'return' elided.
    self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
    video_id = mobj.group(2)

    # Get video webpage (has_verified=1 skips some interstitials).
    self.report_video_webpage_download(video_id)
    request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id, None, std_headers)
    # NOTE(review): 'try:' elided.
    video_webpage = urllib2.urlopen(request).read()
    except (urllib2.URLError, httplib.HTTPException, socket.error), err:
        self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

    # Attempt to extract SWF player URL
    mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
    # NOTE(review): 'if mobj is not None:' guard elided; the sub()
    # un-escapes the JavaScript '\/' sequences.
    player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

    # Get video info: try several 'el' values until one yields a token.
    self.report_video_info_webpage_download(video_id)
    for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
        video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                % (video_id, el_type))
        request = urllib2.Request(video_info_url, None, std_headers)
        # NOTE(review): 'try:' elided.
        video_info_webpage = urllib2.urlopen(request).read()
        video_info = parse_qs(video_info_webpage)
        if 'token' in video_info:
            # NOTE(review): 'break' elided.
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
    if 'token' not in video_info:
        if 'reason' in video_info:
            self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
        # NOTE(review): 'else:' elided.
        self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')

    # Start extracting information
    self.report_information_extraction(video_id)

    # uploader
    if 'author' not in video_info:
        self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        # NOTE(review): 'return' elided.
    video_uploader = urllib.unquote_plus(video_info['author'][0])

    # title
    if 'title' not in video_info:
        self._downloader.trouble(u'ERROR: unable to extract video title')
        # NOTE(review): 'return' elided.
    video_title = urllib.unquote_plus(video_info['title'][0])
    video_title = video_title.decode('utf-8')
    video_title = sanitize_title(video_title)

    # simplified title: runs of non-safe characters collapse to '_'
    simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
    simple_title = simple_title.strip(ur'_')

    # thumbnail image
    if 'thumbnail_url' not in video_info:
        self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
        # NOTE(review): a fallback thumbnail assignment is elided.
    else: # don't panic if we can't find it
        video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

    # upload date
    mobj = re.search(r'id="eow-date".*?>(.*?)</span>', video_webpage, re.DOTALL)
    # NOTE(review): 'if mobj is not None:' elided; passing a list from
    # .split() to strptime below looks suspicious -- confirm the elided
    # join/normalization lines against the full source.
    upload_date = mobj.group(1).split()
    format_expressions = ['%d %B %Y', '%B %d, %Y']
    for expression in format_expressions:
        # NOTE(review): 'try:'/'except' around strptime elided.
        upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

    # description (fetched only when it will actually be printed)
    video_description = 'No description available.'
    if self._downloader.params.get('forcedescription', False):
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
        # NOTE(review): 'if mobj is not None:' elided.
        video_description = mobj.group(1)

    # token
    video_token = urllib.unquote_plus(video_info['token'][0])

    # Decide which formats to download
    requested_format = self._downloader.params.get('format', None)
    get_video_template = 'http://www.youtube.com/get_video?video_id=%s&t=%s&eurl=&el=&ps=&asv=&fmt=%%s' % (video_id, video_token)

    if 'fmt_url_map' in video_info:
        url_map = dict(tuple(pair.split('|')) for pair in video_info['fmt_url_map'][0].split(','))
        format_limit = self._downloader.params.get('format_limit', None)
        if format_limit is not None and format_limit in self._available_formats:
            format_list = self._available_formats[self._available_formats.index(format_limit):]
        # NOTE(review): 'else:' elided.
        format_list = self._available_formats
        existing_formats = [x for x in format_list if x in url_map]
        if len(existing_formats) == 0:
            self._downloader.trouble(u'ERROR: no known formats available for video')
            # NOTE(review): 'return' elided.
        if requested_format is None:
            video_url_list = [(existing_formats[0], get_video_template % existing_formats[0])] # Best quality
        elif requested_format == '-1':
            video_url_list = [(f, get_video_template % f) for f in existing_formats] # All formats
        # NOTE(review): 'else:' elided.
        video_url_list = [(requested_format, get_video_template % requested_format)] # Specific format
    elif 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
        self.report_rtmp_download()
        video_url_list = [(None, video_info['conn'][0])]
    # NOTE(review): 'else:' and 'return' elided.
    self._downloader.trouble(u'ERROR: no fmt_url_map or conn information found in video info')

    for format_param, video_real_url in video_url_list:
        # At this point we have a new video
        self._downloader.increment_downloads()

        # Extension
        video_extension = self._video_extensions.get(format_param, 'flv')

        # Find the video URL in fmt_url_map or conn parameters
        # NOTE(review): 'try:' elided.
        # Process video information
        self._downloader.process_info({
            'id': video_id.decode('utf-8'),
            'url': video_real_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'uploaddate': upload_date,
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension.decode('utf-8'),
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': video_thumbnail.decode('utf-8'),
            'description': video_description.decode('utf-8'),
            'player_url': player_url,
        # NOTE(review): closing '})' elided.
        except UnavailableVideoError, err:
            self._downloader.trouble(u'ERROR: unable to download video (format may not be available)')
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    # Group 1 is the video id, group 2 the URL's simplified title.
    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'

    def __init__(self, youtube_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        # Metacafe hosts "yt-<id>" videos that are really on YouTube;
        # those are delegated to this extractor (see _real_extract).
        self._youtube_ie = youtube_ie

    # NOTE(review): the 'def suitable(url):' header (and decorator and
    # docstring) are elided; the next line is that method's body.
        return (re.match(MetacafeIE._VALID_URL, url) is not None)
def report_disclaimer(self):
    """Report disclaimer retrieval."""
    message = u'[metacafe] Retrieving disclaimer'
    self._downloader.to_screen(message)
def report_age_confirmation(self):
    """Report attempt to confirm age."""
    message = u'[metacafe] Confirming age'
    self._downloader.to_screen(message)
def report_download_webpage(self, video_id):
    """Report webpage download."""
    message = u'[metacafe] %s: Downloading webpage' % video_id
    self._downloader.to_screen(message)
def report_extraction(self, video_id):
    """Report information extraction."""
    message = u'[metacafe] %s: Extracting information' % video_id
    self._downloader.to_screen(message)
def _real_initialize(self):
    """Fetch Metacafe's disclaimer page and POST the family-filter form.

    NOTE(review): this excerpt is missing lines ('try:' headers,
    'return's, the form dict delimiters); comments flag the gaps.
    """
    # Retrieve disclaimer
    request = urllib2.Request(self._DISCLAIMER, None, std_headers)
    # NOTE(review): 'try:' elided.
    self.report_disclaimer()
    disclaimer = urllib2.urlopen(request).read()
    except (urllib2.URLError, httplib.HTTPException, socket.error), err:
        self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
        # NOTE(review): 'return' elided.

    # Confirm age
    # NOTE(review): the 'disclaimer_form = {' header and closing '}' are
    # elided; this is the form field:
        'submit': "Continue - I'm over 18",
    request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
    # NOTE(review): 'try:' elided.
    self.report_age_confirmation()
    disclaimer = urllib2.urlopen(request).read()
    except (urllib2.URLError, httplib.HTTPException, socket.error), err:
        self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
        # NOTE(review): 'return' elided.
1032 def _real_extract(self, url):
1033 # Extract id and simplified title from URL
1034 mobj = re.match(self._VALID_URL, url)
1036 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1039 video_id = mobj.group(1)
1041 # Check if video comes from YouTube
1042 mobj2 = re.match(r'^yt-(.*)$', video_id)
1043 if mobj2 is not None:
1044 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1047 # At this point we have a new video
1048 self._downloader.increment_downloads()
1050 simple_title = mobj.group(2).decode('utf-8')
1052 # Retrieve video webpage to extract further information
1053 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1055 self.report_download_webpage(video_id)
1056 webpage = urllib2.urlopen(request).read()
1057 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1058 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1061 # Extract URL, uploader and title from webpage
1062 self.report_extraction(video_id)
1063 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1064 if mobj is not None:
1065 mediaURL = urllib.unquote(mobj.group(1))
1066 video_extension = mediaURL[-3:]
1068 # Extract gdaKey if available
1069 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1071 video_url = mediaURL
1073 gdaKey = mobj.group(1)
1074 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1076 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1078 self._downloader.trouble(u'ERROR: unable to extract media URL')
1080 vardict = parse_qs(mobj.group(1))
1081 if 'mediaData' not in vardict:
1082 self._downloader.trouble(u'ERROR: unable to extract media URL')
1084 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1086 self._downloader.trouble(u'ERROR: unable to extract media URL')
1088 mediaURL = mobj.group(1).replace('\\/', '/')
1089 video_extension = mediaURL[-3:]
1090 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1092 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1094 self._downloader.trouble(u'ERROR: unable to extract title')
1096 video_title = mobj.group(1).decode('utf-8')
1097 video_title = sanitize_title(video_title)
1099 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1101 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1103 video_uploader = mobj.group(1)
1106 # Process video information
1107 self._downloader.process_info({
1108 'id': video_id.decode('utf-8'),
1109 'url': video_url.decode('utf-8'),
1110 'uploader': video_uploader.decode('utf-8'),
1111 'uploaddate': u'NA',
1112 'title': video_title,
1113 'stitle': simple_title,
1114 'ext': video_extension.decode('utf-8'),
1118 except UnavailableVideoError:
1119 self._downloader.trouble(u'ERROR: unable to download video')
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion.

    Extracts the FLV media URL, title and uploader nickname from a
    dailymotion.* /video/ page.
    """
    # NOTE(review): this listing is elided; try/if guards, returns and
    # some method headers around the statements below are not shown.

    # Groups: (1) video id, (2) simplified title from the URL path.
    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

        # Body of the (elided) suitable(url) predicate.
        return (re.match(DailymotionIE._VALID_URL, url) is not None)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_initialize(self):

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # At this point we have a new video
        self._downloader.increment_downloads()
        video_id = mobj.group(1)

        simple_title = mobj.group(2).decode('utf-8')
        # Dailymotion media is always fetched as Flash video here.
        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        # Media URL is passed to the Flash player via addVariable("video", ...).
        mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
        self._downloader.trouble(u'ERROR: unable to extract media URL')

        mediaURL = urllib.unquote(mobj.group(1))

        # if needed add http://www.dailymotion.com/ if relative URL

        video_url = mediaURL

        # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
        mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')

        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)

        mobj = re.search(r'(?im)<div class="dmco_html owner">.*?<a class="name" href="/.+?">(.+?)</a>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract uploader nickname')

        video_uploader = mobj.group(1)

        # Process video information
        self._downloader.process_info({
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'uploaddate': u'NA',
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension.decode('utf-8'),
        except UnavailableVideoError:
            self._downloader.trouble(u'ERROR: unable to download video')
class GoogleIE(InfoExtractor):
    """Information extractor for video.google.com.

    Prefers the MP4 download URL when the page exposes one, falling back
    to the FLV stream URL embedded in the player.  The thumbnail is only
    fetched (via a search-results page) when 'forcethumbnail' is set.
    """
    # NOTE(review): this listing is elided; try/if guards, returns and
    # some method headers around the statements below are not shown.

    _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

        # Body of the (elided) suitable(url) predicate.
        return (re.match(GoogleIE._VALID_URL, url) is not None)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

    def _real_initialize(self):

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        # At this point we have a new video
        self._downloader.increment_downloads()
        video_id = mobj.group(1)

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        # Direct MP4 download URL, when the page offers one.
        mobj = re.search(r"download_url:'([^']+)'", webpage)
        # Fallback: FLV stream URL, hex-escaped inside the player markup.
        video_extension = 'flv'
        mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
        self._downloader.trouble(u'ERROR: unable to extract media URL')

        mediaURL = urllib.unquote(mobj.group(1))
        # Undo the \x3d ('=') and \x26 ('&') escaping.
        mediaURL = mediaURL.replace('\\x3d', '\x3d')
        mediaURL = mediaURL.replace('\\x26', '\x26')

        video_url = mediaURL

        mobj = re.search(r'<title>(.*)</title>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')

        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

        # Extract video description
        mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video description')

        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video thumbnail
        if self._downloader.params.get('forcethumbnail', False):
            # NOTE(review): abs(int(video_id)) — presumably docids can be
            # negative signed integers and the search form wants the
            # magnitude; TODO confirm.
            request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
            webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

            mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')

            video_thumbnail = mobj.group(1)
        else: # we need something to pass to process_info
            video_thumbnail = ''

        # Process video information
        self._downloader.process_info({
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploaddate': u'NA',
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension.decode('utf-8'),
        except UnavailableVideoError:
            self._downloader.trouble(u'ERROR: unable to download video')
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com.

    Extracts the FLV media URL from the page's video_src link, and the
    title and uploader from the page title.
    """
    # NOTE(review): this listing is elided; try/if guards, returns and
    # some method headers around the statements below are not shown.

    # Group (1): the .flv file name from the "current" query parameter.
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

        # Body of the (elided) suitable(url) predicate.
        return (re.match(PhotobucketIE._VALID_URL, url) is not None)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_initialize(self):

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        # At this point we have a new video
        self._downloader.increment_downloads()
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract media URL')

        mediaURL = urllib.unquote(mobj.group(1))

        video_url = mediaURL

        # Title page pattern: "<title> video by <uploader> - Photobucket".
        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')

        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

        video_uploader = mobj.group(2).decode('utf-8')

        # Process video information
        self._downloader.process_info({
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader,
            'uploaddate': u'NA',
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension.decode('utf-8'),
        except UnavailableVideoError:
            self._downloader.trouble(u'ERROR: unable to download video')
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com.

    Non-/watch/ URLs are first rewritten to a canonical
    http://video.yahoo.com/watch/<vid>/<id> URL (one recursive call with
    new_video=False); the media URL itself is then obtained from a
    playlist XML service using the page's id, height and width fields.
    """
    # NOTE(review): this listing is elided; try/if guards, returns and
    # some method headers around the statements below are not shown.

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

        # Body of the (elided) suitable(url) predicate.
        return (re.match(YahooIE._VALID_URL, url) is not None)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_initialize(self):

    def _real_extract(self, url, new_video=True):
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        # At this point we have a new video
        self._downloader.increment_downloads()
        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = urllib2.Request(url)
            webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            self._downloader.trouble(u'ERROR: Unable to extract id field')

            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            self._downloader.trouble(u'ERROR: Unable to extract vid field')

            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            # Recurse once on the canonical /watch/ URL.
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video title')

        video_title = mobj.group(1).decode('utf-8')
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video uploader')

        # NOTE(review): group(1) of this regex captures the literal path
        # segment "people" or "profile"; the uploader name appears to be
        # group(2) — TODO confirm and fix.
        video_uploader = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video thumbnail')

        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video description')

        video_description = mobj.group(1).decode('utf-8')
        if not video_description: video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video height')

        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video width')

        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
        yv_bitrate = '700' # according to Wikipedia this is hard-coded
        request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        self._downloader.trouble(u'ERROR: Unable to extract media URL')

        video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        # Decode HTML entities (e.g. &amp;) left in the stream URL.
        video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)

        # Process video information
        self._downloader.process_info({
            'id': video_id.decode('utf-8'),
            'uploader': video_uploader,
            'uploaddate': u'NA',
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension.decode('utf-8'),
            'thumbnail': video_thumbnail.decode('utf-8'),
            'description': video_description,
            # NOTE(review): duplicate 'thumbnail'/'description' keys —
            # these later entries silently override the ones above
            # (including the .decode('utf-8') on the thumbnail).
            'thumbnail': video_thumbnail,
            'description': video_description,
        except UnavailableVideoError:
            self._downloader.trouble(u'ERROR: unable to download video')
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor.

    Scrapes an arbitrary page for an embedded media URL (JW Player
    flashvars first, then a broader file=/source= search), taking the
    title from <title> and the uploader from the domain name.
    """
    # NOTE(review): this listing is elided; try/if guards, returns and
    # some method headers around the statements below are not shown.

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        # Warn: this extractor is a heuristic fallback.
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def _real_initialize(self):

    def _real_extract(self, url):
        # At this point we have a new video
        self._downloader.increment_downloads()

        # Provisional id: last path component of the page URL.
        video_id = url.split('/')[-1]
        request = urllib2.Request(url)
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        except ValueError, err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        # Broaden the search a little bit
        mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_url = urllib.unquote(mobj.group(1))
        # Final id: media file name without its extension.
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')

        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        self._downloader.trouble(u'ERROR: unable to extract title')

        video_uploader = mobj.group(1).decode('utf-8')

        # Process video information
        self._downloader.process_info({
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader,
            'uploaddate': u'NA',
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension.decode('utf-8'),
        except UnavailableVideoError, err:
            self._downloader.trouble(u'ERROR: unable to download video')
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries.

    Accepts queries of the form "ytsearch[N|all]:<terms>", pages through
    the HTML results, and delegates each found video id to the wrapped
    YoutubeIE.
    """
    # NOTE(review): this listing is elided; try/if guards, returns,
    # loops and some method headers around the statements below are not
    # shown.

    _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
    _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
    # Hard cap on results, matching what the site will actually return.
    _max_youtube_results = 1000

    def __init__(self, youtube_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        self._youtube_ie = youtube_ie

        # Body of the (elided) suitable(url) predicate.
        return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_initialize(self):
        self._youtube_ie.initialize()

    def _real_extract(self, query):
        mobj = re.match(self._VALID_QUERY, query)
        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        # Split "ytsearchN" prefix from the search terms.
        prefix, query = query.split(':')

        query = query.encode('utf-8')

        self._download_n_results(query, 1)

        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)

        self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))

        elif n > self._max_youtube_results:
            # Clamp oversized requests with a warning.
            self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
            n = self._max_youtube_results
        self._download_n_results(query, n)

        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        already_seen = set()

        self.report_download_page(query, pagenum)
        result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
        request = urllib2.Request(result_url, None, std_headers)
        page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            # The match is the full href="..." text; split on '=' and
            # drop the trailing quote to isolate the video id.
            video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
            if video_id not in already_seen:
                video_ids.append(video_id)
                already_seen.add(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for id in video_ids:
                        self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)

        # No "Next" link means this was the last results page.
        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
            for id in video_ids:
                self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)

        pagenum = pagenum + 1
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries.

    Accepts queries of the form "gvsearch[N|all]:<terms>", pages through
    the HTML results, and delegates each found docid to the wrapped
    GoogleIE.
    """
    # NOTE(review): this listing is elided; try/if guards, returns,
    # loops and some method headers around the statements below are not
    # shown.

    _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
    # NOTE(review): pagenum is substituted into 'start=%s'; presumably
    # Google expects a result offset here rather than a page number —
    # TODO confirm.
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
    _MORE_PAGES_INDICATOR = r'<span>Next</span>'
    _max_google_results = 1000

    def __init__(self, google_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        self._google_ie = google_ie

        # Body of the (elided) suitable(url) predicate.
        return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_initialize(self):
        self._google_ie.initialize()

    def _real_extract(self, query):
        mobj = re.match(self._VALID_QUERY, query)
        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        # Split "gvsearchN" prefix from the search terms.
        prefix, query = query.split(':')

        query = query.encode('utf-8')

        self._download_n_results(query, 1)

        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)

        self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))

        elif n > self._max_google_results:
            # Clamp oversized requests with a warning.
            self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
            n = self._max_google_results
        self._download_n_results(query, n)

        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        already_seen = set()

        self.report_download_page(query, pagenum)
        result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
        request = urllib2.Request(result_url, None, std_headers)
        page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            video_id = mobj.group(1)
            if video_id not in already_seen:
                video_ids.append(video_id)
                already_seen.add(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for id in video_ids:
                        self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)

        # No "Next" link means this was the last results page.
        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
            for id in video_ids:
                self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)

        pagenum = pagenum + 1
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries.

    Accepts queries of the form "yvsearch[N|all]:<terms>", pages through
    the HTML results, and delegates each found watch-path to the wrapped
    YahooIE.
    """
    # NOTE(review): this listing is elided; try/if guards, returns,
    # loops and some method headers around the statements below are not
    # shown.

    _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    # Group (1): the "<vid>/<id>" watch path.
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000

    def __init__(self, yahoo_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        self._yahoo_ie = yahoo_ie

        # Body of the (elided) suitable(url) predicate.
        return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_initialize(self):
        self._yahoo_ie.initialize()

    def _real_extract(self, query):
        mobj = re.match(self._VALID_QUERY, query)
        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        # Split "yvsearchN" prefix from the search terms.
        prefix, query = query.split(':')

        query = query.encode('utf-8')

        self._download_n_results(query, 1)

        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)

        self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))

        elif n > self._max_yahoo_results:
            # Clamp oversized requests with a warning.
            self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
            n = self._max_yahoo_results
        self._download_n_results(query, n)

        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        already_seen = set()

        self.report_download_page(query, pagenum)
        result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
        request = urllib2.Request(result_url, None, std_headers)
        page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            video_id = mobj.group(1)
            if video_id not in already_seen:
                video_ids.append(video_id)
                already_seen.add(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for id in video_ids:
                        self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)

        # No "Next" link means this was the last results page.
        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
            for id in video_ids:
                self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)

        pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists.

    Pages through a view_play_list page collecting video ids, applies
    the playliststart/playlistend window, and delegates each id to the
    wrapped YoutubeIE.
    """
    # NOTE(review): this listing is elided; try/if guards, returns,
    # loops and some method headers around the statements below are not
    # shown.

    _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists)\?.*?p=|user/.*?/user/)([^&]+).*'
    _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
    _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'

    def __init__(self, youtube_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        self._youtube_ie = youtube_ie

        # Body of the (elided) suitable(url) predicate.
        return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_initialize(self):
        self._youtube_ie.initialize()

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid url: %s' % url)

        # Download playlist pages
        playlist_id = mobj.group(1)

        self.report_download_page(playlist_id, pagenum)
        request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
        page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers, de-duplicated per page.
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        video_ids.extend(ids_in_page)

        # No "Next" link means this was the last playlist page.
        if re.search(self._MORE_PAGES_INDICATOR, page) is None:

        pagenum = pagenum + 1

        # Apply the user-requested playlist window (1-based start).
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        # NOTE(review): with the default playlistend of -1 this slice
        # excludes the final video; presumably -1 is meant to mean "no
        # limit" — TODO confirm.
        video_ids = video_ids[playliststart:playlistend]

        for id in video_ids:
            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users.

    Fetches a user's gdata feed, collects the video ids, applies the
    playliststart/playlistend window, and delegates each id to the
    wrapped YoutubeIE.
    """
    # NOTE(review): this listing is elided; try/if guards, returns and
    # some method headers around the statements below are not shown.

    _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this.

    def __init__(self, youtube_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        self._youtube_ie = youtube_ie

        # Body of the (elided) suitable(url) predicate.
        return (re.match(YoutubeUserIE._VALID_URL, url) is not None)

    def report_download_page(self, username):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading page ' % (username))

    def _real_initialize(self):
        self._youtube_ie.initialize()

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid url: %s' % url)

        # Download user page
        username = mobj.group(1)

        self.report_download_page(username)
        request = urllib2.Request(self._TEMPLATE_URL % (username), None, std_headers)
        page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers, de-duplicated per page.
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        video_ids.extend(ids_in_page)

        # Apply the user-requested playlist window (1-based start).
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        # NOTE(review): with the default playlistend of -1 this slice
        # excludes the final video; presumably -1 is meant to mean "no
        # limit" — TODO confirm.
        video_ids = video_ids[playliststart:playlistend]

        for id in video_ids:
            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2057 class PostProcessor(object):
2058 """Post Processor class.
2060 PostProcessor objects can be added to downloaders with their
2061 add_post_processor() method. When the downloader has finished a
2062 successful download, it will take its internal chain of PostProcessors
2063 and start calling the run() method on each one of them, first with
2064 an initial argument and then with the returned value of the previous
2067 The chain will be stopped if one of them ever returns None or the end
2068 of the chain is reached.
2070 PostProcessor objects follow a "mutual registration" process similar
2071 to InfoExtractor objects.
2076 def __init__(self, downloader=None):
2077 self._downloader = downloader
2079 def set_downloader(self, downloader):
2080 """Sets the downloader for this PP."""
2081 self._downloader = downloader
2083 def run(self, information):
2084 """Run the PostProcessor.
2086 The "information" argument is a dictionary like the ones
2087 composed by InfoExtractors. The only difference is that this
2088 one has an extra field called "filepath" that points to the
2091 When this method returns None, the postprocessing chain is
2092 stopped. However, this method may return an information
2093 dictionary that will be passed to the next postprocessing
2094 object in the chain. It can be the one it received after
2095 changing some fields.
2097 In addition, this method may raise a PostProcessingError
2098 exception that will be taken into account by the downloader
2101 return information # by default, do nothing
2103 ### MAIN PROGRAM ###
2104 if __name__ == '__main__':
2106 # Modules needed only when running the main program
2110 # Function to update the program file with the latest version from bitbucket.org
2111 def update_self(downloader, filename):
2112 # Note: downloader only used for options
2113 if not os.access (filename, os.W_OK):
2114 sys.exit('ERROR: no write permissions on %s' % filename)
2116 downloader.to_screen('Updating to latest stable version...')
2117 latest_url = 'http://github.com/rg3/youtube-dl/raw/master/LATEST_VERSION'
2118 latest_version = urllib.urlopen(latest_url).read().strip()
2119 prog_url = 'http://github.com/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
2120 newcontent = urllib.urlopen(prog_url).read()
2121 stream = open(filename, 'w')
2122 stream.write(newcontent)
2124 downloader.to_screen('Updated to version %s' % latest_version)
2126 # Parse command line
2127 parser = optparse.OptionParser(
2128 usage='Usage: %prog [options] url...',
2129 version='2010.10.24',
2130 conflict_handler='resolve',
2133 parser.add_option('-h', '--help',
2134 action='help', help='print this help text and exit')
2135 parser.add_option('-v', '--version',
2136 action='version', help='print program version and exit')
2137 parser.add_option('-U', '--update',
2138 action='store_true', dest='update_self', help='update this program to latest stable version')
2139 parser.add_option('-i', '--ignore-errors',
2140 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
2141 parser.add_option('-r', '--rate-limit',
2142 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
2143 parser.add_option('-R', '--retries',
2144 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
2145 parser.add_option('--playlist-start',
2146 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
2147 parser.add_option('--playlist-end',
2148 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
2150 authentication = optparse.OptionGroup(parser, 'Authentication Options')
2151 authentication.add_option('-u', '--username',
2152 dest='username', metavar='USERNAME', help='account username')
2153 authentication.add_option('-p', '--password',
2154 dest='password', metavar='PASSWORD', help='account password')
2155 authentication.add_option('-n', '--netrc',
2156 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
2157 parser.add_option_group(authentication)
2159 video_format = optparse.OptionGroup(parser, 'Video Format Options')
2160 video_format.add_option('-f', '--format',
2161 action='store', dest='format', metavar='FORMAT', help='video format code')
2162 video_format.add_option('-m', '--mobile-version',
2163 action='store_const', dest='format', help='alias for -f 17', const='17')
2164 video_format.add_option('--all-formats',
2165 action='store_const', dest='format', help='download all available video formats', const='-1')
2166 video_format.add_option('--max-quality',
2167 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
2168 video_format.add_option('-b', '--best-quality',
2169 action='store_true', dest='bestquality', help='download the best video quality (DEPRECATED)')
2170 parser.add_option_group(video_format)
2172 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
2173 verbosity.add_option('-q', '--quiet',
2174 action='store_true', dest='quiet', help='activates quiet mode', default=False)
2175 verbosity.add_option('-s', '--simulate',
2176 action='store_true', dest='simulate', help='do not download video', default=False)
2177 verbosity.add_option('-g', '--get-url',
2178 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
2179 verbosity.add_option('-e', '--get-title',
2180 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
2181 verbosity.add_option('--get-thumbnail',
2182 action='store_true', dest='getthumbnail', help='simulate, quiet but print thumbnail URL', default=False)
2183 verbosity.add_option('--get-description',
2184 action='store_true', dest='getdescription', help='simulate, quiet but print video description', default=False)
2185 verbosity.add_option('--no-progress',
2186 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
2187 parser.add_option_group(verbosity)
2189 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
2190 filesystem.add_option('-t', '--title',
2191 action='store_true', dest='usetitle', help='use title in file name', default=False)
2192 filesystem.add_option('-l', '--literal',
2193 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
2194 filesystem.add_option('-A', '--auto-number',
2195 action='store_true', dest='autonumber', help='number downloaded files starting from 00000', default=False)
2196 filesystem.add_option('-o', '--output',
2197 dest='outtmpl', metavar='TEMPLATE', help='output filename template')
2198 filesystem.add_option('-a', '--batch-file',
2199 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
2200 filesystem.add_option('-w', '--no-overwrites',
2201 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
2202 filesystem.add_option('-c', '--continue',
2203 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
2204 filesystem.add_option('--cookies',
2205 dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
2206 parser.add_option_group(filesystem)
2208 (opts, args) = parser.parse_args()
2210 # Open appropriate CookieJar
2211 if opts.cookiefile is None:
2212 jar = cookielib.CookieJar()
2215 jar = cookielib.MozillaCookieJar(opts.cookiefile)
2216 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
2218 except (IOError, OSError), err:
2219 sys.exit(u'ERROR: unable to open cookie file')
2221 # General configuration
2222 cookie_processor = urllib2.HTTPCookieProcessor(jar)
2223 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
2224 urllib2.install_opener(urllib2.build_opener(cookie_processor))
2225 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
2227 # Batch file verification
2229 if opts.batchfile is not None:
2231 if opts.batchfile == '-':
2234 batchfd = open(opts.batchfile, 'r')
2235 batchurls = batchfd.readlines()
2236 batchurls = [x.strip() for x in batchurls]
2237 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
2239 sys.exit(u'ERROR: batch file could not be read')
2240 all_urls = batchurls + args
2242 # Conflicting, missing and erroneous options
2243 if opts.bestquality:
2244 print >>sys.stderr, u'\nWARNING: -b/--best-quality IS DEPRECATED AS IT IS THE DEFAULT BEHAVIOR NOW\n'
2245 if opts.usenetrc and (opts.username is not None or opts.password is not None):
2246 parser.error(u'using .netrc conflicts with giving username/password')
2247 if opts.password is not None and opts.username is None:
2248 parser.error(u'account username missing')
2249 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
2250 parser.error(u'using output template conflicts with using title, literal title or auto number')
2251 if opts.usetitle and opts.useliteral:
2252 parser.error(u'using title conflicts with using literal title')
2253 if opts.username is not None and opts.password is None:
2254 opts.password = getpass.getpass(u'Type account password and press return:')
2255 if opts.ratelimit is not None:
2256 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
2257 if numeric_limit is None:
2258 parser.error(u'invalid rate limit specified')
2259 opts.ratelimit = numeric_limit
2260 if opts.retries is not None:
2262 opts.retries = long(opts.retries)
2263 except (TypeError, ValueError), err:
2264 parser.error(u'invalid retry count specified')
2266 opts.playliststart = long(opts.playliststart)
2267 if opts.playliststart <= 0:
2269 except (TypeError, ValueError), err:
2270 parser.error(u'invalid playlist start number specified')
2272 opts.playlistend = long(opts.playlistend)
2273 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
2275 except (TypeError, ValueError), err:
2276 parser.error(u'invalid playlist end number specified')
2278 # Information extractors
2279 youtube_ie = YoutubeIE()
2280 metacafe_ie = MetacafeIE(youtube_ie)
2281 dailymotion_ie = DailymotionIE()
2282 youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
2283 youtube_user_ie = YoutubeUserIE(youtube_ie)
2284 youtube_search_ie = YoutubeSearchIE(youtube_ie)
2285 google_ie = GoogleIE()
2286 google_search_ie = GoogleSearchIE(google_ie)
2287 photobucket_ie = PhotobucketIE()
2288 yahoo_ie = YahooIE()
2289 yahoo_search_ie = YahooSearchIE(yahoo_ie)
2290 generic_ie = GenericIE()
2293 fd = FileDownloader({
2294 'usenetrc': opts.usenetrc,
2295 'username': opts.username,
2296 'password': opts.password,
2297 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
2298 'forceurl': opts.geturl,
2299 'forcetitle': opts.gettitle,
2300 'forcethumbnail': opts.getthumbnail,
2301 'forcedescription': opts.getdescription,
2302 'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
2303 'format': opts.format,
2304 'format_limit': opts.format_limit,
2305 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
2306 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
2307 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
2308 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
2309 or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
2310 or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
2311 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
2312 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
2313 or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
2314 or u'%(id)s.%(ext)s'),
2315 'ignoreerrors': opts.ignoreerrors,
2316 'ratelimit': opts.ratelimit,
2317 'nooverwrites': opts.nooverwrites,
2318 'retries': opts.retries,
2319 'continuedl': opts.continue_dl,
2320 'noprogress': opts.noprogress,
2321 'playliststart': opts.playliststart,
2322 'playlistend': opts.playlistend,
2323 'logtostderr': opts.outtmpl == '-',
2325 fd.add_info_extractor(youtube_search_ie)
2326 fd.add_info_extractor(youtube_pl_ie)
2327 fd.add_info_extractor(youtube_user_ie)
2328 fd.add_info_extractor(metacafe_ie)
2329 fd.add_info_extractor(dailymotion_ie)
2330 fd.add_info_extractor(youtube_ie)
2331 fd.add_info_extractor(google_ie)
2332 fd.add_info_extractor(google_search_ie)
2333 fd.add_info_extractor(photobucket_ie)
2334 fd.add_info_extractor(yahoo_ie)
2335 fd.add_info_extractor(yahoo_search_ie)
2337 # This must come last since it's the
2338 # fallback if none of the others work
2339 fd.add_info_extractor(generic_ie)
2342 if opts.update_self:
2343 update_self(fd, sys.argv[0])
2346 if len(all_urls) < 1:
2347 if not opts.update_self:
2348 parser.error(u'you must provide at least one URL')
2351 retcode = fd.download(all_urls)
2353 # Dump cookie jar if requested
2354 if opts.cookiefile is not None:
2357 except (IOError, OSError), err:
2358 sys.exit(u'ERROR: unable to save cookie jar')
2362 except DownloadError:
2364 except SameFileError:
2365 sys.exit(u'ERROR: fixed output name but more than one file to download')
2366 except KeyboardInterrupt:
2367 sys.exit(u'\nERROR: Interrupted by user')