2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # Author: Benjamin Johnson
6 # License: Public domain code
24 # parse_qs was moved from the cgi module to the urlparse module recently.
26 from urlparse import parse_qs
28 from cgi import parse_qs
31 'User-Agent': 'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.8) Gecko/20100723 Firefox/3.6.8',
32 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
33 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
34 'Accept-Language': 'en-us,en;q=0.5',
37 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
39 def preferredencoding():
40 """Get preferred encoding.
42 Returns the best encoding scheme for the system, based on
43 locale.getpreferredencoding() and some further tweaks.
45 def yield_preferredencoding():
47 pref = locale.getpreferredencoding()
53 return yield_preferredencoding().next()
55 def htmlentity_transform(matchobj):
56 """Transforms an HTML entity to a Unicode character.
58 This function receives a match object and is intended to be used with
59 the re.sub() function.
61 entity = matchobj.group(1)
63 # Known non-numeric HTML entity
64 if entity in htmlentitydefs.name2codepoint:
65 return unichr(htmlentitydefs.name2codepoint[entity])
68 mobj = re.match(ur'(?u)#(x?\d+)', entity)
70 numstr = mobj.group(1)
71 if numstr.startswith(u'x'):
73 numstr = u'0%s' % numstr
76 return unichr(long(numstr, base))
78 # Unknown entity in name, return its literal representation
79 return (u'&%s;' % entity)
81 def sanitize_title(utitle):
82 """Sanitizes a video title so it could be used as part of a filename."""
83 utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
84 return utitle.replace(unicode(os.sep), u'%')
86 def sanitize_open(filename, open_mode):
87 """Try to open the given filename, and slightly tweak it if this fails.
89 Attempts to open the given filename. If this fails, it tries to change
90 the filename slightly, step by step, until it's either able to open it
91 or it fails and raises a final exception, like the standard open()
94 It returns the tuple (stream, definitive_file_name).
98 if sys.platform == 'win32':
100 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
101 return (sys.stdout, filename)
102 stream = open(filename, open_mode)
103 return (stream, filename)
104 except (IOError, OSError), err:
105 # In case of error, try to remove win32 forbidden chars
106 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
108 # An exception here should be caught in the caller
109 stream = open(filename, open_mode)
110 return (stream, filename)
113 class DownloadError(Exception):
114 """Download Error exception.
116 This exception may be thrown by FileDownloader objects if they are not
117 configured to continue on errors. They will contain the appropriate
122 class SameFileError(Exception):
123 """Same File exception.
125 This exception will be thrown by FileDownloader objects if they detect
126 multiple files would have to be downloaded to the same file on disk.
130 class PostProcessingError(Exception):
131 """Post Processing exception.
133 This exception may be raised by PostProcessor's .run() method to
134 indicate an error in the postprocessing task.
138 class UnavailableVideoError(Exception):
139 """Unavailable Format exception.
141 This exception will be thrown when a video is requested
142 in a format that is not available for that video.
146 class ContentTooShortError(Exception):
147 """Content Too Short exception.
149 This exception may be raised by FileDownloader objects when a file they
150 download is too small for what the server announced first, indicating
151 the connection was probably interrupted.
157 def __init__(self, downloaded, expected):
158 self.downloaded = downloaded
159 self.expected = expected
161 class FileDownloader(object):
162 """File Downloader class.
164 File downloader objects are the ones responsible of downloading the
165 actual video file and writing it to disk if the user has requested
166 it, among some other tasks. In most cases there should be one per
167 program. As, given a video URL, the downloader doesn't know how to
168 extract all the needed information, task that InfoExtractors do, it
169 has to pass the URL to one of them.
171 For this, file downloader objects have a method that allows
172 InfoExtractors to be registered in a given order. When it is passed
173 a URL, the file downloader handles it to the first InfoExtractor it
174 finds that reports being able to handle it. The InfoExtractor extracts
175 all the information about the video or videos the URL refers to, and
176 asks the FileDownloader to process the video information, possibly
177 downloading the video.
179 File downloaders accept a lot of parameters. In order not to saturate
180 the object constructor with arguments, it receives a dictionary of
181 options instead. These options are available through the params
182 attribute for the InfoExtractors to use. The FileDownloader also
183 registers itself as the downloader in charge for the InfoExtractors
184 that are added to it, so this is a "mutual registration".
188 username: Username for authentication purposes.
189 password: Password for authentication purposes.
190 usenetrc: Use netrc for authentication instead.
191 quiet: Do not print messages to stdout.
192 forceurl: Force printing final URL.
193 forcetitle: Force printing title.
194 forcethumbnail: Force printing thumbnail URL.
195 forcedescription: Force printing description.
196 simulate: Do not download the video files.
197 format: Video format code.
198 format_limit: Highest quality format to try.
199 outtmpl: Template for output names.
200 ignoreerrors: Do not stop on download errors.
201 ratelimit: Download speed limit, in bytes/sec.
202 nooverwrites: Prevent overwriting files.
203 retries: Number of times to retry for HTTP error 5xx
204 continuedl: Try to continue downloads if possible.
205 noprogress: Do not print the progress bar.
206 playliststart: Playlist item to start at.
212 _download_retcode = None
213 _num_downloads = None
215 def __init__(self, params):
216 """Create a FileDownloader object with the given options."""
219 self._download_retcode = 0
220 self._num_downloads = 0
224 def pmkdir(filename):
225 """Create directory components in filename. Similar to Unix "mkdir -p"."""
226 components = filename.split(os.sep)
227 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
228 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
229 for dir in aggregate:
230 if not os.path.exists(dir):
234 def format_bytes(bytes):
237 if type(bytes) is str:
242 exponent = long(math.log(bytes, 1024.0))
243 suffix = 'bkMGTPEZY'[exponent]
244 converted = float(bytes) / float(1024**exponent)
245 return '%.2f%s' % (converted, suffix)
248 def calc_percent(byte_counter, data_len):
251 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
254 def calc_eta(start, now, total, current):
258 if current == 0 or dif < 0.001: # One millisecond
260 rate = float(current) / dif
261 eta = long((float(total) - float(current)) / rate)
262 (eta_mins, eta_secs) = divmod(eta, 60)
265 return '%02d:%02d' % (eta_mins, eta_secs)
268 def calc_speed(start, now, bytes):
270 if bytes == 0 or dif < 0.001: # One millisecond
271 return '%10s' % '---b/s'
272 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
275 def best_block_size(elapsed_time, bytes):
276 new_min = max(bytes / 2.0, 1.0)
277 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
278 if elapsed_time < 0.001:
280 rate = bytes / elapsed_time
288 def parse_bytes(bytestr):
289 """Parse a string indicating a byte quantity into a long integer."""
290 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
293 number = float(matchobj.group(1))
294 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
295 return long(round(number * multiplier))
297 def add_info_extractor(self, ie):
298 """Add an InfoExtractor object to the end of the list."""
300 ie.set_downloader(self)
302 def add_post_processor(self, pp):
303 """Add a PostProcessor object to the end of the chain."""
305 pp.set_downloader(self)
307 def to_stdout(self, message, skip_eol=False, ignore_encoding_errors=False):
308 """Print message to stdout if not in quiet mode."""
310 if not self.params.get('quiet', False):
311 print (u'%s%s' % (message, [u'\n', u''][skip_eol])).encode(preferredencoding()),
313 except (UnicodeEncodeError), err:
314 if not ignore_encoding_errors:
317 def to_stderr(self, message):
318 """Print message to stderr."""
319 print >>sys.stderr, message.encode(preferredencoding())
321 def fixed_template(self):
322 """Checks if the output template is fixed."""
323 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
325 def trouble(self, message=None):
326 """Determine action to take when a download problem appears.
328 Depending on if the downloader has been configured to ignore
329 download errors or not, this method may throw an exception or
330 not when errors are found, after printing the message.
332 if message is not None:
333 self.to_stderr(message)
334 if not self.params.get('ignoreerrors', False):
335 raise DownloadError(message)
336 self._download_retcode = 1
338 def slow_down(self, start_time, byte_counter):
339 """Sleep if the download speed is over the rate limit."""
340 rate_limit = self.params.get('ratelimit', None)
341 if rate_limit is None or byte_counter == 0:
344 elapsed = now - start_time
347 speed = float(byte_counter) / elapsed
348 if speed > rate_limit:
349 time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
351 def report_destination(self, filename):
352 """Report destination filename."""
353 self.to_stdout(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
355 def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
356 """Report download progress."""
357 if self.params.get('noprogress', False):
359 self.to_stdout(u'\r[download] %s of %s at %s ETA %s' %
360 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
362 def report_resuming_byte(self, resume_len):
363 """Report attempt to resume at given byte."""
364 self.to_stdout(u'[download] Resuming download at byte %s' % resume_len)
366 def report_retry(self, count, retries):
367 """Report retry in case of HTTP error 5xx"""
368 self.to_stdout(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
370 def report_file_already_downloaded(self, file_name):
371 """Report file has already been fully downloaded."""
373 self.to_stdout(u'[download] %s has already been downloaded' % file_name)
374 except (UnicodeEncodeError), err:
375 self.to_stdout(u'[download] The file has already been downloaded')
377 def report_unable_to_resume(self):
378 """Report it was impossible to resume download."""
379 self.to_stdout(u'[download] Unable to resume')
381 def report_finish(self):
382 """Report download finished."""
383 if self.params.get('noprogress', False):
384 self.to_stdout(u'[download] Download completed')
388 def increment_downloads(self):
389 """Increment the ordinal that assigns a number to each file."""
390 self._num_downloads += 1
392 def process_info(self, info_dict):
393 """Process a single dictionary returned by an InfoExtractor."""
394 # Do nothing else if in simulate mode
395 if self.params.get('simulate', False):
397 if self.params.get('forcetitle', False):
398 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
399 if self.params.get('forceurl', False):
400 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
401 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
402 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
403 if self.params.get('forcedescription', False) and 'description' in info_dict:
404 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
409 template_dict = dict(info_dict)
410 template_dict['epoch'] = unicode(long(time.time()))
411 template_dict['ord'] = unicode('%05d' % self._num_downloads)
412 filename = self.params['outtmpl'] % template_dict
413 except (ValueError, KeyError), err:
414 self.trouble(u'ERROR: invalid system charset or erroneous output template')
416 if self.params.get('nooverwrites', False) and os.path.exists(filename):
417 self.to_stderr(u'WARNING: file exists and will be skipped')
421 self.pmkdir(filename)
422 except (OSError, IOError), err:
423 self.trouble(u'ERROR: unable to create directories: %s' % str(err))
427 success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
428 except (OSError, IOError), err:
429 raise UnavailableVideoError
430 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
431 self.trouble(u'ERROR: unable to download video data: %s' % str(err))
433 except (ContentTooShortError, ), err:
434 self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
439 self.post_process(filename, info_dict)
440 except (PostProcessingError), err:
441 self.trouble(u'ERROR: postprocessing: %s' % str(err))
444 def download(self, url_list):
445 """Download a given list of URLs."""
446 if len(url_list) > 1 and self.fixed_template():
447 raise SameFileError(self.params['outtmpl'])
450 suitable_found = False
452 # Go to next InfoExtractor if not suitable
453 if not ie.suitable(url):
456 # Suitable InfoExtractor found
457 suitable_found = True
459 # Extract information from URL and process it
462 # Suitable InfoExtractor had been found; go to next URL
465 if not suitable_found:
466 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
468 return self._download_retcode
470 def post_process(self, filename, ie_info):
471 """Run the postprocessing chain on the given file."""
473 info['filepath'] = filename
479 def _download_with_rtmpdump(self, filename, url, player_url):
480 self.report_destination(filename)
482 # Check for rtmpdump first
484 subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
485 except (OSError, IOError):
486 self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
489 # Download using rtmpdump. rtmpdump returns exit code 2 when
490 # the connection was interrumpted and resuming appears to be
491 # possible. This is part of rtmpdump's normal usage, AFAIK.
492 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', filename]
493 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
494 while retval == 2 or retval == 1:
495 prevsize = os.path.getsize(filename)
496 self.to_stdout(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
497 time.sleep(5.0) # This seems to be needed
498 retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
499 cursize = os.path.getsize(filename)
500 if prevsize == cursize and retval == 1:
503 self.to_stdout(u'\r[rtmpdump] %s bytes' % os.path.getsize(filename))
506 self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
509 def _do_download(self, filename, url, player_url):
510 # Attempt to download using rtmpdump
511 if url.startswith('rtmp'):
512 return self._download_with_rtmpdump(filename, url, player_url)
516 basic_request = urllib2.Request(url, None, std_headers)
517 request = urllib2.Request(url, None, std_headers)
519 # Establish possible resume length
520 if os.path.isfile(filename):
521 resume_len = os.path.getsize(filename)
525 # Request parameters in case of being able to resume
526 if self.params.get('continuedl', False) and resume_len != 0:
527 self.report_resuming_byte(resume_len)
528 request.add_header('Range','bytes=%d-' % resume_len)
532 retries = self.params.get('retries', 0)
533 while count <= retries:
534 # Establish connection
536 data = urllib2.urlopen(request)
538 except (urllib2.HTTPError, ), err:
539 if (err.code < 500 or err.code >= 600) and err.code != 416:
540 # Unexpected HTTP error
542 elif err.code == 416:
543 # Unable to resume (requested range not satisfiable)
545 # Open the connection again without the range header
546 data = urllib2.urlopen(basic_request)
547 content_length = data.info()['Content-Length']
548 except (urllib2.HTTPError, ), err:
549 if err.code < 500 or err.code >= 600:
552 # Examine the reported length
553 if (content_length is not None and
554 (resume_len - 100 < long(content_length) < resume_len + 100)):
555 # The file had already been fully downloaded.
556 # Explanation to the above condition: in issue #175 it was revealed that
557 # YouTube sometimes adds or removes a few bytes from the end of the file,
558 # changing the file size slightly and causing problems for some users. So
559 # I decided to implement a suggested change and consider the file
560 # completely downloaded if the file size differs less than 100 bytes from
561 # the one in the hard drive.
562 self.report_file_already_downloaded(filename)
565 # The length does not match, we start the download over
566 self.report_unable_to_resume()
572 self.report_retry(count, retries)
575 self.trouble(u'ERROR: giving up after %s retries' % retries)
578 data_len = data.info().get('Content-length', None)
579 data_len_str = self.format_bytes(data_len)
586 data_block = data.read(block_size)
588 data_block_len = len(data_block)
589 if data_block_len == 0:
591 byte_counter += data_block_len
593 # Open file just in time
596 (stream, filename) = sanitize_open(filename, open_mode)
597 self.report_destination(filename)
598 except (OSError, IOError), err:
599 self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
602 stream.write(data_block)
603 except (IOError, OSError), err:
604 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
606 block_size = self.best_block_size(after - before, data_block_len)
609 percent_str = self.calc_percent(byte_counter, data_len)
610 eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
611 speed_str = self.calc_speed(start, time.time(), byte_counter)
612 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
615 self.slow_down(start, byte_counter)
618 if data_len is not None and str(byte_counter) != data_len:
619 raise ContentTooShortError(byte_counter, long(data_len))
622 class InfoExtractor(object):
623 """Information Extractor class.
625 Information extractors are the classes that, given a URL, extract
626 information from the video (or videos) the URL refers to. This
627 information includes the real video URL, the video title and simplified
628 title, author and others. The information is stored in a dictionary
629 which is then passed to the FileDownloader. The FileDownloader
630 processes this information possibly downloading the video to the file
631 system, among other possible outcomes. The dictionaries must include
632 the following fields:
634 id: Video identifier.
635 url: Final video URL.
636 uploader: Nickname of the video uploader.
637 title: Literal title.
638 stitle: Simplified title.
639 ext: Video filename extension.
640 format: Video format.
641 player_url: SWF Player URL (may be None).
643 The following fields are optional. Their primary purpose is to allow
644 youtube-dl to serve as the backend for a video search function, such
645 as the one in youtube2mp3. They are only used when their respective
646 forced printing functions are called:
648 thumbnail: Full URL to a video thumbnail image.
649 description: One-line video description.
651 Subclasses of this one should re-define the _real_initialize() and
652 _real_extract() methods, as well as the suitable() static method.
653 Probably, they should also be instantiated and added to the main
660 def __init__(self, downloader=None):
661 """Constructor. Receives an optional downloader."""
663 self.set_downloader(downloader)
667 """Receives a URL and returns True if suitable for this IE."""
670 def initialize(self):
671 """Initializes an instance (authentication, etc)."""
673 self._real_initialize()
676 def extract(self, url):
677 """Extracts URL information and returns it in list of dicts."""
679 return self._real_extract(url)
681 def set_downloader(self, downloader):
682 """Sets the downloader for this IE."""
683 self._downloader = downloader
685 def _real_initialize(self):
686 """Real initialization process. Redefine in subclasses."""
689 def _real_extract(self, url):
690 """Real extraction process. Redefine in subclasses."""
693 class YoutubeIE(InfoExtractor):
694 """Information extractor for youtube.com."""
696 _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/(?:(?:v/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))))?([0-9A-Za-z_-]+)(?(1).+)?$'
697 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
698 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
699 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
700 _NETRC_MACHINE = 'youtube'
701 # Listed in order of quality
702 _available_formats = ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13']
703 _video_extensions = {
709 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
716 return (re.match(YoutubeIE._VALID_URL, url) is not None)
718 def report_lang(self):
719 """Report attempt to set language."""
720 self._downloader.to_stdout(u'[youtube] Setting language')
722 def report_login(self):
723 """Report attempt to log in."""
724 self._downloader.to_stdout(u'[youtube] Logging in')
726 def report_age_confirmation(self):
727 """Report attempt to confirm age."""
728 self._downloader.to_stdout(u'[youtube] Confirming age')
730 def report_video_webpage_download(self, video_id):
731 """Report attempt to download video webpage."""
732 self._downloader.to_stdout(u'[youtube] %s: Downloading video webpage' % video_id)
734 def report_video_info_webpage_download(self, video_id):
735 """Report attempt to download video info webpage."""
736 self._downloader.to_stdout(u'[youtube] %s: Downloading video info webpage' % video_id)
738 def report_information_extraction(self, video_id):
739 """Report attempt to extract video information."""
740 self._downloader.to_stdout(u'[youtube] %s: Extracting video information' % video_id)
742 def report_unavailable_format(self, video_id, format):
743 """Report extracted video URL."""
744 self._downloader.to_stdout(u'[youtube] %s: Format %s not available' % (video_id, format))
746 def report_rtmp_download(self):
747 """Indicate the download will use the RTMP protocol."""
748 self._downloader.to_stdout(u'[youtube] RTMP download detected')
750 def _real_initialize(self):
751 if self._downloader is None:
756 downloader_params = self._downloader.params
758 # Attempt to use provided username and password or .netrc data
759 if downloader_params.get('username', None) is not None:
760 username = downloader_params['username']
761 password = downloader_params['password']
762 elif downloader_params.get('usenetrc', False):
764 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
769 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
770 except (IOError, netrc.NetrcParseError), err:
771 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
775 request = urllib2.Request(self._LANG_URL, None, std_headers)
778 urllib2.urlopen(request).read()
779 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
780 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
783 # No authentication to be performed
789 'current_form': 'loginForm',
791 'action_login': 'Log In',
792 'username': username,
793 'password': password,
795 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
798 login_results = urllib2.urlopen(request).read()
799 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
800 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
802 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
803 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
809 'action_confirm': 'Confirm',
811 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
813 self.report_age_confirmation()
814 age_results = urllib2.urlopen(request).read()
815 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
816 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
819 def _real_extract(self, url):
820 # Extract video id from URL
821 mobj = re.match(self._VALID_URL, url)
823 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
825 video_id = mobj.group(2)
828 self.report_video_webpage_download(video_id)
829 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en' % video_id, None, std_headers)
831 video_webpage = urllib2.urlopen(request).read()
832 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
833 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
836 # Attempt to extract SWF player URL
837 mobj = re.search(r'swfConfig.*"(http://.*?watch.*?-.*?\.swf)"', video_webpage)
839 player_url = mobj.group(1)
844 self.report_video_info_webpage_download(video_id)
845 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
846 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
847 % (video_id, el_type))
848 request = urllib2.Request(video_info_url, None, std_headers)
850 video_info_webpage = urllib2.urlopen(request).read()
851 video_info = parse_qs(video_info_webpage)
852 if 'token' in video_info:
854 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
855 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
857 if 'token' not in video_info:
858 if 'reason' in video_info:
859 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
861 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
864 # Start extracting information
865 self.report_information_extraction(video_id)
868 if 'author' not in video_info:
869 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
871 video_uploader = urllib.unquote_plus(video_info['author'][0])
874 if 'title' not in video_info:
875 self._downloader.trouble(u'ERROR: unable to extract video title')
877 video_title = urllib.unquote_plus(video_info['title'][0])
878 video_title = video_title.decode('utf-8')
879 video_title = sanitize_title(video_title)
882 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
883 simple_title = simple_title.strip(ur'_')
886 if 'thumbnail_url' not in video_info:
887 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
889 else: # don't panic if we can't find it
890 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
893 video_description = 'No description available.'
894 if self._downloader.params.get('forcedescription', False):
895 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
897 video_description = mobj.group(1)
900 video_token = urllib.unquote_plus(video_info['token'][0])
902 # Decide which formats to download
903 requested_format = self._downloader.params.get('format', None)
904 get_video_template = 'http://www.youtube.com/get_video?video_id=%s&t=%s&eurl=&el=&ps=&asv=&fmt=%%s' % (video_id, video_token)
906 if 'fmt_url_map' in video_info:
907 url_map = dict(tuple(pair.split('|')) for pair in video_info['fmt_url_map'][0].split(','))
908 format_limit = self._downloader.params.get('format_limit', None)
909 if format_limit is not None and format_limit in self._available_formats:
910 format_list = self._available_formats[self._available_formats.index(format_limit):]
912 format_list = self._available_formats
913 existing_formats = [x for x in format_list if x in url_map]
914 if len(existing_formats) == 0:
915 self._downloader.trouble(u'ERROR: no known formats available for video')
917 if requested_format is None:
918 video_url_list = [(existing_formats[0], get_video_template % existing_formats[0])] # Best quality
919 elif requested_format == '-1':
920 video_url_list = [(f, get_video_template % f) for f in existing_formats] # All formats
922 video_url_list = [(requested_format, get_video_template % requested_format)] # Specific format
924 elif 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
925 self.report_rtmp_download()
926 video_url_list = [(None, video_info['conn'][0])]
929 self._downloader.trouble(u'ERROR: no fmt_url_map or conn information found in video info')
932 for format_param, video_real_url in video_url_list:
933 # At this point we have a new video
934 self._downloader.increment_downloads()
937 video_extension = self._video_extensions.get(format_param, 'flv')
939 # Find the video URL in fmt_url_map or conn paramters
941 # Process video information
942 self._downloader.process_info({
943 'id': video_id.decode('utf-8'),
944 'url': video_real_url.decode('utf-8'),
945 'uploader': video_uploader.decode('utf-8'),
946 'title': video_title,
947 'stitle': simple_title,
948 'ext': video_extension.decode('utf-8'),
949 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
950 'thumbnail': video_thumbnail.decode('utf-8'),
951 'description': video_description.decode('utf-8'),
952 'player_url': player_url,
954 except UnavailableVideoError, err:
955 self._downloader.trouble(u'ERROR: unable to download video (format may not be available)')
958 class MetacafeIE(InfoExtractor):
959 """Information Extractor for metacafe.com."""
961 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
962 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
963 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
966 def __init__(self, youtube_ie, downloader=None):
967 InfoExtractor.__init__(self, downloader)
968 self._youtube_ie = youtube_ie
972 return (re.match(MetacafeIE._VALID_URL, url) is not None)
974 def report_disclaimer(self):
975 """Report disclaimer retrieval."""
976 self._downloader.to_stdout(u'[metacafe] Retrieving disclaimer')
978 def report_age_confirmation(self):
979 """Report attempt to confirm age."""
980 self._downloader.to_stdout(u'[metacafe] Confirming age')
982 def report_download_webpage(self, video_id):
983 """Report webpage download."""
984 self._downloader.to_stdout(u'[metacafe] %s: Downloading webpage' % video_id)
986 def report_extraction(self, video_id):
987 """Report information extraction."""
988 self._downloader.to_stdout(u'[metacafe] %s: Extracting information' % video_id)
990 def _real_initialize(self):
991 # Retrieve disclaimer
992 request = urllib2.Request(self._DISCLAIMER, None, std_headers)
994 self.report_disclaimer()
995 disclaimer = urllib2.urlopen(request).read()
996 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
997 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1003 'submit': "Continue - I'm over 18",
1005 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
1007 self.report_age_confirmation()
1008 disclaimer = urllib2.urlopen(request).read()
1009 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1010 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
# Extract a Metacafe video: parse id/title from the URL, delegate 'yt-' ids
# to the YouTube extractor, scrape the watch page for the media URL (two
# scraping strategies), then hand the info dict to the downloader.
# NOTE(review): this listing elides the `try:` lines, `return` statements
# after trouble() calls, and the else/if-None guards around several regex
# matches — do not assume the visible lines are contiguous.
1013 def _real_extract(self, url):
1014 # Extract id and simplified title from URL
1015 mobj = re.match(self._VALID_URL, url)
1017 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1020 video_id = mobj.group(1)
1022 # Check if video comes from YouTube
1023 mobj2 = re.match(r'^yt-(.*)$', video_id)
1024 if mobj2 is not None:
# Delegate re-hosted YouTube videos to the YouTube extractor.
1025 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1028 # At this point we have a new video
1029 self._downloader.increment_downloads()
1031 simple_title = mobj.group(2).decode('utf-8')
1033 # Retrieve video webpage to extract further information
1034 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1036 self.report_download_webpage(video_id)
1037 webpage = urllib2.urlopen(request).read()
1038 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
# NOTE(review): message typo — should read "unable to retrieve".
1039 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1042 # Extract URL, uploader and title from webpage
1043 self.report_extraction(video_id)
# Strategy 1: a plain &mediaURL= query parameter embedded in the page.
1044 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1045 if mobj is not None:
1046 mediaURL = urllib.unquote(mobj.group(1))
# Assumes a three-letter extension (flv/mp4) — TODO confirm.
1047 video_extension = mediaURL[-3:]
1049 # Extract gdaKey if available
1050 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1052 video_url = mediaURL
1054 gdaKey = mobj.group(1)
1055 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
# Strategy 2: parse the flashvars query string for JSON-ish mediaData.
1057 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1059 self._downloader.trouble(u'ERROR: unable to extract media URL')
1061 vardict = parse_qs(mobj.group(1))
1062 if 'mediaData' not in vardict:
1063 self._downloader.trouble(u'ERROR: unable to extract media URL')
1065 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1067 self._downloader.trouble(u'ERROR: unable to extract media URL')
# Unescape JSON-escaped slashes in the media URL.
1069 mediaURL = mobj.group(1).replace('\\/', '/')
1070 video_extension = mediaURL[-3:]
1071 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1073 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1075 self._downloader.trouble(u'ERROR: unable to extract title')
1077 video_title = mobj.group(1).decode('utf-8')
1078 video_title = sanitize_title(video_title)
1080 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1082 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1084 video_uploader = mobj.group(1)
1087 # Process video information
1088 self._downloader.process_info({
1089 'id': video_id.decode('utf-8'),
1090 'url': video_url.decode('utf-8'),
1091 'uploader': video_uploader.decode('utf-8'),
1092 'title': video_title,
1093 'stitle': simple_title,
1094 'ext': video_extension.decode('utf-8'),
1098 except UnavailableVideoError:
1099 self._downloader.trouble(u'ERROR: unable to download video')
# NOTE(review): this listing elides `try:` lines, `return` statements after
# trouble() calls, and the `if mobj is None:` guards — the visible lines are
# not contiguous in the original file.
1102 class DailymotionIE(InfoExtractor):
1103 """Information Extractor for Dailymotion"""
# URL groups: (1) video id, (2) URL-slug used as the simplified title.
1105 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1107 def __init__(self, downloader=None):
1108 InfoExtractor.__init__(self, downloader)
# Tail of suitable(url): the def line is elided in this listing.
1112 return (re.match(DailymotionIE._VALID_URL, url) is not None)
1114 def report_download_webpage(self, video_id):
1115 """Report webpage download."""
1116 self._downloader.to_stdout(u'[dailymotion] %s: Downloading webpage' % video_id)
1118 def report_extraction(self, video_id):
1119 """Report information extraction."""
1120 self._downloader.to_stdout(u'[dailymotion] %s: Extracting information' % video_id)
# No initialization needed for Dailymotion (body elided here).
1122 def _real_initialize(self):
1125 def _real_extract(self, url):
1126 # Extract id and simplified title from URL
1127 mobj = re.match(self._VALID_URL, url)
1129 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1132 # At this point we have a new video
1133 self._downloader.increment_downloads()
1134 video_id = mobj.group(1)
1136 simple_title = mobj.group(2).decode('utf-8')
# Dailymotion streams are assumed to be FLV.
1137 video_extension = 'flv'
1139 # Retrieve video webpage to extract further information
1140 request = urllib2.Request(url)
1142 self.report_download_webpage(video_id)
1143 webpage = urllib2.urlopen(request).read()
1144 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
# NOTE(review): message typo — should read "unable to retrieve".
1145 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1148 # Extract URL, uploader and title from webpage
1149 self.report_extraction(video_id)
# Media URL comes from the addVariable("video", "...") flash call.
1150 mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
1152 self._downloader.trouble(u'ERROR: unable to extract media URL')
1154 mediaURL = urllib.unquote(mobj.group(1))
1156 # if needed add http://www.dailymotion.com/ if relative URL
1158 video_url = mediaURL
1160 # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
1161 mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
1163 self._downloader.trouble(u'ERROR: unable to extract title')
1165 video_title = mobj.group(1).decode('utf-8')
1166 video_title = sanitize_title(video_title)
1168 mobj = re.search(r'(?im)<div class="dmco_html owner">.*?<a class="name" href="/.+?">(.+?)</a>', webpage)
1170 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1172 video_uploader = mobj.group(1)
1175 # Process video information
1176 self._downloader.process_info({
1177 'id': video_id.decode('utf-8'),
1178 'url': video_url.decode('utf-8'),
1179 'uploader': video_uploader.decode('utf-8'),
1180 'title': video_title,
1181 'stitle': simple_title,
1182 'ext': video_extension.decode('utf-8'),
1186 except UnavailableVideoError:
1187 self._downloader.trouble(u'ERROR: unable to download video')
# NOTE(review): this listing elides `try:` lines, `return` statements after
# trouble() calls, and `if mobj is None:` guards throughout this class.
1189 class GoogleIE(InfoExtractor):
1190 """Information extractor for video.google.com."""
# Matches videoplay URLs on all known Google Video country domains;
# group 1 is the docid.
1192 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1194 def __init__(self, downloader=None):
1195 InfoExtractor.__init__(self, downloader)
# Tail of suitable(url): the def line is elided in this listing.
1199 return (re.match(GoogleIE._VALID_URL, url) is not None)
1201 def report_download_webpage(self, video_id):
1202 """Report webpage download."""
1203 self._downloader.to_stdout(u'[video.google] %s: Downloading webpage' % video_id)
1205 def report_extraction(self, video_id):
1206 """Report information extraction."""
1207 self._downloader.to_stdout(u'[video.google] %s: Extracting information' % video_id)
# No initialization needed (body elided here).
1209 def _real_initialize(self):
1212 def _real_extract(self, url):
1213 # Extract id from URL
1214 mobj = re.match(self._VALID_URL, url)
1216 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1219 # At this point we have a new video
1220 self._downloader.increment_downloads()
1221 video_id = mobj.group(1)
# Default: downloadable MP4; falls back to FLV below when no download_url.
1223 video_extension = 'mp4'
1225 # Retrieve video webpage to extract further information
1226 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1228 self.report_download_webpage(video_id)
1229 webpage = urllib2.urlopen(request).read()
1230 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1231 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1234 # Extract URL, uploader, and title from webpage
1235 self.report_extraction(video_id)
1236 mobj = re.search(r"download_url:'([^']+)'", webpage)
# No direct download link: fall back to the flash videoUrl (FLV).
1238 video_extension = 'flv'
1239 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1241 self._downloader.trouble(u'ERROR: unable to extract media URL')
1243 mediaURL = urllib.unquote(mobj.group(1))
# Decode the JS hex escapes for '=' and '&' embedded in the URL.
1244 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1245 mediaURL = mediaURL.replace('\\x26', '\x26')
1247 video_url = mediaURL
1249 mobj = re.search(r'<title>(.*)</title>', webpage)
1251 self._downloader.trouble(u'ERROR: unable to extract title')
1253 video_title = mobj.group(1).decode('utf-8')
1254 video_title = sanitize_title(video_title)
1255 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1257 # Extract video description
1258 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1260 self._downloader.trouble(u'ERROR: unable to extract video description')
1262 video_description = mobj.group(1).decode('utf-8')
1263 if not video_description:
1264 video_description = 'No description available.'
1266 # Extract video thumbnail
1267 if self._downloader.params.get('forcethumbnail', False):
# The thumbnail is only available on the search page, so issue a search
# for this specific video id.
1268 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1270 webpage = urllib2.urlopen(request).read()
1271 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1272 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1274 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1276 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1278 video_thumbnail = mobj.group(1)
1279 else: # we need something to pass to process_info
1280 video_thumbnail = ''
1284 # Process video information
1285 self._downloader.process_info({
1286 'id': video_id.decode('utf-8'),
1287 'url': video_url.decode('utf-8'),
1289 'title': video_title,
1290 'stitle': simple_title,
1291 'ext': video_extension.decode('utf-8'),
1295 except UnavailableVideoError:
1296 self._downloader.trouble(u'ERROR: unable to download video')
# NOTE(review): this listing elides `try:` lines, `return` statements after
# trouble() calls, and `if mobj is None:` guards throughout this class.
1299 class PhotobucketIE(InfoExtractor):
1300 """Information extractor for photobucket.com."""
# Group 1 is the .flv filename from the 'current=' query parameter.
1302 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1304 def __init__(self, downloader=None):
1305 InfoExtractor.__init__(self, downloader)
# Tail of suitable(url): the def line is elided in this listing.
1309 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1311 def report_download_webpage(self, video_id):
1312 """Report webpage download."""
1313 self._downloader.to_stdout(u'[photobucket] %s: Downloading webpage' % video_id)
1315 def report_extraction(self, video_id):
1316 """Report information extraction."""
1317 self._downloader.to_stdout(u'[photobucket] %s: Extracting information' % video_id)
# No initialization needed (body elided here).
1319 def _real_initialize(self):
1322 def _real_extract(self, url):
1323 # Extract id from URL
1324 mobj = re.match(self._VALID_URL, url)
1326 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1329 # At this point we have a new video
1330 self._downloader.increment_downloads()
1331 video_id = mobj.group(1)
1333 video_extension = 'flv'
1335 # Retrieve video webpage to extract further information
1336 request = urllib2.Request(url)
1338 self.report_download_webpage(video_id)
1339 webpage = urllib2.urlopen(request).read()
1340 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1341 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1344 # Extract URL, uploader, and title from webpage
1345 self.report_extraction(video_id)
# Media URL is the 'file=' parameter of the video_src link tag.
1346 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1348 self._downloader.trouble(u'ERROR: unable to extract media URL')
1350 mediaURL = urllib.unquote(mobj.group(1))
1352 video_url = mediaURL
# Title and uploader both come from the page <title>.
1354 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1356 self._downloader.trouble(u'ERROR: unable to extract title')
1358 video_title = mobj.group(1).decode('utf-8')
1359 video_title = sanitize_title(video_title)
1360 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1362 video_uploader = mobj.group(2).decode('utf-8')
1365 # Process video information
1366 self._downloader.process_info({
1367 'id': video_id.decode('utf-8'),
1368 'url': video_url.decode('utf-8'),
1369 'uploader': video_uploader,
1370 'title': video_title,
1371 'stitle': simple_title,
1372 'ext': video_extension.decode('utf-8'),
1376 except UnavailableVideoError:
1377 self._downloader.trouble(u'ERROR: unable to download video')
# NOTE(review): this listing elides `try:` lines, `return` statements after
# trouble() calls, and `if mobj is None:` guards throughout this class.
1380 class YahooIE(InfoExtractor):
1381 """Information extractor for video.yahoo.com."""
1383 # _VALID_URL matches all Yahoo! Video URLs
1384 # _VPAGE_URL matches only the extractable '/watch/' URLs
1385 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1386 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1388 def __init__(self, downloader=None):
1389 InfoExtractor.__init__(self, downloader)
# Tail of suitable(url): the def line is elided in this listing.
1393 return (re.match(YahooIE._VALID_URL, url) is not None)
1395 def report_download_webpage(self, video_id):
1396 """Report webpage download."""
1397 self._downloader.to_stdout(u'[video.yahoo] %s: Downloading webpage' % video_id)
1399 def report_extraction(self, video_id):
1400 """Report information extraction."""
1401 self._downloader.to_stdout(u'[video.yahoo] %s: Extracting information' % video_id)
# No initialization needed (body elided here).
1403 def _real_initialize(self):
1406 def _real_extract(self, url, new_video=True):
1407 # Extract ID from URL
1408 mobj = re.match(self._VALID_URL, url)
1410 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1413 # At this point we have a new video
1414 self._downloader.increment_downloads()
1415 video_id = mobj.group(2)
1416 video_extension = 'flv'
1418 # Rewrite valid but non-extractable URLs as
1419 # extractable English language /watch/ URLs
1420 if re.match(self._VPAGE_URL, url) is None:
1421 request = urllib2.Request(url)
1423 webpage = urllib2.urlopen(request).read()
1424 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1425 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1428 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1430 self._downloader.trouble(u'ERROR: Unable to extract id field')
1432 yahoo_id = mobj.group(1)
1434 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1436 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1438 yahoo_vid = mobj.group(1)
# Recurse once with the canonical /watch/ URL.
1440 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1441 return self._real_extract(url, new_video=False)
1443 # Retrieve video webpage to extract further information
1444 request = urllib2.Request(url)
1446 self.report_download_webpage(video_id)
1447 webpage = urllib2.urlopen(request).read()
1448 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1449 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1452 # Extract uploader and title from webpage
1453 self.report_extraction(video_id)
1454 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1456 self._downloader.trouble(u'ERROR: unable to extract video title')
1458 video_title = mobj.group(1).decode('utf-8')
1459 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1461 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1463 self._downloader.trouble(u'ERROR: unable to extract video uploader')
# NOTE(review): group(1) captures '(people|profile)', not the uploader
# name — the name is in group(2). Likely a bug; verify against full file.
1465 video_uploader = mobj.group(1).decode('utf-8')
1467 # Extract video thumbnail
1468 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1470 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1472 video_thumbnail = mobj.group(1).decode('utf-8')
1474 # Extract video description
1475 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1477 self._downloader.trouble(u'ERROR: unable to extract video description')
1479 video_description = mobj.group(1).decode('utf-8')
1480 if not video_description: video_description = 'No description available.'
1482 # Extract video height and width
1483 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1485 self._downloader.trouble(u'ERROR: unable to extract video height')
1487 yv_video_height = mobj.group(1)
1489 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1491 self._downloader.trouble(u'ERROR: unable to extract video width')
1493 yv_video_width = mobj.group(1)
1495 # Retrieve video playlist to extract media URL
1496 # I'm not completely sure what all these options are, but we
1497 # seem to need most of them, otherwise the server sends a 401.
1498 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1499 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1500 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1501 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1502 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1504 self.report_download_webpage(video_id)
1505 webpage = urllib2.urlopen(request).read()
1506 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1507 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1510 # Extract media URL from playlist XML
1511 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1513 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1515 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1516 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1519 # Process video information
1520 self._downloader.process_info({
1521 'id': video_id.decode('utf-8'),
1523 'uploader': video_uploader,
1524 'title': video_title,
1525 'stitle': simple_title,
1526 'ext': video_extension.decode('utf-8'),
# NOTE(review): 'thumbnail' and 'description' appear twice in this dict
# literal; the later entries (lines 1529-1530) win. The duplicates should
# be removed.
1527 'thumbnail': video_thumbnail.decode('utf-8'),
1528 'description': video_description,
1529 'thumbnail': video_thumbnail,
1530 'description': video_description,
1533 except UnavailableVideoError:
1534 self._downloader.trouble(u'ERROR: unable to download video')
# NOTE(review): this listing elides `try:` lines, `return` statements after
# trouble() calls, and `if mobj is None:` guards throughout this class.
1537 class GenericIE(InfoExtractor):
1538 """Generic last-resort information extractor."""
1540 def __init__(self, downloader=None):
1541 InfoExtractor.__init__(self, downloader)
1547 def report_download_webpage(self, video_id):
1548 """Report webpage download."""
# Warn the user: generic extraction is best-effort only.
1549 self._downloader.to_stdout(u'WARNING: Falling back on generic information extractor.')
1550 self._downloader.to_stdout(u'[generic] %s: Downloading webpage' % video_id)
1552 def report_extraction(self, video_id):
1553 """Report information extraction."""
1554 self._downloader.to_stdout(u'[generic] %s: Extracting information' % video_id)
# No initialization needed (body elided here).
1556 def _real_initialize(self):
1559 def _real_extract(self, url):
1560 # At this point we have a new video
1561 self._downloader.increment_downloads()
# Provisional id from the URL; replaced below once the media URL is known.
1563 video_id = url.split('/')[-1]
1564 request = urllib2.Request(url)
1566 self.report_download_webpage(video_id)
1567 webpage = urllib2.urlopen(request).read()
1568 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1569 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1571 except ValueError, err:
1572 # since this is the last-resort InfoExtractor, if
1573 # this error is thrown, it'll be thrown here
1574 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1577 # Start with something easy: JW Player in SWFObject
1578 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1580 # Broaden the search a little bit
1581 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1583 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1586 # It's possible that one of the regexes
1587 # matched, but returned an empty group:
1588 if mobj.group(1) is None:
1589 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1592 video_url = urllib.unquote(mobj.group(1))
1593 video_id = os.path.basename(video_url)
1595 # here's a fun little line of code for you:
# Derive extension, then strip it from the id.
1596 video_extension = os.path.splitext(video_id)[1][1:]
1597 video_id = os.path.splitext(video_id)[0]
1599 # it's tempting to parse this further, but you would
1600 # have to take into account all the variations like
1601 # Video Title - Site Name
1602 # Site Name | Video Title
1603 # Video Title - Tagline | Site Name
1604 # and so on and so forth; it's just not practical
1605 mobj = re.search(r'<title>(.*)</title>', webpage)
1607 self._downloader.trouble(u'ERROR: unable to extract title')
1609 video_title = mobj.group(1).decode('utf-8')
1610 video_title = sanitize_title(video_title)
1611 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1613 # video uploader is domain name
1614 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
# NOTE(review): error message says "title" but this is the uploader/domain.
1616 self._downloader.trouble(u'ERROR: unable to extract title')
1618 video_uploader = mobj.group(1).decode('utf-8')
1621 # Process video information
1622 self._downloader.process_info({
1623 'id': video_id.decode('utf-8'),
1624 'url': video_url.decode('utf-8'),
1625 'uploader': video_uploader,
1626 'title': video_title,
1627 'stitle': simple_title,
1628 'ext': video_extension.decode('utf-8'),
1632 except UnavailableVideoError, err:
1633 self._downloader.trouble(u'ERROR: unable to download video')
# NOTE(review): this listing elides `try:`, `return`, `if mobj is None:` and
# some loop-header lines (`while True:`, `video_ids = []`, `pagenum = 1`).
1636 class YoutubeSearchIE(InfoExtractor):
1637 """Information Extractor for YouTube search queries."""
# Queries look like 'ytsearch:foo', 'ytsearch5:foo' or 'ytsearchall:foo'.
1638 _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
1639 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
1640 _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
1641 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1643 _max_youtube_results = 1000
1645 def __init__(self, youtube_ie, downloader=None):
1646 InfoExtractor.__init__(self, downloader)
# Individual results are delegated to this YouTube extractor.
1647 self._youtube_ie = youtube_ie
# Tail of suitable(url): the def line is elided in this listing.
1651 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
1653 def report_download_page(self, query, pagenum):
1654 """Report attempt to download playlist page with given number."""
1655 query = query.decode(preferredencoding())
1656 self._downloader.to_stdout(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1658 def _real_initialize(self):
1659 self._youtube_ie.initialize()
1661 def _real_extract(self, query):
1662 mobj = re.match(self._VALID_QUERY, query)
1664 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
# Split 'ytsearchN' prefix from the search terms.
1667 prefix, query = query.split(':')
1669 query = query.encode('utf-8')
# Empty prefix => download a single result.
1671 self._download_n_results(query, 1)
1673 elif prefix == 'all':
1674 self._download_n_results(query, self._max_youtube_results)
# n <= 0 is rejected (the int-parsing lines are elided here).
1680 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1682 elif n > self._max_youtube_results:
1683 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1684 n = self._max_youtube_results
1685 self._download_n_results(query, n)
1687 except ValueError: # parsing prefix as integer fails
1688 self._download_n_results(query, 1)
1691 def _download_n_results(self, query, n):
1692 """Downloads a specified number of results for a query"""
1695 already_seen = set()
1699 self.report_download_page(query, pagenum)
1700 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1701 request = urllib2.Request(result_url, None, std_headers)
1703 page = urllib2.urlopen(request).read()
1704 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1705 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1708 # Extract video identifiers
1709 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
# NOTE(review): fragile slicing — takes the match text, splits on '=' and
# drops the trailing quote to recover the video id.
1710 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
1711 if video_id not in already_seen:
1712 video_ids.append(video_id)
1713 already_seen.add(video_id)
1714 if len(video_ids) == n:
1715 # Specified n videos reached
1716 for id in video_ids:
1717 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
# No "Next" link => last results page: extract what we have.
1720 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1721 for id in video_ids:
1722 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1725 pagenum = pagenum + 1
# NOTE(review): this listing elides `try:`, `return`, `if mobj is None:` and
# some loop-header lines, exactly parallel to YoutubeSearchIE above.
1727 class GoogleSearchIE(InfoExtractor):
1728 """Information Extractor for Google Video search queries."""
1729 _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
1730 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1731 _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
1732 _MORE_PAGES_INDICATOR = r'<span>Next</span>'
1734 _max_google_results = 1000
1736 def __init__(self, google_ie, downloader=None):
1737 InfoExtractor.__init__(self, downloader)
# Individual results are delegated to this GoogleIE instance.
1738 self._google_ie = google_ie
# Tail of suitable(url): the def line is elided in this listing.
1742 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
1744 def report_download_page(self, query, pagenum):
1745 """Report attempt to download playlist page with given number."""
1746 query = query.decode(preferredencoding())
1747 self._downloader.to_stdout(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1749 def _real_initialize(self):
1750 self._google_ie.initialize()
1752 def _real_extract(self, query):
1753 mobj = re.match(self._VALID_QUERY, query)
1755 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1758 prefix, query = query.split(':')
1760 query = query.encode('utf-8')
1762 self._download_n_results(query, 1)
1764 elif prefix == 'all':
1765 self._download_n_results(query, self._max_google_results)
1771 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1773 elif n > self._max_google_results:
1774 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1775 n = self._max_google_results
1776 self._download_n_results(query, n)
1778 except ValueError: # parsing prefix as integer fails
1779 self._download_n_results(query, 1)
1782 def _download_n_results(self, query, n):
1783 """Downloads a specified number of results for a query"""
1786 already_seen = set()
1790 self.report_download_page(query, pagenum)
# NOTE(review): 'start' is fed the raw pagenum — confirm whether Google's
# paging expects an item offset (pagenum * page_size) instead.
1791 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1792 request = urllib2.Request(result_url, None, std_headers)
1794 page = urllib2.urlopen(request).read()
1795 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1796 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1799 # Extract video identifiers
1800 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1801 video_id = mobj.group(1)
1802 if video_id not in already_seen:
1803 video_ids.append(video_id)
1804 already_seen.add(video_id)
1805 if len(video_ids) == n:
1806 # Specified n videos reached
1807 for id in video_ids:
1808 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
# No "Next" link => last results page: extract what we have.
1811 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1812 for id in video_ids:
1813 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1816 pagenum = pagenum + 1
# NOTE(review): this listing elides `try:`, `return`, `if mobj is None:` and
# some loop-header lines, exactly parallel to the other search extractors.
1818 class YahooSearchIE(InfoExtractor):
1819 """Information Extractor for Yahoo! Video search queries."""
1820 _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
1821 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1822 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1823 _MORE_PAGES_INDICATOR = r'\s*Next'
1825 _max_yahoo_results = 1000
1827 def __init__(self, yahoo_ie, downloader=None):
1828 InfoExtractor.__init__(self, downloader)
# Individual results are delegated to this YahooIE instance.
1829 self._yahoo_ie = yahoo_ie
# Tail of suitable(url): the def line is elided in this listing.
1833 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
1835 def report_download_page(self, query, pagenum):
1836 """Report attempt to download playlist page with given number."""
1837 query = query.decode(preferredencoding())
1838 self._downloader.to_stdout(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1840 def _real_initialize(self):
1841 self._yahoo_ie.initialize()
1843 def _real_extract(self, query):
1844 mobj = re.match(self._VALID_QUERY, query)
1846 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1849 prefix, query = query.split(':')
1851 query = query.encode('utf-8')
1853 self._download_n_results(query, 1)
1855 elif prefix == 'all':
1856 self._download_n_results(query, self._max_yahoo_results)
1862 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1864 elif n > self._max_yahoo_results:
1865 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1866 n = self._max_yahoo_results
1867 self._download_n_results(query, n)
1869 except ValueError: # parsing prefix as integer fails
1870 self._download_n_results(query, 1)
1873 def _download_n_results(self, query, n):
1874 """Downloads a specified number of results for a query"""
1877 already_seen = set()
1881 self.report_download_page(query, pagenum)
1882 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1883 request = urllib2.Request(result_url, None, std_headers)
1885 page = urllib2.urlopen(request).read()
1886 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1887 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1890 # Extract video identifiers
1891 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1892 video_id = mobj.group(1)
1893 if video_id not in already_seen:
1894 video_ids.append(video_id)
1895 already_seen.add(video_id)
1896 if len(video_ids) == n:
1897 # Specified n videos reached
1898 for id in video_ids:
1899 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
# No "Next" link => last results page: extract what we have.
1902 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1903 for id in video_ids:
1904 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1907 pagenum = pagenum + 1
# NOTE(review): this listing elides `try:`, `return`, `if mobj is None:` and
# some loop-header lines (`while True:`, `break`, `video_ids = []`).
1909 class YoutubePlaylistIE(InfoExtractor):
1910 """Information Extractor for YouTube playlists."""
# NOTE(review): the dot in 'youtube.com' is unescaped, so it matches any
# character; harmless in practice but should be r'youtube\.com'.
1912 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists)\?.*?p=|user/.*?/user/)([^&]+).*'
1913 _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
1914 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
1915 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1918 def __init__(self, youtube_ie, downloader=None):
1919 InfoExtractor.__init__(self, downloader)
# Individual playlist entries are delegated to this YouTube extractor.
1920 self._youtube_ie = youtube_ie
# Tail of suitable(url): the def line is elided in this listing.
1924 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
1926 def report_download_page(self, playlist_id, pagenum):
1927 """Report attempt to download playlist page with given number."""
1928 self._downloader.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1930 def _real_initialize(self):
1931 self._youtube_ie.initialize()
1933 def _real_extract(self, url):
1934 # Extract playlist id
1935 mobj = re.match(self._VALID_URL, url)
1937 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1940 # Download playlist pages
1941 playlist_id = mobj.group(1)
1946 self.report_download_page(playlist_id, pagenum)
1947 request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
1949 page = urllib2.urlopen(request).read()
1950 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1951 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1954 # Extract video identifiers
1956 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
# Deduplicate ids within the page while preserving order.
1957 if mobj.group(1) not in ids_in_page:
1958 ids_in_page.append(mobj.group(1))
1959 video_ids.extend(ids_in_page)
# No "Next" link => last playlist page.
1961 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1963 pagenum = pagenum + 1
# Honor --playlist-start by skipping the leading entries.
1965 playliststart = self._downloader.params.get('playliststart', 1)
1966 playliststart -= 1 #our arrays are zero-based but the playlist is 1-based
1967 if playliststart > 0:
1968 video_ids = video_ids[playliststart:]
1970 for id in video_ids:
1971 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
# NOTE(review): this listing elides `try:`, `return`, `if mobj is None:` and
# list-initialization lines (`video_ids = []`, `ids_in_page = []`).
1974 class YoutubeUserIE(InfoExtractor):
1975 """Information Extractor for YouTube users."""
# NOTE(review): unescaped dots in 'youtube.com' — should be r'youtube\.com'.
1977 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)'
1978 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
# Only the first GData feed page is fetched — no pagination (see XXX).
1979 _VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this.
1982 def __init__(self, youtube_ie, downloader=None):
1983 InfoExtractor.__init__(self, downloader)
# Individual user uploads are delegated to this YouTube extractor.
1984 self._youtube_ie = youtube_ie
# Tail of suitable(url): the def line is elided in this listing.
1988 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
1990 def report_download_page(self, username):
1991 """Report attempt to download user page."""
1992 self._downloader.to_stdout(u'[youtube] user %s: Downloading page ' % (username))
1994 def _real_initialize(self):
1995 self._youtube_ie.initialize()
1997 def _real_extract(self, url):
1999 mobj = re.match(self._VALID_URL, url)
2001 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2004 # Download user page
2005 username = mobj.group(1)
2009 self.report_download_page(username)
2010 request = urllib2.Request(self._TEMPLATE_URL % (username), None, std_headers)
2012 page = urllib2.urlopen(request).read()
2013 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2014 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2017 # Extract video identifiers
2020 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
# Deduplicate ids within the page while preserving order.
2021 if mobj.group(1) not in ids_in_page:
2022 ids_in_page.append(mobj.group(1))
2023 video_ids.extend(ids_in_page)
# Honor --playlist-start by skipping the leading entries.
2025 playliststart = self._downloader.params.get('playliststart', 1)
2026 playliststart = playliststart-1 #our arrays are zero-based but the playlist is 1-based
2027 if playliststart > 0:
2028 video_ids = video_ids[playliststart:]
2030 for id in video_ids:
2031 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
class PostProcessor(object):
	"""Base class for post-processing steps.

	Instances are attached to a downloader through its
	add_post_processor() method.  After every successful download the
	downloader walks its chain of PostProcessors, calling run() on each
	one: the first call receives an initial information dictionary, and
	each later call receives whatever the previous PostProcessor
	returned.

	Processing stops as soon as a run() call returns None, or once the
	end of the chain is reached.

	Like InfoExtractor objects, PostProcessors take part in a "mutual
	registration" handshake with their downloader.
	"""

	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Associate a downloader object with this PostProcessor."""
		self._downloader = downloader

	def run(self, information):
		"""Execute this post-processing step.

		"information" is a dictionary like the ones InfoExtractors
		build, with one extra field: "filepath", which points at the
		downloaded file.

		Returning None stops the postprocessing chain; returning an
		information dictionary (possibly with some fields changed)
		forwards it to the next PostProcessor in the chain.  The
		method may also raise a PostProcessingError exception, which
		the downloader knows how to handle.
		"""
		# Default implementation: pass the information through untouched.
		return information
2080 ### MAIN PROGRAM ###
2081 if __name__ == '__main__':
2083 # Modules needed only when running the main program
2087 # Function to update the program file with the latest version from bitbucket.org
2088 def update_self(downloader, filename):
2089 # Note: downloader only used for options
2090 if not os.access (filename, os.W_OK):
2091 sys.exit('ERROR: no write permissions on %s' % filename)
2093 downloader.to_stdout('Updating to latest stable version...')
2094 latest_url = 'http://bitbucket.org/rg3/youtube-dl/raw/tip/LATEST_VERSION'
2095 latest_version = urllib.urlopen(latest_url).read().strip()
2096 prog_url = 'http://bitbucket.org/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
2097 newcontent = urllib.urlopen(prog_url).read()
2098 stream = open(filename, 'w')
2099 stream.write(newcontent)
2101 downloader.to_stdout('Updated to version %s' % latest_version)
2103 # Parse command line
2104 parser = optparse.OptionParser(
2105 usage='Usage: %prog [options] url...',
2106 version='2010.10.03',
2107 conflict_handler='resolve',
2110 parser.add_option('-h', '--help',
2111 action='help', help='print this help text and exit')
2112 parser.add_option('-v', '--version',
2113 action='version', help='print program version and exit')
2114 parser.add_option('-U', '--update',
2115 action='store_true', dest='update_self', help='update this program to latest stable version')
2116 parser.add_option('-i', '--ignore-errors',
2117 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
2118 parser.add_option('-r', '--rate-limit',
2119 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
2120 parser.add_option('-R', '--retries',
2121 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
2122 parser.add_option('--playlist-start',
2123 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
2125 authentication = optparse.OptionGroup(parser, 'Authentication Options')
2126 authentication.add_option('-u', '--username',
2127 dest='username', metavar='USERNAME', help='account username')
2128 authentication.add_option('-p', '--password',
2129 dest='password', metavar='PASSWORD', help='account password')
2130 authentication.add_option('-n', '--netrc',
2131 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
2132 parser.add_option_group(authentication)
2134 video_format = optparse.OptionGroup(parser, 'Video Format Options')
2135 video_format.add_option('-f', '--format',
2136 action='store', dest='format', metavar='FORMAT', help='video format code')
2137 video_format.add_option('-m', '--mobile-version',
2138 action='store_const', dest='format', help='alias for -f 17', const='17')
2139 video_format.add_option('--all-formats',
2140 action='store_const', dest='format', help='download all available video formats', const='-1')
2141 video_format.add_option('--max-quality',
2142 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
2143 video_format.add_option('-b', '--best-quality',
2144 action='store_true', dest='bestquality', help='download the best video quality (DEPRECATED)')
2145 parser.add_option_group(video_format)
2147 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
2148 verbosity.add_option('-q', '--quiet',
2149 action='store_true', dest='quiet', help='activates quiet mode', default=False)
2150 verbosity.add_option('-s', '--simulate',
2151 action='store_true', dest='simulate', help='do not download video', default=False)
2152 verbosity.add_option('-g', '--get-url',
2153 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
2154 verbosity.add_option('-e', '--get-title',
2155 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
2156 verbosity.add_option('--get-thumbnail',
2157 action='store_true', dest='getthumbnail', help='simulate, quiet but print thumbnail URL', default=False)
2158 verbosity.add_option('--get-description',
2159 action='store_true', dest='getdescription', help='simulate, quiet but print video description', default=False)
2160 verbosity.add_option('--no-progress',
2161 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
2162 parser.add_option_group(verbosity)
2164 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
2165 filesystem.add_option('-t', '--title',
2166 action='store_true', dest='usetitle', help='use title in file name', default=False)
2167 filesystem.add_option('-l', '--literal',
2168 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
2169 filesystem.add_option('-o', '--output',
2170 dest='outtmpl', metavar='TEMPLATE', help='output filename template')
2171 filesystem.add_option('-a', '--batch-file',
2172 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
2173 filesystem.add_option('-w', '--no-overwrites',
2174 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
2175 filesystem.add_option('-c', '--continue',
2176 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
2177 filesystem.add_option('--cookies',
2178 dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
2179 parser.add_option_group(filesystem)
2181 (opts, args) = parser.parse_args()
2183 # Open appropriate CookieJar
2184 if opts.cookiefile is None:
2185 jar = cookielib.CookieJar()
2188 jar = cookielib.MozillaCookieJar(opts.cookiefile)
2189 except (IOError, OSError), err:
2190 sys.exit(u'ERROR: unable to open cookie file')
2192 # General configuration
2193 cookie_processor = urllib2.HTTPCookieProcessor(jar)
2194 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
2195 urllib2.install_opener(urllib2.build_opener(cookie_processor))
2196 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
2198 # Batch file verification
2200 if opts.batchfile is not None:
2202 if opts.batchfile == '-':
2205 batchfd = open(opts.batchfile, 'r')
2206 batchurls = batchfd.readlines()
2207 batchurls = [x.strip() for x in batchurls]
2208 batchurls = [x for x in batchurls if len(x) > 0]
2210 sys.exit(u'ERROR: batch file could not be read')
2211 all_urls = batchurls + args
2213 # Conflicting, missing and erroneous options
2214 if opts.bestquality:
2215 print >>sys.stderr, u'\nWARNING: -b/--best-quality IS DEPRECATED AS IT IS THE DEFAULT BEHAVIOR NOW\n'
2216 if opts.usenetrc and (opts.username is not None or opts.password is not None):
2217 parser.error(u'using .netrc conflicts with giving username/password')
2218 if opts.password is not None and opts.username is None:
2219 parser.error(u'account username missing')
2220 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
2221 parser.error(u'using output template conflicts with using title or literal title')
2222 if opts.usetitle and opts.useliteral:
2223 parser.error(u'using title conflicts with using literal title')
2224 if opts.username is not None and opts.password is None:
2225 opts.password = getpass.getpass(u'Type account password and press return:')
2226 if opts.ratelimit is not None:
2227 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
2228 if numeric_limit is None:
2229 parser.error(u'invalid rate limit specified')
2230 opts.ratelimit = numeric_limit
2231 if opts.retries is not None:
2233 opts.retries = long(opts.retries)
2234 except (TypeError, ValueError), err:
2235 parser.error(u'invalid retry count specified')
2236 if opts.playliststart is not None:
2238 opts.playliststart = long(opts.playliststart)
2239 except (TypeError, ValueError), err:
2240 parser.error(u'invalid playlist page specified')
2242 # Information extractors
2243 youtube_ie = YoutubeIE()
2244 metacafe_ie = MetacafeIE(youtube_ie)
2245 dailymotion_ie = DailymotionIE()
2246 youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
2247 youtube_user_ie = YoutubeUserIE(youtube_ie)
2248 youtube_search_ie = YoutubeSearchIE(youtube_ie)
2249 google_ie = GoogleIE()
2250 google_search_ie = GoogleSearchIE(google_ie)
2251 photobucket_ie = PhotobucketIE()
2252 yahoo_ie = YahooIE()
2253 yahoo_search_ie = YahooSearchIE(yahoo_ie)
2254 generic_ie = GenericIE()
2257 fd = FileDownloader({
2258 'usenetrc': opts.usenetrc,
2259 'username': opts.username,
2260 'password': opts.password,
2261 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
2262 'forceurl': opts.geturl,
2263 'forcetitle': opts.gettitle,
2264 'forcethumbnail': opts.getthumbnail,
2265 'forcedescription': opts.getdescription,
2266 'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
2267 'format': opts.format,
2268 'format_limit': opts.format_limit,
2269 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
2270 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
2271 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
2272 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
2273 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
2274 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
2275 or u'%(id)s.%(ext)s'),
2276 'ignoreerrors': opts.ignoreerrors,
2277 'ratelimit': opts.ratelimit,
2278 'nooverwrites': opts.nooverwrites,
2279 'retries': opts.retries,
2280 'continuedl': opts.continue_dl,
2281 'noprogress': opts.noprogress,
2282 'playliststart': opts.playliststart,
2284 fd.add_info_extractor(youtube_search_ie)
2285 fd.add_info_extractor(youtube_pl_ie)
2286 fd.add_info_extractor(youtube_user_ie)
2287 fd.add_info_extractor(metacafe_ie)
2288 fd.add_info_extractor(dailymotion_ie)
2289 fd.add_info_extractor(youtube_ie)
2290 fd.add_info_extractor(google_ie)
2291 fd.add_info_extractor(google_search_ie)
2292 fd.add_info_extractor(photobucket_ie)
2293 fd.add_info_extractor(yahoo_ie)
2294 fd.add_info_extractor(yahoo_search_ie)
2296 # This must come last since it's the
2297 # fallback if none of the others work
2298 fd.add_info_extractor(generic_ie)
2301 if opts.update_self:
2302 update_self(fd, sys.argv[0])
2305 if len(all_urls) < 1:
2306 if not opts.update_self:
2307 parser.error(u'you must provide at least one URL')
2310 retcode = fd.download(all_urls)
2312 # Dump cookie jar if requested
2313 if opts.cookiefile is not None:
2316 except (IOError, OSError), err:
2317 sys.exit(u'ERROR: unable to save cookie jar')
2321 except DownloadError:
2323 except SameFileError:
2324 sys.exit(u'ERROR: fixed output name but more than one file to download')
2325 except KeyboardInterrupt:
2326 sys.exit(u'\nERROR: Interrupted by user')