2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # Author: Benjamin Johnson
6 # License: Public domain code
25 # parse_qs was moved from the cgi module to the urlparse module recently.
27 from urlparse import parse_qs
29 from cgi import parse_qs
32 'User-Agent': 'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.12) Gecko/20101028 Firefox/3.6.12',
33 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
34 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
35 'Accept-Language': 'en-us,en;q=0.5',
# Characters permitted in "simple" titles: ASCII letters and digits, decoded
# to unicode so they can be embedded in a (?u) regex character class.
38 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
40 def preferredencoding():
41 """Get preferred encoding.
43 Returns the best encoding scheme for the system, based on
44 locale.getpreferredencoding() and some further tweaks.
# NOTE(review): the locale lookup is wrapped in a nested generator;
# the elided lines presumably guard it with try/except and yield a
# fallback encoding on failure -- confirm against the full source.
46 def yield_preferredencoding():
48 pref = locale.getpreferredencoding()
# Python 2 iterator protocol: .next() pulls the single yielded value.
54 return yield_preferredencoding().next()
56 def htmlentity_transform(matchobj):
57 """Transforms an HTML entity to a Unicode character.
59 This function receives a match object and is intended to be used with
60 the re.sub() function.
62 entity = matchobj.group(1)
64 # Known non-numeric HTML entity
65 if entity in htmlentitydefs.name2codepoint:
66 return unichr(htmlentitydefs.name2codepoint[entity])
# Numeric entity: decimal (&#160;) or hexadecimal (&#x41;) character code.
69 mobj = re.match(ur'(?u)#(x?\d+)', entity)
71 numstr = mobj.group(1)
72 if numstr.startswith(u'x'):
# Prefix with u'0' so e.g. u'x41' becomes u'0x41', parseable by long();
# the matching `base` assignment lives on an elided line.
74 numstr = u'0%s' % numstr
77 return unichr(long(numstr, base))
79 # Unknown entity in name, return its literal representation
80 return (u'&%s;' % entity)
82 def sanitize_title(utitle):
83 """Sanitizes a video title so it could be used as part of a filename."""
84 utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
85 return utitle.replace(unicode(os.sep), u'%')
87 def sanitize_open(filename, open_mode):
88 """Try to open the given filename, and slightly tweak it if this fails.
90 Attempts to open the given filename. If this fails, it tries to change
91 the filename slightly, step by step, until it's either able to open it
92 or it fails and raises a final exception, like the standard open()
95 It returns the tuple (stream, definitive_file_name).
# NOTE(review): the elided lines appear to special-case a stdout filename
# before this Windows branch -- confirm against the full source.
99 if sys.platform == 'win32':
# Switch stdout to binary mode so video bytes are not newline-mangled.
101 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
102 return (sys.stdout, filename)
103 stream = open(filename, open_mode)
104 return (stream, filename)
105 except (IOError, OSError), err:
106 # In case of error, try to remove win32 forbidden chars
107 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
109 # An exception here should be caught in the caller
110 stream = open(filename, open_mode)
111 return (stream, filename)
# Raised by FileDownloader.trouble() when 'ignoreerrors' is not set.
114 class DownloadError(Exception):
115 """Download Error exception.
117 This exception may be thrown by FileDownloader objects if they are not
118 configured to continue on errors. They will contain the appropriate
# Raised by FileDownloader.download() when several URLs would all be
# written to one fixed (placeholder-free) output template.
123 class SameFileError(Exception):
124 """Same File exception.
126 This exception will be thrown by FileDownloader objects if they detect
127 multiple files would have to be downloaded to the same file on disk.
# Raised from a PostProcessor's run(); caught in FileDownloader.process_info().
131 class PostProcessingError(Exception):
132 """Post Processing exception.
134 This exception may be raised by PostProcessor's .run() method to
135 indicate an error in the postprocessing task.
# Raised when the requested format cannot be downloaded; see process_info().
139 class UnavailableVideoError(Exception):
140 """Unavailable Format exception.
142 This exception will be thrown when a video is requested
143 in a format that is not available for that video.
147 class ContentTooShortError(Exception):
148 """Content Too Short exception.
150 This exception may be raised by FileDownloader objects when a file they
151 download is too small for what the server announced first, indicating
152 the connection was probably interrupted.
# downloaded: number of bytes actually received.
# expected: the Content-Length announced by the server.
158 def __init__(self, downloaded, expected):
159 self.downloaded = downloaded
160 self.expected = expected
162 class FileDownloader(object):
163 """File Downloader class.
165 File downloader objects are the ones responsible of downloading the
166 actual video file and writing it to disk if the user has requested
167 it, among some other tasks. In most cases there should be one per
168 program. As, given a video URL, the downloader doesn't know how to
169 extract all the needed information, task that InfoExtractors do, it
170 has to pass the URL to one of them.
172 For this, file downloader objects have a method that allows
173 InfoExtractors to be registered in a given order. When it is passed
174 a URL, the file downloader handles it to the first InfoExtractor it
175 finds that reports being able to handle it. The InfoExtractor extracts
176 all the information about the video or videos the URL refers to, and
177 asks the FileDownloader to process the video information, possibly
178 downloading the video.
180 File downloaders accept a lot of parameters. In order not to saturate
181 the object constructor with arguments, it receives a dictionary of
182 options instead. These options are available through the params
183 attribute for the InfoExtractors to use. The FileDownloader also
184 registers itself as the downloader in charge for the InfoExtractors
185 that are added to it, so this is a "mutual registration".
189 username: Username for authentication purposes.
190 password: Password for authentication purposes.
191 usenetrc: Use netrc for authentication instead.
192 quiet: Do not print messages to stdout.
193 forceurl: Force printing final URL.
194 forcetitle: Force printing title.
195 forcethumbnail: Force printing thumbnail URL.
196 forcedescription: Force printing description.
197 simulate: Do not download the video files.
198 format: Video format code.
199 format_limit: Highest quality format to try.
200 outtmpl: Template for output names.
201 ignoreerrors: Do not stop on download errors.
202 ratelimit: Download speed limit, in bytes/sec.
203 nooverwrites: Prevent overwriting files.
204 retries: Number of times to retry for HTTP error 5xx
205 continuedl: Try to continue downloads if possible.
206 noprogress: Do not print the progress bar.
207 playliststart: Playlist item to start at.
208 playlistend: Playlist item to end at.
209 logtostderr: Log messages to stderr instead of stdout.
# Class-level defaults; the real values are set per instance in __init__.
215 _download_retcode = None
216 _num_downloads = None
219 def __init__(self, params):
220 """Create a FileDownloader object with the given options."""
# (elided lines initialise the InfoExtractor/PostProcessor lists and
# store the params dict on self)
223 self._download_retcode = 0
224 self._num_downloads = 0
# Index a two-element list with a boolean: False -> stdout, True -> stderr.
225 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
229 def pmkdir(filename):
230 """Create directory components in filename. Similar to Unix "mkdir -p"."""
231 components = filename.split(os.sep)
# Build every ancestor path: for 'a/b/c.ext' this yields 'a' and 'a/b'.
232 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
233 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
234 for dir in aggregate:
235 if not os.path.exists(dir):
# (the directory creation call is on an elided line)
def temp_name(filename):
	"""Return the temporary ('.part') file name used while downloading filename."""
	suffix = '.part'
	return filename + suffix
244 def format_bytes(bytes):
# (elided lines handle the None and zero-byte cases)
247 if type(bytes) is str:
# HTTP headers deliver Content-Length as a string; convert before math.
# Pick the largest power-of-1024 unit that fits.
252 exponent = long(math.log(bytes, 1024.0))
253 suffix = 'bkMGTPEZY'[exponent]
254 converted = float(bytes) / float(1024**exponent)
255 return '%.2f%s' % (converted, suffix)
258 def calc_percent(byte_counter, data_len):
# Right-align e.g. ' 42.3%' in a six-character field; the elided lines
# presumably return a placeholder when data_len is unknown -- confirm.
261 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
264 def calc_eta(start, now, total, current):
# `dif` (elapsed wall-clock time) is computed on an elided line.
268 if current == 0 or dif < 0.001: # One millisecond
# (a placeholder ETA string is returned on an elided line)
270 rate = float(current) / dif
271 eta = long((float(total) - float(current)) / rate)
272 (eta_mins, eta_secs) = divmod(eta, 60)
# (an elided line presumably caps unrealistically large ETAs -- confirm)
275 return '%02d:%02d' % (eta_mins, eta_secs)
278 def calc_speed(start, now, bytes):
# `dif` (elapsed wall-clock time) is computed on an elided line.
280 if bytes == 0 or dif < 0.001: # One millisecond
281 return '%10s' % '---b/s'
# Reuse format_bytes so the speed carries the same unit suffixes.
282 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
285 def best_block_size(elapsed_time, bytes):
# Clamp the next read size between half and double the last block,
# never exceeding 4 MB.
286 new_min = max(bytes / 2.0, 1.0)
287 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
288 if elapsed_time < 0.001:
290 rate = bytes / elapsed_time
# (elided lines choose a size derived from `rate`, bounded by
# new_min/new_max)
298 def parse_bytes(bytestr):
299 """Parse a string indicating a byte quantity into a long integer."""
300 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
# (the elided lines handle a non-matching string)
303 number = float(matchobj.group(1))
# An empty suffix yields str.index('') == 0, i.e. multiplier 1 (plain bytes).
304 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
305 return long(round(number * multiplier))
307 def add_info_extractor(self, ie):
308 """Add an InfoExtractor object to the end of the list."""
# (the append to the internal list happens on an elided line)
# Mutual registration: the IE gets a back-reference to this downloader.
310 ie.set_downloader(self)
312 def add_post_processor(self, pp):
313 """Add a PostProcessor object to the end of the chain."""
# (the append to the internal chain happens on an elided line)
# Mutual registration: the PP gets a back-reference to this downloader.
315 pp.set_downloader(self)
317 def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
318 """Print message to stdout if not in quiet mode."""
320 if not self.params.get('quiet', False):
# Select the terminator by indexing with the skip_eol boolean.
321 terminator = [u'\n', u''][skip_eol]
# The trailing comma suppresses print's own newline; flushing keeps
# progress lines responsive.
322 print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
323 self._screen_file.flush()
324 except (UnicodeEncodeError), err:
325 if not ignore_encoding_errors:
# (the encoding error is re-raised on an elided line)
def to_stderr(self, message):
	"""Write message to standard error, encoded for the current locale."""
	encoded = message.encode(preferredencoding())
	print >>sys.stderr, encoded
332 def fixed_template(self):
333 """Checks if the output template is fixed."""
334 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
336 def trouble(self, message=None):
337 """Determine action to take when a download problem appears.
339 Depending on if the downloader has been configured to ignore
340 download errors or not, this method may throw an exception or
341 not when errors are found, after printing the message.
343 if message is not None:
344 self.to_stderr(message)
345 if not self.params.get('ignoreerrors', False):
# Abort: the raise means the retcode assignment below never runs here.
346 raise DownloadError(message)
347 self._download_retcode = 1
349 def slow_down(self, start_time, byte_counter):
350 """Sleep if the download speed is over the rate limit."""
351 rate_limit = self.params.get('ratelimit', None)
352 if rate_limit is None or byte_counter == 0:
# (an early return sits on an elided line; `now` is set there too)
355 elapsed = now - start_time
# (an elided guard presumably skips division for tiny elapsed times)
358 speed = float(byte_counter) / elapsed
359 if speed > rate_limit:
# Sleep just long enough for the average speed to drop to the limit.
360 time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
362 def try_rename(self, old_filename, new_filename):
# Promote the completed '.part' file to its final name; failures are
# routed through trouble() instead of propagating.
364 os.rename(old_filename, new_filename)
365 except (IOError, OSError), err:
366 self.trouble(u'ERROR: unable to rename file')
def report_destination(self, filename):
	"""Announce the output filename on the screen."""
	message = u'[download] Destination: %s' % filename
	self.to_screen(message, ignore_encoding_errors=True)
372 def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
373 """Report download progress."""
374 if self.params.get('noprogress', False):
# (returns early on an elided line)
# \r rewrites the same terminal line; skip_eol keeps the cursor on it.
376 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
377 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
def report_resuming_byte(self, resume_len):
	"""Announce that the download resumes at the given byte offset."""
	message = u'[download] Resuming download at byte %s' % resume_len
	self.to_screen(message)
def report_retry(self, count, retries):
	"""Announce a retry after a server-side (5xx) HTTP error."""
	message = u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries)
	self.to_screen(message)
387 def report_file_already_downloaded(self, file_name):
388 """Report file has already been fully downloaded."""
390 self.to_screen(u'[download] %s has already been downloaded' % file_name)
391 except (UnicodeEncodeError), err:
# Fall back to a message without the filename when it cannot be encoded.
392 self.to_screen(u'[download] The file has already been downloaded')
def report_unable_to_resume(self):
	"""Announce that resuming the download was not possible."""
	message = u'[download] Unable to resume'
	self.to_screen(message)
398 def report_finish(self):
399 """Report download finished."""
# With the progress bar disabled a full completion line is printed.
400 if self.params.get('noprogress', False):
401 self.to_screen(u'[download] Download completed')
# (the elided else-branch finishes the in-place progress-bar line)
def increment_downloads(self):
	"""Advance the ordinal used to number downloaded files."""
	self._num_downloads = self._num_downloads + 1
409 def process_info(self, info_dict):
410 """Process a single dictionary returned by an InfoExtractor."""
411 # Do nothing else if in simulate mode
412 if self.params.get('simulate', False):
# Forced printing uses raw `print` (not to_screen) so output can be piped.
414 if self.params.get('forcetitle', False):
415 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
416 if self.params.get('forceurl', False):
417 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
418 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
419 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
420 if self.params.get('forcedescription', False) and 'description' in info_dict:
421 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
# Build the output filename by %-formatting the user's template with
# the video info plus the synthetic 'epoch'/'autonumber' fields.
426 template_dict = dict(info_dict)
427 template_dict['epoch'] = unicode(long(time.time()))
428 template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
429 filename = self.params['outtmpl'] % template_dict
430 except (ValueError, KeyError), err:
431 self.trouble(u'ERROR: invalid system charset or erroneous output template')
433 if self.params.get('nooverwrites', False) and os.path.exists(filename):
434 self.to_stderr(u'WARNING: file exists and will be skipped')
438 self.pmkdir(filename)
439 except (OSError, IOError), err:
440 self.trouble(u'ERROR: unable to create directories: %s' % str(err))
# The URL is encoded to UTF-8 bytes before being handed to urllib2.
444 success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
445 except (OSError, IOError), err:
446 raise UnavailableVideoError
447 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
448 self.trouble(u'ERROR: unable to download video data: %s' % str(err))
450 except (ContentTooShortError, ), err:
451 self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
# Post-processing runs only after a download attempt.
456 self.post_process(filename, info_dict)
457 except (PostProcessingError), err:
458 self.trouble(u'ERROR: postprocessing: %s' % str(err))
461 def download(self, url_list):
462 """Download a given list of URLs."""
# A fixed (placeholder-free) template cannot hold more than one file.
463 if len(url_list) > 1 and self.fixed_template():
464 raise SameFileError(self.params['outtmpl'])
# (elided: the loops over url_list and the registered InfoExtractors)
467 suitable_found = False
469 # Go to next InfoExtractor if not suitable
470 if not ie.suitable(url):
473 # Suitable InfoExtractor found
474 suitable_found = True
476 # Extract information from URL and process it
479 # Suitable InfoExtractor had been found; go to next URL
482 if not suitable_found:
483 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
485 return self._download_retcode
487 def post_process(self, filename, ie_info):
488 """Run the postprocessing chain on the given file."""
# (elided: `info` is derived from ie_info and each registered
# PostProcessor is invoked in turn)
490 info['filepath'] = filename
496 def _download_with_rtmpdump(self, filename, url, player_url):
497 self.report_destination(filename)
498 tmpfilename = self.temp_name(filename)
500 # Check for rtmpdump first
# Probe with '-h'; all output is discarded into the null device.
502 subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
503 except (OSError, IOError):
504 self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
507 # Download using rtmpdump. rtmpdump returns exit code 2 when
508 # the connection was interrumpted and resuming appears to be
509 # possible. This is part of rtmpdump's normal usage, AFAIK.
# Boolean-indexed lists splice in optional arguments: -W only with a
# player URL, -e/-k only when continuing a previous download.
510 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
511 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
512 while retval == 2 or retval == 1:
513 prevsize = os.path.getsize(tmpfilename)
514 self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
515 time.sleep(5.0) # This seems to be needed
516 retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
517 cursize = os.path.getsize(tmpfilename)
# No growth between retries means rtmpdump has nothing left to fetch.
518 if prevsize == cursize and retval == 1:
# (elided: the loop is left, treating this as success)
521 self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
522 self.try_rename(tmpfilename, filename)
# (elided: a success value is returned before this error path)
525 self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
528 def _do_download(self, filename, url, player_url):
529 # Check file already present
530 if self.params.get('continuedl', False) and os.path.isfile(filename):
531 self.report_file_already_downloaded(filename)
# (elided: an early success return)
534 # Attempt to download using rtmpdump
535 if url.startswith('rtmp'):
536 return self._download_with_rtmpdump(filename, url, player_url)
538 tmpfilename = self.temp_name(filename)
# basic_request stays Range-free so a failed resume can restart cleanly.
541 basic_request = urllib2.Request(url, None, std_headers)
542 request = urllib2.Request(url, None, std_headers)
544 # Establish possible resume length
545 if os.path.isfile(tmpfilename):
546 resume_len = os.path.getsize(tmpfilename)
550 # Request parameters in case of being able to resume
551 if self.params.get('continuedl', False) and resume_len != 0:
552 self.report_resuming_byte(resume_len)
553 request.add_header('Range','bytes=%d-' % resume_len)
557 retries = self.params.get('retries', 0)
558 while count <= retries:
559 # Establish connection
561 data = urllib2.urlopen(request)
563 except (urllib2.HTTPError, ), err:
# Retry only 5xx; 416 means our Range start was past the end of file.
564 if (err.code < 500 or err.code >= 600) and err.code != 416:
565 # Unexpected HTTP error
567 elif err.code == 416:
568 # Unable to resume (requested range not satisfiable)
570 # Open the connection again without the range header
571 data = urllib2.urlopen(basic_request)
572 content_length = data.info()['Content-Length']
573 except (urllib2.HTTPError, ), err:
574 if err.code < 500 or err.code >= 600:
577 # Examine the reported length
578 if (content_length is not None and
579 (resume_len - 100 < long(content_length) < resume_len + 100)):
580 # The file had already been fully downloaded.
581 # Explanation to the above condition: in issue #175 it was revealed that
582 # YouTube sometimes adds or removes a few bytes from the end of the file,
583 # changing the file size slightly and causing problems for some users. So
584 # I decided to implement a suggested change and consider the file
585 # completely downloaded if the file size differs less than 100 bytes from
586 # the one in the hard drive.
587 self.report_file_already_downloaded(filename)
588 self.try_rename(tmpfilename, filename)
# (elided: an early success return)
591 # The length does not match, we start the download over
592 self.report_unable_to_resume()
598 self.report_retry(count, retries)
601 self.trouble(u'ERROR: giving up after %s retries' % retries)
# Content-length may be absent (e.g. chunked transfer); note data_len
# remains a header *string* here.
604 data_len = data.info().get('Content-length', None)
605 data_len_str = self.format_bytes(data_len)
# (elided: byte_counter/block_size initialisation and the loop header)
612 data_block = data.read(block_size)
614 data_block_len = len(data_block)
615 if data_block_len == 0:
617 byte_counter += data_block_len
619 # Open file just in time
622 (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
623 self.report_destination(filename)
624 except (OSError, IOError), err:
625 self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
628 stream.write(data_block)
629 except (IOError, OSError), err:
630 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
# Adapt the next read size to the measured throughput.
632 block_size = self.best_block_size(after - before, data_block_len)
635 percent_str = self.calc_percent(byte_counter, data_len)
636 eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
637 speed_str = self.calc_speed(start, time.time(), byte_counter)
638 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
# Honour the user's rate limit, if any.
641 self.slow_down(start, byte_counter)
# String comparison: data_len is the raw Content-length header value.
644 if data_len is not None and str(byte_counter) != data_len:
645 raise ContentTooShortError(byte_counter, long(data_len))
646 self.try_rename(tmpfilename, filename)
649 class InfoExtractor(object):
650 """Information Extractor class.
652 Information extractors are the classes that, given a URL, extract
653 information from the video (or videos) the URL refers to. This
654 information includes the real video URL, the video title and simplified
655 title, author and others. The information is stored in a dictionary
656 which is then passed to the FileDownloader. The FileDownloader
657 processes this information possibly downloading the video to the file
658 system, among other possible outcomes. The dictionaries must include
659 the following fields:
661 id: Video identifier.
662 url: Final video URL.
663 uploader: Nickname of the video uploader.
664 title: Literal title.
665 stitle: Simplified title.
666 ext: Video filename extension.
667 format: Video format.
668 player_url: SWF Player URL (may be None).
670 The following fields are optional. Their primary purpose is to allow
671 youtube-dl to serve as the backend for a video search function, such
672 as the one in youtube2mp3. They are only used when their respective
673 forced printing functions are called:
675 thumbnail: Full URL to a video thumbnail image.
676 description: One-line video description.
678 Subclasses of this one should re-define the _real_initialize() and
679 _real_extract() methods, as well as the suitable() static method.
680 Probably, they should also be instantiated and added to the main
687 def __init__(self, downloader=None):
688 """Constructor. Receives an optional downloader."""
# (an elided line initialises the lazy-initialisation flag)
690 self.set_downloader(downloader)
# suitable(): static URL check; its def line is elided here.
694 """Receives a URL and returns True if suitable for this IE."""
697 def initialize(self):
698 """Initializes an instance (authentication, etc)."""
# (an elided guard makes this run only once per instance)
700 self._real_initialize()
703 def extract(self, url):
704 """Extracts URL information and returns it in list of dicts."""
# (initialize() is invoked on an elided line before extraction)
706 return self._real_extract(url)
708 def set_downloader(self, downloader):
709 """Sets the downloader for this IE."""
710 self._downloader = downloader
712 def _real_initialize(self):
713 """Real initialization process. Redefine in subclasses."""
716 def _real_extract(self, url):
717 """Real extraction process. Redefine in subclasses."""
720 class YoutubeIE(InfoExtractor):
721 """Information extractor for youtube.com."""
# Group 1 of the pattern captures the URL prefix; group 2 the video id.
# The (?(1).+)? conditional allows trailing text only when a prefix matched.
723 _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/(?:(?:v/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))))?([0-9A-Za-z_-]+)(?(1).+)?$'
724 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
725 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
726 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
727 _NETRC_MACHINE = 'youtube'
728 # Listed in order of quality
729 _available_formats = ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13']
# Maps format (itag) codes to file extensions; most entries are elided.
730 _video_extensions = {
736 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
# suitable(): static check of the URL against _VALID_URL (def line elided).
743 return (re.match(YoutubeIE._VALID_URL, url) is not None)
def report_lang(self):
	"""Announce the attempt to set the interface language."""
	message = u'[youtube] Setting language'
	self._downloader.to_screen(message)
def report_login(self):
	"""Announce the login attempt."""
	message = u'[youtube] Logging in'
	self._downloader.to_screen(message)
def report_age_confirmation(self):
	"""Announce the age-confirmation attempt."""
	message = u'[youtube] Confirming age'
	self._downloader.to_screen(message)
def report_video_webpage_download(self, video_id):
	"""Announce that the video webpage is being downloaded."""
	message = u'[youtube] %s: Downloading video webpage' % video_id
	self._downloader.to_screen(message)
def report_video_info_webpage_download(self, video_id):
	"""Announce that the video info webpage is being downloaded."""
	message = u'[youtube] %s: Downloading video info webpage' % video_id
	self._downloader.to_screen(message)
def report_information_extraction(self, video_id):
	"""Announce that video information is being extracted."""
	message = u'[youtube] %s: Extracting video information' % video_id
	self._downloader.to_screen(message)
def report_unavailable_format(self, video_id, format):
	"""Announce that the requested format is not available."""
	message = u'[youtube] %s: Format %s not available' % (video_id, format)
	self._downloader.to_screen(message)
def report_rtmp_download(self):
	"""Announce that the download will use the RTMP protocol."""
	message = u'[youtube] RTMP download detected'
	self._downloader.to_screen(message)
777 def _real_initialize(self):
778 if self._downloader is None:
# (returns immediately on an elided line when no downloader is attached)
783 downloader_params = self._downloader.params
785 # Attempt to use provided username and password or .netrc data
786 if downloader_params.get('username', None) is not None:
787 username = downloader_params['username']
788 password = downloader_params['password']
789 elif downloader_params.get('usenetrc', False):
791 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
# (elided: username/password are unpacked from `info` when present)
796 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
797 except (IOError, netrc.NetrcParseError), err:
798 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
# Set language so the scraped pages come back in English.
802 request = urllib2.Request(self._LANG_URL, None, std_headers)
805 urllib2.urlopen(request).read()
806 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
807 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
810 # No authentication to be performed
# (elided: login is skipped when no username is configured; the
# login_form dict literal also starts on an elided line)
816 'current_form': 'loginForm',
818 'action_login': 'Log In',
819 'username': username,
820 'password': password,
822 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
825 login_results = urllib2.urlopen(request).read()
# If the login form is still present, the credentials were rejected.
826 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
827 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
829 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
830 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
# Confirm age to gain access to age-restricted videos.
836 'action_confirm': 'Confirm',
838 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
840 self.report_age_confirmation()
841 age_results = urllib2.urlopen(request).read()
842 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
843 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
846 def _real_extract(self, url):
847 # Extract video id from URL
848 mobj = re.match(self._VALID_URL, url)
850 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
852 video_id = mobj.group(2)
# Download the watch page (has_verified=1 skips some interstitials).
855 self.report_video_webpage_download(video_id)
856 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id, None, std_headers)
858 video_webpage = urllib2.urlopen(request).read()
859 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
860 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
863 # Attempt to extract SWF player URL
864 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
# Unescape the JSON-style backslash escapes in the matched URL.
866 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
# Query get_video_info, trying several 'el' variants until one yields
# a token.
871 self.report_video_info_webpage_download(video_id)
872 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
873 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
874 % (video_id, el_type))
875 request = urllib2.Request(video_info_url, None, std_headers)
877 video_info_webpage = urllib2.urlopen(request).read()
878 video_info = parse_qs(video_info_webpage)
879 if 'token' in video_info:
881 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
882 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
884 if 'token' not in video_info:
885 if 'reason' in video_info:
886 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
888 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
891 # Start extracting information
892 self.report_information_extraction(video_id)
895 if 'author' not in video_info:
896 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
898 video_uploader = urllib.unquote_plus(video_info['author'][0])
901 if 'title' not in video_info:
902 self._downloader.trouble(u'ERROR: unable to extract video title')
904 video_title = urllib.unquote_plus(video_info['title'][0])
905 video_title = video_title.decode('utf-8')
906 video_title = sanitize_title(video_title)
# Collapse runs of disallowed characters into single underscores.
909 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
910 simple_title = simple_title.strip(ur'_')
913 if 'thumbnail_url' not in video_info:
914 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
916 else: # don't panic if we can't find it
917 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
# Scrape the upload date from the page and normalise it to YYYYMMDD.
921 mobj = re.search(r'id="eow-date".*?>(.*?)</span>', video_webpage, re.DOTALL)
923 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
924 format_expressions = ['%d %B %Y', '%B %d %Y']
925 for expression in format_expressions:
927 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
932 video_description = 'No description available.'
933 if self._downloader.params.get('forcedescription', False):
934 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
936 video_description = mobj.group(1)
939 video_token = urllib.unquote_plus(video_info['token'][0])
941 # Decide which formats to download
942 requested_format = self._downloader.params.get('format', None)
# %%s survives the first %-formatting as a literal %s placeholder that
# is filled in with the format code below.
943 get_video_template = 'http://www.youtube.com/get_video?video_id=%s&t=%s&eurl=&el=&ps=&asv=&fmt=%%s' % (video_id, video_token)
945 if 'fmt_url_map' in video_info:
946 url_map = dict(tuple(pair.split('|')) for pair in video_info['fmt_url_map'][0].split(','))
947 format_limit = self._downloader.params.get('format_limit', None)
948 if format_limit is not None and format_limit in self._available_formats:
# _available_formats is quality-ordered, so slicing from the limit
# keeps only formats at or below it.
949 format_list = self._available_formats[self._available_formats.index(format_limit):]
951 format_list = self._available_formats
952 existing_formats = [x for x in format_list if x in url_map]
953 if len(existing_formats) == 0:
954 self._downloader.trouble(u'ERROR: no known formats available for video')
956 if requested_format is None:
957 video_url_list = [(existing_formats[0], get_video_template % existing_formats[0])] # Best quality
958 elif requested_format == '-1':
959 video_url_list = [(f, get_video_template % f) for f in existing_formats] # All formats
961 video_url_list = [(requested_format, get_video_template % requested_format)] # Specific format
963 elif 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
964 self.report_rtmp_download()
965 video_url_list = [(None, video_info['conn'][0])]
968 self._downloader.trouble(u'ERROR: no fmt_url_map or conn information found in video info')
971 for format_param, video_real_url in video_url_list:
972 # At this point we have a new video
973 self._downloader.increment_downloads()
976 video_extension = self._video_extensions.get(format_param, 'flv')
978 # Find the video URL in fmt_url_map or conn paramters
980 # Process video information
981 self._downloader.process_info({
982 'id': video_id.decode('utf-8'),
983 'url': video_real_url.decode('utf-8'),
984 'uploader': video_uploader.decode('utf-8'),
985 'upload_date': upload_date,
986 'title': video_title,
987 'stitle': simple_title,
988 'ext': video_extension.decode('utf-8'),
# and/or trick: u'NA' when format_param is None, else the decoded code.
989 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
990 'thumbnail': video_thumbnail.decode('utf-8'),
991 'description': video_description.decode('utf-8'),
992 'player_url': player_url,
994 except UnavailableVideoError, err:
995 self._downloader.trouble(u'ERROR: unable to download video (format may not be available)')
998 class MetacafeIE(InfoExtractor):
999 """Information Extractor for metacafe.com."""
# Group 1 captures the video id, group 2 the simplified title slug.
1001 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1002 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1003 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
# A YoutubeIE instance is kept so Metacafe pages that wrap YouTube
# videos can be delegated to it (see _real_extract).
1006 def __init__(self, youtube_ie, downloader=None):
1007 InfoExtractor.__init__(self, downloader)
1008 self._youtube_ie = youtube_ie
# suitable(): static URL check against _VALID_URL (def line elided).
1012 return (re.match(MetacafeIE._VALID_URL, url) is not None)
def report_disclaimer(self):
	"""Announce the disclaimer-page retrieval."""
	message = u'[metacafe] Retrieving disclaimer'
	self._downloader.to_screen(message)
def report_age_confirmation(self):
	"""Announce the age-confirmation attempt."""
	message = u'[metacafe] Confirming age'
	self._downloader.to_screen(message)
1022 def report_download_webpage(self, video_id):
1023 """Report webpage download."""
1024 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1026 def report_extraction(self, video_id):
1027 """Report information extraction."""
1028 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1030 def _real_initialize(self):
1031 # Retrieve disclaimer
1032 request = urllib2.Request(self._DISCLAIMER, None, std_headers)
1034 self.report_disclaimer()
1035 disclaimer = urllib2.urlopen(request).read()
1036 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1037 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1043 'submit': "Continue - I'm over 18",
1045 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
1047 self.report_age_confirmation()
1048 disclaimer = urllib2.urlopen(request).read()
1049 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1050 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1053 def _real_extract(self, url):
1054 # Extract id and simplified title from URL
1055 mobj = re.match(self._VALID_URL, url)
1057 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1060 video_id = mobj.group(1)
1062 # Check if video comes from YouTube
1063 mobj2 = re.match(r'^yt-(.*)$', video_id)
1064 if mobj2 is not None:
1065 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1068 # At this point we have a new video
1069 self._downloader.increment_downloads()
1071 simple_title = mobj.group(2).decode('utf-8')
1073 # Retrieve video webpage to extract further information
1074 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1076 self.report_download_webpage(video_id)
1077 webpage = urllib2.urlopen(request).read()
1078 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1079 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1082 # Extract URL, uploader and title from webpage
1083 self.report_extraction(video_id)
1084 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1085 if mobj is not None:
1086 mediaURL = urllib.unquote(mobj.group(1))
1087 video_extension = mediaURL[-3:]
1089 # Extract gdaKey if available
1090 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1092 video_url = mediaURL
1094 gdaKey = mobj.group(1)
1095 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1097 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1099 self._downloader.trouble(u'ERROR: unable to extract media URL')
1101 vardict = parse_qs(mobj.group(1))
1102 if 'mediaData' not in vardict:
1103 self._downloader.trouble(u'ERROR: unable to extract media URL')
1105 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1107 self._downloader.trouble(u'ERROR: unable to extract media URL')
1109 mediaURL = mobj.group(1).replace('\\/', '/')
1110 video_extension = mediaURL[-3:]
1111 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1113 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1115 self._downloader.trouble(u'ERROR: unable to extract title')
1117 video_title = mobj.group(1).decode('utf-8')
1118 video_title = sanitize_title(video_title)
1120 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1122 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1124 video_uploader = mobj.group(1)
1127 # Process video information
1128 self._downloader.process_info({
1129 'id': video_id.decode('utf-8'),
1130 'url': video_url.decode('utf-8'),
1131 'uploader': video_uploader.decode('utf-8'),
1132 'upload_date': u'NA',
1133 'title': video_title,
1134 'stitle': simple_title,
1135 'ext': video_extension.decode('utf-8'),
1139 except UnavailableVideoError:
1140 self._downloader.trouble(u'ERROR: unable to download video')
1143 class DailymotionIE(InfoExtractor):
1144 """Information Extractor for Dailymotion"""
1146 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1148 def __init__(self, downloader=None):
1149 InfoExtractor.__init__(self, downloader)
1153 return (re.match(DailymotionIE._VALID_URL, url) is not None)
1155 def report_download_webpage(self, video_id):
1156 """Report webpage download."""
1157 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1159 def report_extraction(self, video_id):
1160 """Report information extraction."""
1161 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1163 def _real_initialize(self):
1166 def _real_extract(self, url):
1167 # Extract id and simplified title from URL
1168 mobj = re.match(self._VALID_URL, url)
1170 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1173 # At this point we have a new video
1174 self._downloader.increment_downloads()
1175 video_id = mobj.group(1)
1177 simple_title = mobj.group(2).decode('utf-8')
1178 video_extension = 'flv'
1180 # Retrieve video webpage to extract further information
1181 request = urllib2.Request(url)
1183 self.report_download_webpage(video_id)
1184 webpage = urllib2.urlopen(request).read()
1185 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1186 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1189 # Extract URL, uploader and title from webpage
1190 self.report_extraction(video_id)
1191 mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
1193 self._downloader.trouble(u'ERROR: unable to extract media URL')
1195 mediaURL = urllib.unquote(mobj.group(1))
1197 # if needed add http://www.dailymotion.com/ if relative URL
1199 video_url = mediaURL
1201 # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
1202 mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
1204 self._downloader.trouble(u'ERROR: unable to extract title')
1206 video_title = mobj.group(1).decode('utf-8')
1207 video_title = sanitize_title(video_title)
1209 mobj = re.search(r'(?im)<div class="dmco_html owner">.*?<a class="name" href="/.+?">(.+?)</a>', webpage)
1211 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1213 video_uploader = mobj.group(1)
1216 # Process video information
1217 self._downloader.process_info({
1218 'id': video_id.decode('utf-8'),
1219 'url': video_url.decode('utf-8'),
1220 'uploader': video_uploader.decode('utf-8'),
1221 'upload_date': u'NA',
1222 'title': video_title,
1223 'stitle': simple_title,
1224 'ext': video_extension.decode('utf-8'),
1228 except UnavailableVideoError:
1229 self._downloader.trouble(u'ERROR: unable to download video')
1231 class GoogleIE(InfoExtractor):
1232 """Information extractor for video.google.com."""
1234 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1236 def __init__(self, downloader=None):
1237 InfoExtractor.__init__(self, downloader)
1241 return (re.match(GoogleIE._VALID_URL, url) is not None)
1243 def report_download_webpage(self, video_id):
1244 """Report webpage download."""
1245 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1247 def report_extraction(self, video_id):
1248 """Report information extraction."""
1249 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1251 def _real_initialize(self):
1254 def _real_extract(self, url):
1255 # Extract id from URL
1256 mobj = re.match(self._VALID_URL, url)
1258 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1261 # At this point we have a new video
1262 self._downloader.increment_downloads()
1263 video_id = mobj.group(1)
1265 video_extension = 'mp4'
1267 # Retrieve video webpage to extract further information
1268 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1270 self.report_download_webpage(video_id)
1271 webpage = urllib2.urlopen(request).read()
1272 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1273 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1276 # Extract URL, uploader, and title from webpage
1277 self.report_extraction(video_id)
1278 mobj = re.search(r"download_url:'([^']+)'", webpage)
1280 video_extension = 'flv'
1281 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1283 self._downloader.trouble(u'ERROR: unable to extract media URL')
1285 mediaURL = urllib.unquote(mobj.group(1))
1286 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1287 mediaURL = mediaURL.replace('\\x26', '\x26')
1289 video_url = mediaURL
1291 mobj = re.search(r'<title>(.*)</title>', webpage)
1293 self._downloader.trouble(u'ERROR: unable to extract title')
1295 video_title = mobj.group(1).decode('utf-8')
1296 video_title = sanitize_title(video_title)
1297 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1299 # Extract video description
1300 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1302 self._downloader.trouble(u'ERROR: unable to extract video description')
1304 video_description = mobj.group(1).decode('utf-8')
1305 if not video_description:
1306 video_description = 'No description available.'
1308 # Extract video thumbnail
1309 if self._downloader.params.get('forcethumbnail', False):
1310 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1312 webpage = urllib2.urlopen(request).read()
1313 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1314 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1316 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1318 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1320 video_thumbnail = mobj.group(1)
1321 else: # we need something to pass to process_info
1322 video_thumbnail = ''
1326 # Process video information
1327 self._downloader.process_info({
1328 'id': video_id.decode('utf-8'),
1329 'url': video_url.decode('utf-8'),
1331 'upload_date': u'NA',
1332 'title': video_title,
1333 'stitle': simple_title,
1334 'ext': video_extension.decode('utf-8'),
1338 except UnavailableVideoError:
1339 self._downloader.trouble(u'ERROR: unable to download video')
1342 class PhotobucketIE(InfoExtractor):
1343 """Information extractor for photobucket.com."""
1345 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1347 def __init__(self, downloader=None):
1348 InfoExtractor.__init__(self, downloader)
1352 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1354 def report_download_webpage(self, video_id):
1355 """Report webpage download."""
1356 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1358 def report_extraction(self, video_id):
1359 """Report information extraction."""
1360 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1362 def _real_initialize(self):
1365 def _real_extract(self, url):
1366 # Extract id from URL
1367 mobj = re.match(self._VALID_URL, url)
1369 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1372 # At this point we have a new video
1373 self._downloader.increment_downloads()
1374 video_id = mobj.group(1)
1376 video_extension = 'flv'
1378 # Retrieve video webpage to extract further information
1379 request = urllib2.Request(url)
1381 self.report_download_webpage(video_id)
1382 webpage = urllib2.urlopen(request).read()
1383 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1384 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1387 # Extract URL, uploader, and title from webpage
1388 self.report_extraction(video_id)
1389 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1391 self._downloader.trouble(u'ERROR: unable to extract media URL')
1393 mediaURL = urllib.unquote(mobj.group(1))
1395 video_url = mediaURL
1397 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1399 self._downloader.trouble(u'ERROR: unable to extract title')
1401 video_title = mobj.group(1).decode('utf-8')
1402 video_title = sanitize_title(video_title)
1403 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1405 video_uploader = mobj.group(2).decode('utf-8')
1408 # Process video information
1409 self._downloader.process_info({
1410 'id': video_id.decode('utf-8'),
1411 'url': video_url.decode('utf-8'),
1412 'uploader': video_uploader,
1413 'upload_date': u'NA',
1414 'title': video_title,
1415 'stitle': simple_title,
1416 'ext': video_extension.decode('utf-8'),
1420 except UnavailableVideoError:
1421 self._downloader.trouble(u'ERROR: unable to download video')
1424 class YahooIE(InfoExtractor):
1425 """Information extractor for video.yahoo.com."""
1427 # _VALID_URL matches all Yahoo! Video URLs
1428 # _VPAGE_URL matches only the extractable '/watch/' URLs
1429 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1430 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1432 def __init__(self, downloader=None):
1433 InfoExtractor.__init__(self, downloader)
1437 return (re.match(YahooIE._VALID_URL, url) is not None)
1439 def report_download_webpage(self, video_id):
1440 """Report webpage download."""
1441 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1443 def report_extraction(self, video_id):
1444 """Report information extraction."""
1445 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1447 def _real_initialize(self):
1450 def _real_extract(self, url, new_video=True):
1451 # Extract ID from URL
1452 mobj = re.match(self._VALID_URL, url)
1454 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1457 # At this point we have a new video
1458 self._downloader.increment_downloads()
1459 video_id = mobj.group(2)
1460 video_extension = 'flv'
1462 # Rewrite valid but non-extractable URLs as
1463 # extractable English language /watch/ URLs
1464 if re.match(self._VPAGE_URL, url) is None:
1465 request = urllib2.Request(url)
1467 webpage = urllib2.urlopen(request).read()
1468 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1469 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1472 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1474 self._downloader.trouble(u'ERROR: Unable to extract id field')
1476 yahoo_id = mobj.group(1)
1478 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1480 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1482 yahoo_vid = mobj.group(1)
1484 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1485 return self._real_extract(url, new_video=False)
1487 # Retrieve video webpage to extract further information
1488 request = urllib2.Request(url)
1490 self.report_download_webpage(video_id)
1491 webpage = urllib2.urlopen(request).read()
1492 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1493 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1496 # Extract uploader and title from webpage
1497 self.report_extraction(video_id)
1498 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1500 self._downloader.trouble(u'ERROR: unable to extract video title')
1502 video_title = mobj.group(1).decode('utf-8')
1503 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1505 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1507 self._downloader.trouble(u'ERROR: unable to extract video uploader')
1509 video_uploader = mobj.group(1).decode('utf-8')
1511 # Extract video thumbnail
1512 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1514 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1516 video_thumbnail = mobj.group(1).decode('utf-8')
1518 # Extract video description
1519 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1521 self._downloader.trouble(u'ERROR: unable to extract video description')
1523 video_description = mobj.group(1).decode('utf-8')
1524 if not video_description: video_description = 'No description available.'
1526 # Extract video height and width
1527 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1529 self._downloader.trouble(u'ERROR: unable to extract video height')
1531 yv_video_height = mobj.group(1)
1533 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1535 self._downloader.trouble(u'ERROR: unable to extract video width')
1537 yv_video_width = mobj.group(1)
1539 # Retrieve video playlist to extract media URL
1540 # I'm not completely sure what all these options are, but we
1541 # seem to need most of them, otherwise the server sends a 401.
1542 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1543 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1544 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1545 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1546 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1548 self.report_download_webpage(video_id)
1549 webpage = urllib2.urlopen(request).read()
1550 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1551 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1554 # Extract media URL from playlist XML
1555 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1557 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1559 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1560 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1563 # Process video information
1564 self._downloader.process_info({
1565 'id': video_id.decode('utf-8'),
1567 'uploader': video_uploader,
1568 'upload_date': u'NA',
1569 'title': video_title,
1570 'stitle': simple_title,
1571 'ext': video_extension.decode('utf-8'),
1572 'thumbnail': video_thumbnail.decode('utf-8'),
1573 'description': video_description,
1574 'thumbnail': video_thumbnail,
1575 'description': video_description,
1578 except UnavailableVideoError:
1579 self._downloader.trouble(u'ERROR: unable to download video')
1582 class GenericIE(InfoExtractor):
1583 """Generic last-resort information extractor."""
1585 def __init__(self, downloader=None):
1586 InfoExtractor.__init__(self, downloader)
1592 def report_download_webpage(self, video_id):
1593 """Report webpage download."""
1594 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1595 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1597 def report_extraction(self, video_id):
1598 """Report information extraction."""
1599 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1601 def _real_initialize(self):
1604 def _real_extract(self, url):
1605 # At this point we have a new video
1606 self._downloader.increment_downloads()
1608 video_id = url.split('/')[-1]
1609 request = urllib2.Request(url)
1611 self.report_download_webpage(video_id)
1612 webpage = urllib2.urlopen(request).read()
1613 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1614 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1616 except ValueError, err:
1617 # since this is the last-resort InfoExtractor, if
1618 # this error is thrown, it'll be thrown here
1619 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1622 self.report_extraction(video_id)
1623 # Start with something easy: JW Player in SWFObject
1624 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1626 # Broaden the search a little bit
1627 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1629 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1632 # It's possible that one of the regexes
1633 # matched, but returned an empty group:
1634 if mobj.group(1) is None:
1635 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1638 video_url = urllib.unquote(mobj.group(1))
1639 video_id = os.path.basename(video_url)
1641 # here's a fun little line of code for you:
1642 video_extension = os.path.splitext(video_id)[1][1:]
1643 video_id = os.path.splitext(video_id)[0]
1645 # it's tempting to parse this further, but you would
1646 # have to take into account all the variations like
1647 # Video Title - Site Name
1648 # Site Name | Video Title
1649 # Video Title - Tagline | Site Name
1650 # and so on and so forth; it's just not practical
1651 mobj = re.search(r'<title>(.*)</title>', webpage)
1653 self._downloader.trouble(u'ERROR: unable to extract title')
1655 video_title = mobj.group(1).decode('utf-8')
1656 video_title = sanitize_title(video_title)
1657 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1659 # video uploader is domain name
1660 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1662 self._downloader.trouble(u'ERROR: unable to extract title')
1664 video_uploader = mobj.group(1).decode('utf-8')
1667 # Process video information
1668 self._downloader.process_info({
1669 'id': video_id.decode('utf-8'),
1670 'url': video_url.decode('utf-8'),
1671 'uploader': video_uploader,
1672 'upload_date': u'NA',
1673 'title': video_title,
1674 'stitle': simple_title,
1675 'ext': video_extension.decode('utf-8'),
1679 except UnavailableVideoError, err:
1680 self._downloader.trouble(u'ERROR: unable to download video')
1683 class YoutubeSearchIE(InfoExtractor):
1684 """Information Extractor for YouTube search queries."""
1685 _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
1686 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
1687 _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
1688 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1690 _max_youtube_results = 1000
1692 def __init__(self, youtube_ie, downloader=None):
1693 InfoExtractor.__init__(self, downloader)
1694 self._youtube_ie = youtube_ie
1698 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
1700 def report_download_page(self, query, pagenum):
1701 """Report attempt to download playlist page with given number."""
1702 query = query.decode(preferredencoding())
1703 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1705 def _real_initialize(self):
1706 self._youtube_ie.initialize()
1708 def _real_extract(self, query):
1709 mobj = re.match(self._VALID_QUERY, query)
1711 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1714 prefix, query = query.split(':')
1716 query = query.encode('utf-8')
1718 self._download_n_results(query, 1)
1720 elif prefix == 'all':
1721 self._download_n_results(query, self._max_youtube_results)
1727 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1729 elif n > self._max_youtube_results:
1730 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1731 n = self._max_youtube_results
1732 self._download_n_results(query, n)
1734 except ValueError: # parsing prefix as integer fails
1735 self._download_n_results(query, 1)
1738 def _download_n_results(self, query, n):
1739 """Downloads a specified number of results for a query"""
1742 already_seen = set()
1746 self.report_download_page(query, pagenum)
1747 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1748 request = urllib2.Request(result_url, None, std_headers)
1750 page = urllib2.urlopen(request).read()
1751 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1752 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1755 # Extract video identifiers
1756 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1757 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
1758 if video_id not in already_seen:
1759 video_ids.append(video_id)
1760 already_seen.add(video_id)
1761 if len(video_ids) == n:
1762 # Specified n videos reached
1763 for id in video_ids:
1764 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1767 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1768 for id in video_ids:
1769 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1772 pagenum = pagenum + 1
1774 class GoogleSearchIE(InfoExtractor):
1775 """Information Extractor for Google Video search queries."""
1776 _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
1777 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1778 _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
1779 _MORE_PAGES_INDICATOR = r'<span>Next</span>'
1781 _max_google_results = 1000
1783 def __init__(self, google_ie, downloader=None):
1784 InfoExtractor.__init__(self, downloader)
1785 self._google_ie = google_ie
1789 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
1791 def report_download_page(self, query, pagenum):
1792 """Report attempt to download playlist page with given number."""
1793 query = query.decode(preferredencoding())
1794 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1796 def _real_initialize(self):
1797 self._google_ie.initialize()
1799 def _real_extract(self, query):
1800 mobj = re.match(self._VALID_QUERY, query)
1802 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1805 prefix, query = query.split(':')
1807 query = query.encode('utf-8')
1809 self._download_n_results(query, 1)
1811 elif prefix == 'all':
1812 self._download_n_results(query, self._max_google_results)
1818 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1820 elif n > self._max_google_results:
1821 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1822 n = self._max_google_results
1823 self._download_n_results(query, n)
1825 except ValueError: # parsing prefix as integer fails
1826 self._download_n_results(query, 1)
1829 def _download_n_results(self, query, n):
1830 """Downloads a specified number of results for a query"""
1833 already_seen = set()
1837 self.report_download_page(query, pagenum)
1838 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1839 request = urllib2.Request(result_url, None, std_headers)
1841 page = urllib2.urlopen(request).read()
1842 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1843 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1846 # Extract video identifiers
1847 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1848 video_id = mobj.group(1)
1849 if video_id not in already_seen:
1850 video_ids.append(video_id)
1851 already_seen.add(video_id)
1852 if len(video_ids) == n:
1853 # Specified n videos reached
1854 for id in video_ids:
1855 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1858 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1859 for id in video_ids:
1860 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1863 pagenum = pagenum + 1
1865 class YahooSearchIE(InfoExtractor):
1866 """Information Extractor for Yahoo! Video search queries."""
1867 _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
1868 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1869 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1870 _MORE_PAGES_INDICATOR = r'\s*Next'
1872 _max_yahoo_results = 1000
1874 def __init__(self, yahoo_ie, downloader=None):
1875 InfoExtractor.__init__(self, downloader)
1876 self._yahoo_ie = yahoo_ie
1880 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
1882 def report_download_page(self, query, pagenum):
1883 """Report attempt to download playlist page with given number."""
1884 query = query.decode(preferredencoding())
1885 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1887 def _real_initialize(self):
1888 self._yahoo_ie.initialize()
1890 def _real_extract(self, query):
1891 mobj = re.match(self._VALID_QUERY, query)
1893 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1896 prefix, query = query.split(':')
1898 query = query.encode('utf-8')
1900 self._download_n_results(query, 1)
1902 elif prefix == 'all':
1903 self._download_n_results(query, self._max_yahoo_results)
1909 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1911 elif n > self._max_yahoo_results:
1912 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1913 n = self._max_yahoo_results
1914 self._download_n_results(query, n)
1916 except ValueError: # parsing prefix as integer fails
1917 self._download_n_results(query, 1)
1920 def _download_n_results(self, query, n):
1921 """Downloads a specified number of results for a query"""
1924 already_seen = set()
1928 self.report_download_page(query, pagenum)
1929 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1930 request = urllib2.Request(result_url, None, std_headers)
1932 page = urllib2.urlopen(request).read()
1933 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1934 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1937 # Extract video identifiers
1938 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1939 video_id = mobj.group(1)
1940 if video_id not in already_seen:
1941 video_ids.append(video_id)
1942 already_seen.add(video_id)
1943 if len(video_ids) == n:
1944 # Specified n videos reached
1945 for id in video_ids:
1946 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1949 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1950 for id in video_ids:
1951 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1954 pagenum = pagenum + 1
1956 class YoutubePlaylistIE(InfoExtractor):
1957 """Information Extractor for YouTube playlists."""
1959 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists)\?.*?p=|user/.*?/user/)([^&]+).*'
1960 _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
1961 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
1962 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1965 def __init__(self, youtube_ie, downloader=None):
1966 InfoExtractor.__init__(self, downloader)
1967 self._youtube_ie = youtube_ie
1971 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
1973 def report_download_page(self, playlist_id, pagenum):
1974 """Report attempt to download playlist page with given number."""
1975 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1977 def _real_initialize(self):
1978 self._youtube_ie.initialize()
1980 def _real_extract(self, url):
1981 # Extract playlist id
1982 mobj = re.match(self._VALID_URL, url)
1984 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1987 # Download playlist pages
1988 playlist_id = mobj.group(1)
1993 self.report_download_page(playlist_id, pagenum)
1994 request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
1996 page = urllib2.urlopen(request).read()
1997 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1998 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2001 # Extract video identifiers
2003 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2004 if mobj.group(1) not in ids_in_page:
2005 ids_in_page.append(mobj.group(1))
2006 video_ids.extend(ids_in_page)
2008 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2010 pagenum = pagenum + 1
2012 playliststart = self._downloader.params.get('playliststart', 1) - 1
2013 playlistend = self._downloader.params.get('playlistend', -1)
2014 video_ids = video_ids[playliststart:playlistend]
2016 for id in video_ids:
2017 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2020 class YoutubeUserIE(InfoExtractor):
2021 """Information Extractor for YouTube users."""
2023 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)'
2024 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2025 _VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this.
2028 def __init__(self, youtube_ie, downloader=None):
2029 InfoExtractor.__init__(self, downloader)
2030 self._youtube_ie = youtube_ie
2034 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
2036 def report_download_page(self, username):
2037 """Report attempt to download user page."""
2038 self._downloader.to_screen(u'[youtube] user %s: Downloading page ' % (username))
2040 def _real_initialize(self):
2041 self._youtube_ie.initialize()
2043 def _real_extract(self, url):
2045 mobj = re.match(self._VALID_URL, url)
2047 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2050 # Download user page
2051 username = mobj.group(1)
2055 self.report_download_page(username)
2056 request = urllib2.Request(self._TEMPLATE_URL % (username), None, std_headers)
2058 page = urllib2.urlopen(request).read()
2059 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2060 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2063 # Extract video identifiers
2066 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2067 if mobj.group(1) not in ids_in_page:
2068 ids_in_page.append(mobj.group(1))
2069 video_ids.extend(ids_in_page)
2071 playliststart = self._downloader.params.get('playliststart', 1) - 1
2072 playlistend = self._downloader.params.get('playlistend', -1)
2073 video_ids = video_ids[playliststart:playlistend]
2075 for id in video_ids:
2076 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
class PostProcessor(object):
	"""Post Processor class.

	PostProcessor objects can be added to downloaders with their
	add_post_processor() method. When the downloader has finished a
	successful download, it will take its internal chain of PostProcessors
	and start calling the run() method on each one of them, first with
	an initial argument and then with the returned value of the previous
	one.

	The chain will be stopped if one of them ever returns None or the end
	of the chain is reached.

	PostProcessor objects follow a "mutual registration" process similar
	to InfoExtractor objects.
	"""

	def __init__(self, downloader=None):
		# Owning downloader; may stay None until set_downloader() is
		# called during registration.
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Sets the downloader for this PP."""
		self._downloader = downloader

	def run(self, information):
		"""Run the PostProcessor.

		The "information" argument is a dictionary like the ones
		composed by InfoExtractors. The only difference is that this
		one has an extra field called "filepath" that points to the
		downloaded file.

		When this method returns None, the postprocessing chain is
		stopped. However, this method may return an information
		dictionary that will be passed to the next postprocessing
		object in the chain. It can be the one it received after
		changing some fields.

		In addition, this method may raise a PostProcessingError
		exception that will be taken into account by the downloader.
		"""
		return information # by default, do nothing
### MAIN PROGRAM ###
# Everything below runs only when the script is executed directly,
# never when it is imported as a module.
if __name__ == '__main__':
	# Modules needed only when running the main program
# Function to update the program file with the latest version from the
# project repository (github.com/rg3/youtube-dl)
def update_self(downloader, filename):
	"""Overwrite this script in place with the latest stable release.

	downloader is only used for screen output; filename is the path of
	the running script (normally sys.argv[0]). Exits the process when
	the file is not writable.
	"""
	if not os.access(filename, os.W_OK):
		sys.exit('ERROR: no write permissions on %s' % filename)

	downloader.to_screen('Updating to latest stable version...')
	latest_url = 'http://github.com/rg3/youtube-dl/raw/master/LATEST_VERSION'
	latest_version = urllib.urlopen(latest_url).read().strip()
	prog_url = 'http://github.com/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
	newcontent = urllib.urlopen(prog_url).read()
	stream = open(filename, 'w')
	try:
		stream.write(newcontent)
	finally:
		# BUGFIX: always release the file handle, even if the write
		# fails; the previous code left the stream open.
		stream.close()
	downloader.to_screen('Updated to version %s' % latest_version)
# Parse command line
parser = optparse.OptionParser(
	usage='Usage: %prog [options] url...',
	version='2010.11.19',
	conflict_handler='resolve',
# -h/-v are redefined here (conflict_handler='resolve' allows it) so the
# short flags stay available for other options.
parser.add_option('-h', '--help',
		action='help', help='print this help text and exit')
parser.add_option('-v', '--version',
		action='version', help='print program version and exit')
parser.add_option('-U', '--update',
		action='store_true', dest='update_self', help='update this program to latest stable version')
parser.add_option('-i', '--ignore-errors',
		action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
parser.add_option('-r', '--rate-limit',
		dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
parser.add_option('-R', '--retries',
		dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
# Playlist bounds are taken as strings here and converted/validated below.
parser.add_option('--playlist-start',
		dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
parser.add_option('--playlist-end',
		dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)

authentication = optparse.OptionGroup(parser, 'Authentication Options')
authentication.add_option('-u', '--username',
		dest='username', metavar='USERNAME', help='account username')
authentication.add_option('-p', '--password',
		dest='password', metavar='PASSWORD', help='account password')
authentication.add_option('-n', '--netrc',
		action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
parser.add_option_group(authentication)

video_format = optparse.OptionGroup(parser, 'Video Format Options')
video_format.add_option('-f', '--format',
		action='store', dest='format', metavar='FORMAT', help='video format code')
video_format.add_option('-m', '--mobile-version',
		action='store_const', dest='format', help='alias for -f 17', const='17')
video_format.add_option('--all-formats',
		action='store_const', dest='format', help='download all available video formats', const='-1')
video_format.add_option('--max-quality',
		action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
video_format.add_option('-b', '--best-quality',
		action='store_true', dest='bestquality', help='download the best video quality (DEPRECATED)')
parser.add_option_group(video_format)

verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
verbosity.add_option('-q', '--quiet',
		action='store_true', dest='quiet', help='activates quiet mode', default=False)
verbosity.add_option('-s', '--simulate',
		action='store_true', dest='simulate', help='do not download video', default=False)
# The --get-* flags imply both quiet and simulate (see the FileDownloader
# parameter dict below).
verbosity.add_option('-g', '--get-url',
		action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
verbosity.add_option('-e', '--get-title',
		action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
verbosity.add_option('--get-thumbnail',
		action='store_true', dest='getthumbnail', help='simulate, quiet but print thumbnail URL', default=False)
verbosity.add_option('--get-description',
		action='store_true', dest='getdescription', help='simulate, quiet but print video description', default=False)
verbosity.add_option('--no-progress',
		action='store_true', dest='noprogress', help='do not print progress bar', default=False)
parser.add_option_group(verbosity)

filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
filesystem.add_option('-t', '--title',
		action='store_true', dest='usetitle', help='use title in file name', default=False)
filesystem.add_option('-l', '--literal',
		action='store_true', dest='useliteral', help='use literal title in file name', default=False)
filesystem.add_option('-A', '--auto-number',
		action='store_true', dest='autonumber', help='number downloaded files starting from 00000', default=False)
filesystem.add_option('-o', '--output',
		dest='outtmpl', metavar='TEMPLATE', help='output filename template')
filesystem.add_option('-a', '--batch-file',
		dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
filesystem.add_option('-w', '--no-overwrites',
		action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
filesystem.add_option('-c', '--continue',
		action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
filesystem.add_option('--cookies',
		dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
parser.add_option_group(filesystem)

(opts, args) = parser.parse_args()

# Open appropriate CookieJar: an in-memory jar when no file is given,
# a Mozilla-format jar loaded from disk otherwise.
if opts.cookiefile is None:
	jar = cookielib.CookieJar()
	jar = cookielib.MozillaCookieJar(opts.cookiefile)
	if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
except (IOError, OSError), err:
	sys.exit(u'ERROR: unable to open cookie file')

# General configuration
cookie_processor = urllib2.HTTPCookieProcessor(jar)
# NOTE(review): the second install_opener() replaces the first, and
# build_opener() already chains a default ProxyHandler — confirm the
# first call is intentional.
urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
urllib2.install_opener(urllib2.build_opener(cookie_processor))
socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

# Batch file verification: '-' reads URLs from stdin; blank lines and
# lines starting with #, / or ; are treated as comments and skipped.
if opts.batchfile is not None:
	if opts.batchfile == '-':
		batchfd = open(opts.batchfile, 'r')
	batchurls = batchfd.readlines()
	batchurls = [x.strip() for x in batchurls]
	batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
	sys.exit(u'ERROR: batch file could not be read')
all_urls = batchurls + args

# Conflicting, missing and erroneous options
if opts.bestquality:
	print >>sys.stderr, u'\nWARNING: -b/--best-quality IS DEPRECATED AS IT IS THE DEFAULT BEHAVIOR NOW\n'
if opts.usenetrc and (opts.username is not None or opts.password is not None):
	parser.error(u'using .netrc conflicts with giving username/password')
if opts.password is not None and opts.username is None:
	parser.error(u'account username missing')
if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
	parser.error(u'using output template conflicts with using title, literal title or auto number')
if opts.usetitle and opts.useliteral:
	parser.error(u'using title conflicts with using literal title')
if opts.username is not None and opts.password is None:
	opts.password = getpass.getpass(u'Type account password and press return:')
# Convert and validate the numeric options (rate limit, retries,
# playlist bounds); parser.error() exits on invalid values.
if opts.ratelimit is not None:
	numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
	if numeric_limit is None:
		parser.error(u'invalid rate limit specified')
	opts.ratelimit = numeric_limit
if opts.retries is not None:
		opts.retries = long(opts.retries)
	except (TypeError, ValueError), err:
		parser.error(u'invalid retry count specified')
	opts.playliststart = long(opts.playliststart)
	if opts.playliststart <= 0:
except (TypeError, ValueError), err:
	parser.error(u'invalid playlist start number specified')
	opts.playlistend = long(opts.playlistend)
	if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
except (TypeError, ValueError), err:
	parser.error(u'invalid playlist end number specified')

# Information extractors
youtube_ie = YoutubeIE()
metacafe_ie = MetacafeIE(youtube_ie)
dailymotion_ie = DailymotionIE()
youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
youtube_user_ie = YoutubeUserIE(youtube_ie)
youtube_search_ie = YoutubeSearchIE(youtube_ie)
google_ie = GoogleIE()
google_search_ie = GoogleSearchIE(google_ie)
photobucket_ie = PhotobucketIE()
yahoo_ie = YahooIE()
yahoo_search_ie = YahooSearchIE(yahoo_ie)
generic_ie = GenericIE()

# File downloader: the outtmpl entry picks the first template whose
# conditions hold, falling through to the plain '%(id)s.%(ext)s'.
fd = FileDownloader({
	'usenetrc': opts.usenetrc,
	'username': opts.username,
	'password': opts.password,
	'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
	'forceurl': opts.geturl,
	'forcetitle': opts.gettitle,
	'forcethumbnail': opts.getthumbnail,
	'forcedescription': opts.getdescription,
	'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
	'format': opts.format,
	'format_limit': opts.format_limit,
	'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
		or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
		or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
		or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
		or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
		or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
		or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
		or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
		or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
		or u'%(id)s.%(ext)s'),
	'ignoreerrors': opts.ignoreerrors,
	'ratelimit': opts.ratelimit,
	'nooverwrites': opts.nooverwrites,
	'retries': opts.retries,
	'continuedl': opts.continue_dl,
	'noprogress': opts.noprogress,
	'playliststart': opts.playliststart,
	'playlistend': opts.playlistend,
	'logtostderr': opts.outtmpl == '-',
# Registration order matters: more specific extractors (search,
# playlist, user) are tried before the plain video extractors.
fd.add_info_extractor(youtube_search_ie)
fd.add_info_extractor(youtube_pl_ie)
fd.add_info_extractor(youtube_user_ie)
fd.add_info_extractor(metacafe_ie)
fd.add_info_extractor(dailymotion_ie)
fd.add_info_extractor(youtube_ie)
fd.add_info_extractor(google_ie)
fd.add_info_extractor(google_search_ie)
fd.add_info_extractor(photobucket_ie)
fd.add_info_extractor(yahoo_ie)
fd.add_info_extractor(yahoo_search_ie)
# This must come last since it's the
# fallback if none of the others work
fd.add_info_extractor(generic_ie)

if opts.update_self:
	update_self(fd, sys.argv[0])

# -U with no URLs is a valid invocation; anything else needs a URL.
if len(all_urls) < 1:
	if not opts.update_self:
		parser.error(u'you must provide at least one URL')
retcode = fd.download(all_urls)

# Dump cookie jar if requested
if opts.cookiefile is not None:
except (IOError, OSError), err:
	sys.exit(u'ERROR: unable to save cookie jar')
except DownloadError:
except SameFileError:
	sys.exit(u'ERROR: fixed output name but more than one file to download')
except KeyboardInterrupt:
	sys.exit(u'\nERROR: Interrupted by user')