2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # Author: Benjamin Johnson
6 # License: Public domain code
24 # parse_qs was moved from the cgi module to the urlparse module recently.
26 from urlparse import parse_qs
28 from cgi import parse_qs
31 'User-Agent': 'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.11) Gecko/20101019 Firefox/3.6.11',
32 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
33 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
34 'Accept-Language': 'en-us,en;q=0.5',
# Alphabet of characters allowed in a "simplified" title (used when building
# filesystem-safe names).  Concatenate first, decode to unicode once.
simple_title_chars = (string.ascii_letters + string.digits).decode('ascii')
# Detects the console's preferred encoding via the locale module.
# NOTE(review): gap-sampled chunk -- original lines 41, 44, 46 and 48-52
# (docstring close, try/except fallback and the generator's while-loop) are
# not visible here; presumably the except branch falls back to a safe
# default encoding -- TODO confirm against the full file.
39 def preferredencoding():
40 """Get preferred encoding.
42 Returns the best encoding scheme for the system, based on
43 locale.getpreferredencoding() and some further tweaks.
45 def yield_preferredencoding():
47 pref = locale.getpreferredencoding()
# Python 2 generator protocol: .next() pulls the first cached value.
53 return yield_preferredencoding().next()
# re.sub() callback: maps one HTML entity (named or numeric) to the
# corresponding unicode character.
# NOTE(review): gap-sampled -- original lines 69 (presumably the
# `if mobj is not None:` guard) and 72/74-75 (presumably base=16/base=10
# selection) are not visible here; TODO confirm against the full file.
55 def htmlentity_transform(matchobj):
56 """Transforms an HTML entity to a Unicode character.
58 This function receives a match object and is intended to be used with
59 the re.sub() function.
61 entity = matchobj.group(1)
63 # Known non-numeric HTML entity
64 if entity in htmlentitydefs.name2codepoint:
65 return unichr(htmlentitydefs.name2codepoint[entity])
68 mobj = re.match(ur'(?u)#(x?\d+)', entity)
70 numstr = mobj.group(1)
71 if numstr.startswith(u'x'):
# Prefix with '0' so long(numstr, 16) accepts the '0x...' form.
73 numstr = u'0%s' % numstr
76 return unichr(long(numstr, base))
78 # Unknown entity in name, return its literal representation
79 return (u'&%s;' % entity)
81 def sanitize_title(utitle):
82 """Sanitizes a video title so it could be used as part of a filename."""
83 utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
84 return utitle.replace(unicode(os.sep), u'%')
# Opens `filename` for writing; on failure strips Windows-forbidden
# characters and retries once.  Returns (stream, definitive_file_name).
# NOTE(review): gap-sampled -- original lines 88, 95-97, 99 and 107
# (docstring close, `try:` scaffolding and presumably a `filename == '-'`
# stdout special-case alongside the msvcrt call) are not visible here.
86 def sanitize_open(filename, open_mode):
87 """Try to open the given filename, and slightly tweak it if this fails.
89 Attempts to open the given filename. If this fails, it tries to change
90 the filename slightly, step by step, until it's either able to open it
91 or it fails and raises a final exception, like the standard open()
94 It returns the tuple (stream, definitive_file_name).
98 if sys.platform == 'win32':
# Put stdout into binary mode on Windows so video bytes are not mangled.
100 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
101 return (sys.stdout, filename)
102 stream = open(filename, open_mode)
103 return (stream, filename)
104 except (IOError, OSError), err:
105 # In case of error, try to remove win32 forbidden chars
106 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
108 # An exception here should be caught in the caller
109 stream = open(filename, open_mode)
110 return (stream, filename)
# Exception hierarchy used throughout the downloader.  NOTE(review):
# gap-sampled -- the docstring closers and `pass` bodies of these classes
# are not visible in this chunk.
113 class DownloadError(Exception):
114 """Download Error exception.
116 This exception may be thrown by FileDownloader objects if they are not
117 configured to continue on errors. They will contain the appropriate
122 class SameFileError(Exception):
123 """Same File exception.
125 This exception will be thrown by FileDownloader objects if they detect
126 multiple files would have to be downloaded to the same file on disk.
130 class PostProcessingError(Exception):
131 """Post Processing exception.
133 This exception may be raised by PostProcessor's .run() method to
134 indicate an error in the postprocessing task.
138 class UnavailableVideoError(Exception):
139 """Unavailable Format exception.
141 This exception will be thrown when a video is requested
142 in a format that is not available for that video.
146 class ContentTooShortError(Exception):
147 """Content Too Short exception.
149 This exception may be raised by FileDownloader objects when a file they
150 download is too small for what the server announced first, indicating
151 the connection was probably interrupted.
# Stores both byte counts so callers can report expected vs. received.
157 def __init__(self, downloaded, expected):
158 self.downloaded = downloaded
159 self.expected = expected
# Central coordinator: InfoExtractors hand it video info dicts and it
# performs the actual download/printing.  NOTE(review): gap-sampled --
# several docstring/blank lines and part of __init__ (presumably
# `self.params = params` and the extractor/postprocessor list setup at
# original lines 220-221) are not visible in this chunk.
161 class FileDownloader(object):
162 """File Downloader class.
164 File downloader objects are the ones responsible of downloading the
165 actual video file and writing it to disk if the user has requested
166 it, among some other tasks. In most cases there should be one per
167 program. As, given a video URL, the downloader doesn't know how to
168 extract all the needed information, task that InfoExtractors do, it
169 has to pass the URL to one of them.
171 For this, file downloader objects have a method that allows
172 InfoExtractors to be registered in a given order. When it is passed
173 a URL, the file downloader handles it to the first InfoExtractor it
174 finds that reports being able to handle it. The InfoExtractor extracts
175 all the information about the video or videos the URL refers to, and
176 asks the FileDownloader to process the video information, possibly
177 downloading the video.
179 File downloaders accept a lot of parameters. In order not to saturate
180 the object constructor with arguments, it receives a dictionary of
181 options instead. These options are available through the params
182 attribute for the InfoExtractors to use. The FileDownloader also
183 registers itself as the downloader in charge for the InfoExtractors
184 that are added to it, so this is a "mutual registration".
188 username: Username for authentication purposes.
189 password: Password for authentication purposes.
190 usenetrc: Use netrc for authentication instead.
191 quiet: Do not print messages to stdout.
192 forceurl: Force printing final URL.
193 forcetitle: Force printing title.
194 forcethumbnail: Force printing thumbnail URL.
195 forcedescription: Force printing description.
196 simulate: Do not download the video files.
197 format: Video format code.
198 format_limit: Highest quality format to try.
199 outtmpl: Template for output names.
200 ignoreerrors: Do not stop on download errors.
201 ratelimit: Download speed limit, in bytes/sec.
202 nooverwrites: Prevent overwriting files.
203 retries: Number of times to retry for HTTP error 5xx
204 continuedl: Try to continue downloads if possible.
205 noprogress: Do not print the progress bar.
206 playliststart: Playlist item to start at.
207 playlistend: Playlist item to end at.
208 logtostderr: Log messages to stderr instead of stdout.
# Class-level defaults; real values are assigned per instance in __init__.
214 _download_retcode = None
215 _num_downloads = None
218 def __init__(self, params):
219 """Create a FileDownloader object with the given options."""
222 self._download_retcode = 0
223 self._num_downloads = 0
# Index 0/1 by the boolean 'logtostderr' option to pick the output stream.
224 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
# NOTE(review): gap-sampled -- the os.mkdir() call (original line 235) is
# not visible here.
228 def pmkdir(filename):
229 """Create directory components in filename. Similar to Unix "mkdir -p"."""
230 components = filename.split(os.sep)
# Build the list of cumulative path prefixes, shortest first.
231 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
232 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
233 for dir in aggregate:
234 if not os.path.exists(dir):
# Formats a byte count as a human-readable string ("1.23M" etc.).
# NOTE(review): gap-sampled -- original lines 239-245 (docstring and the
# small-value/str handling before the logarithm) are not visible here.
238 def format_bytes(bytes):
241 if type(bytes) is str:
# Exponent selects the suffix: 1024**1 -> 'k', 1024**2 -> 'M', ...
246 exponent = long(math.log(bytes, 1024.0))
247 suffix = 'bkMGTPEZY'[exponent]
248 converted = float(bytes) / float(1024**exponent)
249 return '%.2f%s' % (converted, suffix)
# Progress percentage, right-justified to 6 chars (e.g. ' 42.0%').
# NOTE(review): the data_len-is-None guard (original lines 253-254) is not
# visible in this gap-sampled chunk.
252 def calc_percent(byte_counter, data_len):
255 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
# Estimated time of arrival as 'MM:SS' from elapsed time and progress.
# NOTE(review): gap-sampled -- the `dif` computation (presumably
# `dif = now - start`) and the '--:--' fallbacks are not visible here.
258 def calc_eta(start, now, total, current):
262 if current == 0 or dif < 0.001: # One millisecond
264 rate = float(current) / dif
265 eta = long((float(total) - float(current)) / rate)
266 (eta_mins, eta_secs) = divmod(eta, 60)
269 return '%02d:%02d' % (eta_mins, eta_secs)
# Average download speed as a 10-char right-justified string.
# NOTE(review): the `dif` computation (presumably `dif = now - start`) is
# not visible in this gap-sampled chunk.
272 def calc_speed(start, now, bytes):
274 if bytes == 0 or dif < 0.001: # One millisecond
275 return '%10s' % '---b/s'
276 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
# Adapts the read block size to the observed rate, clamped to [1, 4 MB].
# NOTE(review): gap-sampled -- the return statements (original lines 283,
# 285-291) clamping `rate` between new_min/new_max are not visible here.
279 def best_block_size(elapsed_time, bytes):
280 new_min = max(bytes / 2.0, 1.0)
281 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
282 if elapsed_time < 0.001:
284 rate = bytes / elapsed_time
# NOTE(review): the matchobj-is-None error path (original lines 295-296) is
# not visible in this gap-sampled chunk.
292 def parse_bytes(bytestr):
293 """Parse a string indicating a byte quantity into a long integer."""
294 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
297 number = float(matchobj.group(1))
# Empty suffix matches index 0 ('b') -> multiplier 1024**0 == 1.
298 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
299 return long(round(number * multiplier))
# Mutual registration: the IE also learns about this downloader.
# NOTE(review): the list-append line (original 303, presumably
# `self._ies.append(ie)`) is not visible in this gap-sampled chunk.
301 def add_info_extractor(self, ie):
302 """Add an InfoExtractor object to the end of the list."""
304 ie.set_downloader(self)
# NOTE(review): the list-append line (original 308, presumably
# `self._pps.append(pp)`) is not visible in this gap-sampled chunk.
306 def add_post_processor(self, pp):
307 """Add a PostProcessor object to the end of the chain."""
309 pp.set_downloader(self)
# All console output funnels through here so 'quiet' and the stream choice
# (stdout vs stderr) are honored everywhere.  NOTE(review): gap-sampled --
# the `try:` line (original 313) and the re-raise tail (320-321) are not
# visible here.
311 def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
312 """Print message to stdout if not in quiet mode."""
314 if not self.params.get('quiet', False):
# Index the pair by the skip_eol boolean: True -> no trailing newline.
315 terminator = [u'\n', u''][skip_eol]
316 print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
317 self._screen_file.flush()
318 except (UnicodeEncodeError), err:
319 if not ignore_encoding_errors:
def to_stderr(self, message):
	"""Print message to stderr."""
	# Encode explicitly: the console may not accept raw unicode.
	encoded = message.encode(preferredencoding())
	print >>sys.stderr, encoded
326 def fixed_template(self):
327 """Checks if the output template is fixed."""
328 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
# Error policy switch: raise DownloadError unless 'ignoreerrors' is set, in
# which case just record a non-zero return code.  NOTE(review): the
# docstring closer (original line 336) is not visible in this gap-sampled
# chunk.
330 def trouble(self, message=None):
331 """Determine action to take when a download problem appears.
333 Depending on if the downloader has been configured to ignore
334 download errors or not, this method may throw an exception or
335 not when errors are found, after printing the message.
337 if message is not None:
338 self.to_stderr(message)
339 if not self.params.get('ignoreerrors', False):
340 raise DownloadError(message)
341 self._download_retcode = 1
# Crude rate limiter: sleeps long enough that the average speed drops back
# under 'ratelimit'.  NOTE(review): gap-sampled -- the early `return`,
# `now = time.time()` and the elapsed-time guard (original lines 347-348,
# 350-351) are not visible here.
343 def slow_down(self, start_time, byte_counter):
344 """Sleep if the download speed is over the rate limit."""
345 rate_limit = self.params.get('ratelimit', None)
346 if rate_limit is None or byte_counter == 0:
349 elapsed = now - start_time
352 speed = float(byte_counter) / elapsed
353 if speed > rate_limit:
# Sleep exactly the surplus time implied by the rate limit.
354 time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
def report_destination(self, filename):
	"""Report destination filename."""
	message = u'[download] Destination: %s' % filename
	# Filenames may contain characters the console encoding cannot show.
	self.to_screen(message, ignore_encoding_errors=True)
# Redraws the one-line progress bar (leading \r, no newline).
# NOTE(review): the body of the noprogress branch (original line 363,
# presumably `return`) is not visible in this gap-sampled chunk.
360 def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
361 """Report download progress."""
362 if self.params.get('noprogress', False):
364 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
365 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
def report_resuming_byte(self, resume_len):
	"""Report attempt to resume at given byte."""
	message = u'[download] Resuming download at byte %s' % resume_len
	self.to_screen(message)
def report_retry(self, count, retries):
	"""Report retry in case of HTTP error 5xx"""
	template = u'[download] Got server HTTP error. Retrying (attempt %d of %d)...'
	self.to_screen(template % (count, retries))
# NOTE(review): the `try:` line (original 377) is not visible in this
# gap-sampled chunk; the except below falls back to a message without the
# (possibly unencodable) filename.
375 def report_file_already_downloaded(self, file_name):
376 """Report file has already been fully downloaded."""
378 self.to_screen(u'[download] %s has already been downloaded' % file_name)
379 except (UnicodeEncodeError), err:
380 self.to_screen(u'[download] The file has already been downloaded')
def report_unable_to_resume(self):
	"""Report it was impossible to resume download."""
	message = u'[download] Unable to resume'
	self.to_screen(message)
# NOTE(review): the else branch (original lines 390-391, presumably
# printing a bare newline to finish the progress line) is not visible in
# this gap-sampled chunk.
386 def report_finish(self):
387 """Report download finished."""
388 if self.params.get('noprogress', False):
389 self.to_screen(u'[download] Download completed')
def increment_downloads(self):
	"""Increment the ordinal that assigns a number to each file."""
	# Feeds the %(autonumber)s output-template field.
	self._num_downloads = self._num_downloads + 1
# Core sink for extractor results: honors the force* print options,
# expands the output template, creates directories and downloads.
# NOTE(review): heavily gap-sampled -- the simulate-mode `return`, the
# try/return scaffolding around template expansion, the nooverwrites
# return and several `return` lines between the except clauses are not
# visible in this chunk.
397 def process_info(self, info_dict):
398 """Process a single dictionary returned by an InfoExtractor."""
399 # Do nothing else if in simulate mode
400 if self.params.get('simulate', False):
401 if self.params.get('forcetitle', False):
403 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
404 if self.params.get('forceurl', False):
405 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
406 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
407 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
408 if self.params.get('forcedescription', False) and 'description' in info_dict:
409 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
# Template expansion: add synthetic fields, then apply the outtmpl.
414 template_dict = dict(info_dict)
415 template_dict['epoch'] = unicode(long(time.time()))
416 template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
417 filename = self.params['outtmpl'] % template_dict
418 except (ValueError, KeyError), err:
419 self.trouble(u'ERROR: invalid system charset or erroneous output template')
421 if self.params.get('nooverwrites', False) and os.path.exists(filename):
422 self.to_stderr(u'WARNING: file exists and will be skipped')
426 self.pmkdir(filename)
427 except (OSError, IOError), err:
428 self.trouble(u'ERROR: unable to create directories: %s' % str(err))
432 success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
433 except (OSError, IOError), err:
# OS-level failures while writing are treated as an unavailable video.
434 raise UnavailableVideoError
435 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
436 self.trouble(u'ERROR: unable to download video data: %s' % str(err))
438 except (ContentTooShortError, ), err:
439 self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
444 self.post_process(filename, info_dict)
445 except (PostProcessingError), err:
446 self.trouble(u'ERROR: postprocessing: %s' % str(err))
# Entry point: routes each URL to the first registered InfoExtractor that
# claims it.  NOTE(review): gap-sampled -- the outer `for url in url_list:`
# / `for ie in self._ies:` loops, `continue`/`break` lines and the
# ie.extract(url) call are not visible in this chunk.
449 def download(self, url_list):
450 """Download a given list of URLs."""
451 if len(url_list) > 1 and self.fixed_template():
# A fixed template would make every URL overwrite the same file.
452 raise SameFileError(self.params['outtmpl'])
455 suitable_found = False
457 # Go to next InfoExtractor if not suitable
458 if not ie.suitable(url):
461 # Suitable InfoExtractor found
462 suitable_found = True
464 # Extract information from URL and process it
467 # Suitable InfoExtractor had been found; go to next URL
470 if not suitable_found:
471 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
473 return self._download_retcode
# NOTE(review): gap-sampled -- the `info = dict(ie_info)` copy and the loop
# over self._pps calling pp.run(info) (original lines 477, 479-482) are not
# visible in this chunk.
475 def post_process(self, filename, ie_info):
476 """Run the postprocessing chain on the given file."""
478 info['filepath'] = filename
# Delegates rtmp:// downloads to the external rtmpdump binary, retrying on
# its resume-possible exit codes.  NOTE(review): gap-sampled -- `try:` and
# `return` scaffolding and the final success/failure returns are not
# visible in this chunk.
484 def _download_with_rtmpdump(self, filename, url, player_url):
485 self.report_destination(filename)
487 # Check for rtmpdump first
489 subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
490 except (OSError, IOError):
491 self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
494 # Download using rtmpdump. rtmpdump returns exit code 2 when
495 # the connection was interrumpted and resuming appears to be
496 # possible. This is part of rtmpdump's normal usage, AFAIK.
# Index [..][bool] idiom: append the option list only when applicable.
497 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', filename]
498 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
499 while retval == 2 or retval == 1:
500 prevsize = os.path.getsize(filename)
501 self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
502 time.sleep(5.0) # This seems to be needed
503 retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
504 cursize = os.path.getsize(filename)
# No progress since last attempt and exit code 1: give up the retry loop.
505 if prevsize == cursize and retval == 1:
508 self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(filename))
511 self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
# HTTP download loop with resume support, retries on 5xx, adaptive block
# size and rate limiting.  NOTE(review): heavily gap-sampled -- the
# try/except scaffolding, resume_len/count initialization, the
# open-mode selection, stream close/rename and several returns are not
# visible in this chunk.
514 def _do_download(self, filename, url, player_url):
515 # Attempt to download using rtmpdump
516 if url.startswith('rtmp'):
517 return self._download_with_rtmpdump(filename, url, player_url)
# Two request objects: `request` may get a Range header, `basic_request`
# stays clean for the fallback probe below.
521 basic_request = urllib2.Request(url, None, std_headers)
522 request = urllib2.Request(url, None, std_headers)
524 # Establish possible resume length
525 if os.path.isfile(filename):
526 resume_len = os.path.getsize(filename)
530 # Request parameters in case of being able to resume
531 if self.params.get('continuedl', False) and resume_len != 0:
532 self.report_resuming_byte(resume_len)
533 request.add_header('Range','bytes=%d-' % resume_len)
537 retries = self.params.get('retries', 0)
538 while count <= retries:
539 # Establish connection
541 data = urllib2.urlopen(request)
543 except (urllib2.HTTPError, ), err:
544 if (err.code < 500 or err.code >= 600) and err.code != 416:
545 # Unexpected HTTP error
547 elif err.code == 416:
548 # Unable to resume (requested range not satisfiable)
550 # Open the connection again without the range header
551 data = urllib2.urlopen(basic_request)
552 content_length = data.info()['Content-Length']
553 except (urllib2.HTTPError, ), err:
554 if err.code < 500 or err.code >= 600:
557 # Examine the reported length
558 if (content_length is not None and
559 (resume_len - 100 < long(content_length) < resume_len + 100)):
560 # The file had already been fully downloaded.
561 # Explanation to the above condition: in issue #175 it was revealed that
562 # YouTube sometimes adds or removes a few bytes from the end of the file,
563 # changing the file size slightly and causing problems for some users. So
564 # I decided to implement a suggested change and consider the file
565 # completely downloaded if the file size differs less than 100 bytes from
566 # the one in the hard drive.
567 self.report_file_already_downloaded(filename)
570 # The length does not match, we start the download over
571 self.report_unable_to_resume()
577 self.report_retry(count, retries)
580 self.trouble(u'ERROR: giving up after %s retries' % retries)
583 data_len = data.info().get('Content-length', None)
584 data_len_str = self.format_bytes(data_len)
591 data_block = data.read(block_size)
593 data_block_len = len(data_block)
594 if data_block_len == 0:
596 byte_counter += data_block_len
598 # Open file just in time
# Lazy open avoids creating an empty file when the connection fails early.
601 (stream, filename) = sanitize_open(filename, open_mode)
602 self.report_destination(filename)
603 except (OSError, IOError), err:
604 self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
607 stream.write(data_block)
608 except (IOError, OSError), err:
609 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
611 block_size = self.best_block_size(after - before, data_block_len)
614 percent_str = self.calc_percent(byte_counter, data_len)
615 eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
616 speed_str = self.calc_speed(start, time.time(), byte_counter)
617 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
620 self.slow_down(start, byte_counter)
# data_len is the raw header string here, hence the str() comparison.
623 if data_len is not None and str(byte_counter) != data_len:
624 raise ContentTooShortError(byte_counter, long(data_len))
# Abstract base class for all site-specific extractors.  NOTE(review):
# gap-sampled -- parts of the docstring and the trailing lines of the
# class body are not visible in this chunk.
627 class InfoExtractor(object):
628 """Information Extractor class.
630 Information extractors are the classes that, given a URL, extract
631 information from the video (or videos) the URL refers to. This
632 information includes the real video URL, the video title and simplified
633 title, author and others. The information is stored in a dictionary
634 which is then passed to the FileDownloader. The FileDownloader
635 processes this information possibly downloading the video to the file
636 system, among other possible outcomes. The dictionaries must include
637 the following fields:
639 id: Video identifier.
640 url: Final video URL.
641 uploader: Nickname of the video uploader.
642 title: Literal title.
643 stitle: Simplified title.
644 ext: Video filename extension.
645 format: Video format.
646 player_url: SWF Player URL (may be None).
648 The following fields are optional. Their primary purpose is to allow
649 youtube-dl to serve as the backend for a video search function, such
650 as the one in youtube2mp3. They are only used when their respective
651 forced printing functions are called:
653 thumbnail: Full URL to a video thumbnail image.
654 description: One-line video description.
656 Subclasses of this one should re-define the _real_initialize() and
657 _real_extract() methods, as well as the suitable() static method.
658 Probably, they should also be instantiated and added to the main
# NOTE(review): the `self._ready = False` initialization (original line
# 667) is not visible in this gap-sampled chunk.
665 def __init__(self, downloader=None):
666 """Constructor. Receives an optional downloader."""
668 self.set_downloader(downloader)
# Template-method interface of the extractor.  NOTE(review): gap-sampled --
# the suitable() def line, the `if not self._ready:` lazy-init guard and
# the initialize() call inside extract() are not visible in this chunk.
672 """Receives a URL and returns True if suitable for this IE."""
675 def initialize(self):
676 """Initializes an instance (authentication, etc)."""
678 self._real_initialize()
681 def extract(self, url):
682 """Extracts URL information and returns it in list of dicts."""
684 return self._real_extract(url)
def set_downloader(self, downloader):
	"""Attach the FileDownloader this extractor reports through (may be None)."""
	self._downloader = downloader
# Subclass hooks.  NOTE(review): the `pass` bodies (original lines 692,
# 696) are not visible in this gap-sampled chunk.
690 def _real_initialize(self):
691 """Real initialization process. Redefine in subclasses."""
694 def _real_extract(self, url):
695 """Real extraction process. Redefine in subclasses."""
# YouTube extractor: URL patterns, auth endpoints and format tables.
# NOTE(review): gap-sampled -- most of the _video_extensions dict
# (original lines 709-713, 715-716) and the suitable() def lines
# (719-720) are not visible in this chunk.
698 class YoutubeIE(InfoExtractor):
699 """Information extractor for youtube.com."""
701 _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/(?:(?:v/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))))?([0-9A-Za-z_-]+)(?(1).+)?$'
702 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
703 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
704 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
705 _NETRC_MACHINE = 'youtube'
706 # Listed in order of quality
707 _available_formats = ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13']
708 _video_extensions = {
714 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
721 return (re.match(YoutubeIE._VALID_URL, url) is not None)
def report_lang(self):
	"""Announce that the interface language is being forced."""
	self._downloader.to_screen(u'[youtube] Setting language')

def report_login(self):
	"""Announce the login attempt."""
	self._downloader.to_screen(u'[youtube] Logging in')

def report_age_confirmation(self):
	"""Announce the age-confirmation attempt."""
	self._downloader.to_screen(u'[youtube] Confirming age')

def report_video_webpage_download(self, video_id):
	"""Announce the watch-page download for video_id."""
	message = u'[youtube] %s: Downloading video webpage' % video_id
	self._downloader.to_screen(message)

def report_video_info_webpage_download(self, video_id):
	"""Announce the get_video_info download for video_id."""
	message = u'[youtube] %s: Downloading video info webpage' % video_id
	self._downloader.to_screen(message)

def report_information_extraction(self, video_id):
	"""Announce the information-extraction step for video_id."""
	message = u'[youtube] %s: Extracting video information' % video_id
	self._downloader.to_screen(message)

def report_unavailable_format(self, video_id, format):
	"""Announce that a requested format is not offered for video_id."""
	message = u'[youtube] %s: Format %s not available' % (video_id, format)
	self._downloader.to_screen(message)

def report_rtmp_download(self):
	"""Announce that the download will use the RTMP protocol."""
	self._downloader.to_screen(u'[youtube] RTMP download detected')
# One-time setup: obtains credentials (flags or .netrc), forces the English
# UI, logs in and confirms age.  NOTE(review): heavily gap-sampled --
# username/password initialization, try/return scaffolding, the netrc
# tuple unpacking and the login/age form openers are not visible in this
# chunk.
755 def _real_initialize(self):
756 if self._downloader is None:
761 downloader_params = self._downloader.params
763 # Attempt to use provided username and password or .netrc data
764 if downloader_params.get('username', None) is not None:
765 username = downloader_params['username']
766 password = downloader_params['password']
767 elif downloader_params.get('usenetrc', False):
769 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
774 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
775 except (IOError, netrc.NetrcParseError), err:
776 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
# Force the English interface so later regexes match reliably.
780 request = urllib2.Request(self._LANG_URL, None, std_headers)
783 urllib2.urlopen(request).read()
784 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
785 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
788 # No authentication to be performed
794 'current_form': 'loginForm',
796 'action_login': 'Log In',
797 'username': username,
798 'password': password,
800 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
803 login_results = urllib2.urlopen(request).read()
# If the login form is still present, credentials were rejected.
804 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
805 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
807 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
808 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
814 'action_confirm': 'Confirm',
816 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
818 self.report_age_confirmation()
819 age_results = urllib2.urlopen(request).read()
820 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
821 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
# Full extraction pipeline: id -> watch page -> get_video_info -> format
# selection -> process_info().  NOTE(review): heavily gap-sampled -- the
# `return` lines after each trouble() call, `try:` scaffolding, mobj-None
# guards and several branch bodies are not visible in this chunk.
824 def _real_extract(self, url):
825 # Extract video id from URL
826 mobj = re.match(self._VALID_URL, url)
828 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
830 video_id = mobj.group(2)
833 self.report_video_webpage_download(video_id)
834 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en' % video_id, None, std_headers)
836 video_webpage = urllib2.urlopen(request).read()
837 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
838 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
841 # Attempt to extract SWF player URL
842 mobj = re.search(r'swfConfig.*"(http://.*?watch.*?-.*?\.swf)"', video_webpage)
844 player_url = mobj.group(1)
849 self.report_video_info_webpage_download(video_id)
# Try several 'el' variants; some work for embeds/age-restricted videos.
850 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
851 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
852 % (video_id, el_type))
853 request = urllib2.Request(video_info_url, None, std_headers)
855 video_info_webpage = urllib2.urlopen(request).read()
856 video_info = parse_qs(video_info_webpage)
857 if 'token' in video_info:
859 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
860 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
862 if 'token' not in video_info:
863 if 'reason' in video_info:
864 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
866 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
869 # Start extracting information
870 self.report_information_extraction(video_id)
873 if 'author' not in video_info:
874 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
876 video_uploader = urllib.unquote_plus(video_info['author'][0])
879 if 'title' not in video_info:
880 self._downloader.trouble(u'ERROR: unable to extract video title')
882 video_title = urllib.unquote_plus(video_info['title'][0])
883 video_title = video_title.decode('utf-8')
884 video_title = sanitize_title(video_title)
# Collapse runs of non-"simple" characters into single underscores.
887 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
888 simple_title = simple_title.strip(ur'_')
891 if 'thumbnail_url' not in video_info:
892 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
894 else: # don't panic if we can't find it
895 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
898 video_description = 'No description available.'
899 if self._downloader.params.get('forcedescription', False):
900 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
902 video_description = mobj.group(1)
905 video_token = urllib.unquote_plus(video_info['token'][0])
907 # Decide which formats to download
908 requested_format = self._downloader.params.get('format', None)
909 get_video_template = 'http://www.youtube.com/get_video?video_id=%s&t=%s&eurl=&el=&ps=&asv=&fmt=%%s' % (video_id, video_token)
911 if 'fmt_url_map' in video_info:
# fmt_url_map is 'fmt|url,fmt|url,...' -> {format_code: url}.
912 url_map = dict(tuple(pair.split('|')) for pair in video_info['fmt_url_map'][0].split(','))
913 format_limit = self._downloader.params.get('format_limit', None)
914 if format_limit is not None and format_limit in self._available_formats:
915 format_list = self._available_formats[self._available_formats.index(format_limit):]
917 format_list = self._available_formats
918 existing_formats = [x for x in format_list if x in url_map]
919 if len(existing_formats) == 0:
920 self._downloader.trouble(u'ERROR: no known formats available for video')
922 if requested_format is None:
923 video_url_list = [(existing_formats[0], get_video_template % existing_formats[0])] # Best quality
924 elif requested_format == '-1':
925 video_url_list = [(f, get_video_template % f) for f in existing_formats] # All formats
927 video_url_list = [(requested_format, get_video_template % requested_format)] # Specific format
929 elif 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
930 self.report_rtmp_download()
931 video_url_list = [(None, video_info['conn'][0])]
934 self._downloader.trouble(u'ERROR: no fmt_url_map or conn information found in video info')
937 for format_param, video_real_url in video_url_list:
938 # At this point we have a new video
939 self._downloader.increment_downloads()
942 video_extension = self._video_extensions.get(format_param, 'flv')
944 # Find the video URL in fmt_url_map or conn paramters
946 # Process video information
947 self._downloader.process_info({
948 'id': video_id.decode('utf-8'),
949 'url': video_real_url.decode('utf-8'),
950 'uploader': video_uploader.decode('utf-8'),
951 'title': video_title,
952 'stitle': simple_title,
953 'ext': video_extension.decode('utf-8'),
954 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
955 'thumbnail': video_thumbnail.decode('utf-8'),
956 'description': video_description.decode('utf-8'),
957 'player_url': player_url,
959 except UnavailableVideoError, err:
960 self._downloader.trouble(u'ERROR: unable to download video (format may not be available)')
# Metacafe extractor: URL pattern and family-filter endpoints.
963 class MetacafeIE(InfoExtractor):
964 """Information Extractor for metacafe.com."""
966 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
967 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
968 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
def __init__(self, youtube_ie, downloader=None):
	"""Keep a YoutubeIE around so yt-* video ids can be delegated to it."""
	InfoExtractor.__init__(self, downloader)
	self._youtube_ie = youtube_ie
# NOTE(review): the @staticmethod/def suitable(url) lines (original
# 975-976) are not visible in this gap-sampled chunk; this is its body.
977 return (re.match(MetacafeIE._VALID_URL, url) is not None)
def report_disclaimer(self):
	"""Announce the disclaimer retrieval."""
	self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

def report_age_confirmation(self):
	"""Announce the age-confirmation attempt."""
	self._downloader.to_screen(u'[metacafe] Confirming age')

def report_download_webpage(self, video_id):
	"""Announce the watch-page download for video_id."""
	message = u'[metacafe] %s: Downloading webpage' % video_id
	self._downloader.to_screen(message)

def report_extraction(self, video_id):
	"""Announce the information-extraction step for video_id."""
	message = u'[metacafe] %s: Extracting information' % video_id
	self._downloader.to_screen(message)
# Fetches the disclaimer page, then POSTs the family-filter form to
# disable it.  NOTE(review): gap-sampled -- try/return scaffolding and the
# disclaimer_form dict opening (with the filters field) are not visible in
# this chunk.
995 def _real_initialize(self):
996 # Retrieve disclaimer
997 request = urllib2.Request(self._DISCLAIMER, None, std_headers)
999 self.report_disclaimer()
1000 disclaimer = urllib2.urlopen(request).read()
1001 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1002 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1008 'submit': "Continue - I'm over 18",
1010 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
1012 self.report_age_confirmation()
1013 disclaimer = urllib2.urlopen(request).read()
1014 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1015 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1018 def _real_extract(self, url):
1019 # Extract id and simplified title from URL
1020 mobj = re.match(self._VALID_URL, url)
1022 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1025 video_id = mobj.group(1)
1027 # Check if video comes from YouTube
1028 mobj2 = re.match(r'^yt-(.*)$', video_id)
1029 if mobj2 is not None:
1030 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1033 # At this point we have a new video
1034 self._downloader.increment_downloads()
1036 simple_title = mobj.group(2).decode('utf-8')
1038 # Retrieve video webpage to extract further information
1039 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1041 self.report_download_webpage(video_id)
1042 webpage = urllib2.urlopen(request).read()
1043 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1044 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1047 # Extract URL, uploader and title from webpage
1048 self.report_extraction(video_id)
1049 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1050 if mobj is not None:
1051 mediaURL = urllib.unquote(mobj.group(1))
1052 video_extension = mediaURL[-3:]
1054 # Extract gdaKey if available
1055 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1057 video_url = mediaURL
1059 gdaKey = mobj.group(1)
1060 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1062 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1064 self._downloader.trouble(u'ERROR: unable to extract media URL')
1066 vardict = parse_qs(mobj.group(1))
1067 if 'mediaData' not in vardict:
1068 self._downloader.trouble(u'ERROR: unable to extract media URL')
1070 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1072 self._downloader.trouble(u'ERROR: unable to extract media URL')
1074 mediaURL = mobj.group(1).replace('\\/', '/')
1075 video_extension = mediaURL[-3:]
1076 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1078 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1080 self._downloader.trouble(u'ERROR: unable to extract title')
1082 video_title = mobj.group(1).decode('utf-8')
1083 video_title = sanitize_title(video_title)
1085 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1087 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1089 video_uploader = mobj.group(1)
1092 # Process video information
1093 self._downloader.process_info({
1094 'id': video_id.decode('utf-8'),
1095 'url': video_url.decode('utf-8'),
1096 'uploader': video_uploader.decode('utf-8'),
1097 'title': video_title,
1098 'stitle': simple_title,
1099 'ext': video_extension.decode('utf-8'),
1103 except UnavailableVideoError:
1104 self._downloader.trouble(u'ERROR: unable to download video')
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion.

    Matches http(s)://www.dailymotion.<tld>/video/<id>_<title> URLs; the
    id and the URL-embedded title are taken straight from the match groups.
    """

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

        # NOTE(review): the `def suitable(url):` header for this return is
        # elided from this excerpt.
        return (re.match(DailymotionIE._VALID_URL, url) is not None)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Initialization hook; no visible body in this excerpt."""

    def _real_extract(self, url):
        """Extract the flv media URL, title and uploader from a Dailymotion page."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `if mobj is None:` guard and `return` are elided here.
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # At this point we have a new video
        self._downloader.increment_downloads()
        video_id = mobj.group(1)

        # Second match group is the URL-embedded, filesystem-safe title.
        simple_title = mobj.group(2).decode('utf-8')
        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            # NOTE(review): message reads "unable retrieve" — missing "to".
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        # Media URL is passed to the Flash player via addVariable("video", ...).
        mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = urllib.unquote(mobj.group(1))

        # if needed add http://www.dailymotion.com/ if relative URL

        video_url = mediaURL

        # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
        mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)

        mobj = re.search(r'(?im)<div class="dmco_html owner">.*?<a class="name" href="/.+?">(.+?)</a>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # Process video information
        self._downloader.process_info({
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension.decode('utf-8'),
        except UnavailableVideoError:
            self._downloader.trouble(u'ERROR: unable to download video')
class GoogleIE(InfoExtractor):
    """Information extractor for video.google.com.

    Prefers the mp4 download_url embedded in the watch page; falls back to
    the escaped flv videoUrl otherwise. The thumbnail is only fetched (via
    an extra search-page request) when the 'forcethumbnail' option is set.
    """

    _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

        # NOTE(review): the `def suitable(url):` header for this return is
        # elided from this excerpt.
        return (re.match(GoogleIE._VALID_URL, url) is not None)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Initialization hook; no visible body in this excerpt."""

    def _real_extract(self, url):
        """Extract media URL, title, description and (optionally) thumbnail."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `if mobj is None:` guard and `return` are elided here.
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        # At this point we have a new video
        self._downloader.increment_downloads()
        video_id = mobj.group(1)

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r"download_url:'([^']+)'", webpage)
        # Fallback path: no direct mp4 download_url, use the escaped flv URL.
        video_extension = 'flv'
        mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = urllib.unquote(mobj.group(1))
        # Undo the page's JavaScript hex escaping of '=' and '&'.
        mediaURL = mediaURL.replace('\\x3d', '\x3d')
        mediaURL = mediaURL.replace('\\x26', '\x26')

        video_url = mediaURL

        mobj = re.search(r'<title>(.*)</title>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)
        # Collapse characters outside the safe set into underscores.
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

        # Extract video description
        mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video thumbnail
        if self._downloader.params.get('forcethumbnail', False):
            # The thumbnail only appears on the search results page;
            # abs(int(...)) because Google Video ids can be negative.
            request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
            webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            video_thumbnail = mobj.group(1)
        else: # we need something to pass to process_info
            video_thumbnail = ''

        # Process video information
        self._downloader.process_info({
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension.decode('utf-8'),
        except UnavailableVideoError:
            self._downloader.trouble(u'ERROR: unable to download video')
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com.

    Matches flv URLs passed through the 'current=' query parameter;
    the matched .flv path itself serves as the video id.
    """

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

        # NOTE(review): the `def suitable(url):` header for this return is
        # elided from this excerpt.
        return (re.match(PhotobucketIE._VALID_URL, url) is not None)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Initialization hook; no visible body in this excerpt."""

    def _real_extract(self, url):
        """Extract media URL, title and uploader from a Photobucket page."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `if mobj is None:` guard and `return` are elided here.
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        # At this point we have a new video
        self._downloader.increment_downloads()
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = urllib.unquote(mobj.group(1))

        video_url = mediaURL

        # One regex captures both the title (group 1) and uploader (group 2).
        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

        video_uploader = mobj.group(2).decode('utf-8')

        # Process video information
        self._downloader.process_info({
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader,
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension.decode('utf-8'),
        except UnavailableVideoError:
            self._downloader.trouble(u'ERROR: unable to download video')
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com.

    Non-/watch/ URLs that still match _VALID_URL are first rewritten to the
    canonical English /watch/<id>/<vid> form by scraping the id/vid fields
    and re-invoking _real_extract on the rewritten URL.
    """

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

        # NOTE(review): the `def suitable(url):` header for this return is
        # elided from this excerpt.
        return (re.match(YahooIE._VALID_URL, url) is not None)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Initialization hook; no visible body in this excerpt."""

    def _real_extract(self, url, new_video=True):
        """Extract video info; may recurse once with a rewritten /watch/ URL."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `if mobj is None:` guard and `return` are elided here.
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        # At this point we have a new video
        self._downloader.increment_downloads()
        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = urllib2.Request(url)
            webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            self._downloader.trouble(u'ERROR: Unable to extract id field')
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            self._downloader.trouble(u'ERROR: Unable to extract vid field')
            yahoo_vid = mobj.group(1)

            # Recurse exactly once on the canonical URL form.
            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video uploader')
        # NOTE(review): group(1) is the (people|profile) alternation, so this
        # yields the literal 'people'/'profile' — the display name is in
        # group(2). Looks like a bug; verify intended behavior.
        video_uploader = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description: video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video height')
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video width')
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
        yv_bitrate = '700' # according to Wikipedia this is hard-coded
        request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
            '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
            '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        self._downloader.trouble(u'ERROR: Unable to extract media URL')
        video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        # Decode HTML entities embedded in the playlist URL.
        video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)

        # Process video information
        self._downloader.process_info({
            'id': video_id.decode('utf-8'),
            'uploader': video_uploader,
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension.decode('utf-8'),
            'thumbnail': video_thumbnail.decode('utf-8'),
            'description': video_description,
            # NOTE(review): 'thumbnail' and 'description' are duplicated;
            # these later entries silently override the decoded ones above.
            'thumbnail': video_thumbnail,
            'description': video_description,
        except UnavailableVideoError:
            self._downloader.trouble(u'ERROR: unable to download video')
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor.

    Looks for a direct http media URL in the page (JW Player flashvars
    first, then any file=/source= parameter). The uploader is reported as
    the URL's domain name.
    """

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        # Warn loudly: this extractor only runs when nothing else matched.
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Initialization hook; no visible body in this excerpt."""

    def _real_extract(self, url):
        """Best-effort extraction of a direct media URL from an arbitrary page."""
        # At this point we have a new video
        self._downloader.increment_downloads()

        # Provisional id for progress messages; replaced once a media URL is found.
        video_id = url.split('/')[-1]
        request = urllib2.Request(url)
        self.report_download_webpage(video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
        except ValueError, err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        # Broaden the search a little bit
        mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_url = urllib.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        # NOTE(review): error message says "title" but this path is the
        # uploader (domain) extraction — misleading message.
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_uploader = mobj.group(1).decode('utf-8')

        # Process video information
        self._downloader.process_info({
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader,
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension.decode('utf-8'),
        except UnavailableVideoError, err:
            self._downloader.trouble(u'ERROR: unable to download video')
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries.

    Accepts "ytsearch:Q", "ytsearchN:Q" and "ytsearchall:Q" queries and
    delegates each found video to the YoutubeIE passed to __init__.
    """
    _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
    _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
    # Hard upper bound on results; YouTube will not return more anyway.
    _max_youtube_results = 1000

    def __init__(self, youtube_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        self._youtube_ie = youtube_ie

        # NOTE(review): the `def suitable(url):` header for this return is
        # elided from this excerpt.
        return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_initialize(self):
        self._youtube_ie.initialize()

    def _real_extract(self, query):
        """Parse the ytsearch prefix and dispatch to _download_n_results."""
        mobj = re.match(self._VALID_QUERY, query)
        # NOTE(review): the `if mobj is None:` guard and `return` are elided here.
        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        prefix, query = query.split(':')
        query = query.encode('utf-8')
        # Bare "ytsearch:" means a single result.
        self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
        # NOTE(review): the `n = long(prefix)` parse and `if n <= 0:` guard
        # for the numeric branch are elided here.
        self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
        elif n > self._max_youtube_results:
            self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
            n = self._max_youtube_results
            self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        already_seen = set()

        # NOTE(review): the pagination loop header is elided here.
        self.report_download_page(query, pagenum)
        result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
        request = urllib2.Request(result_url, None, std_headers)
        page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            # Matched text is href="/watch?v=ID": split on '=' and drop the
            # trailing double quote to recover the bare id.
            video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
            if video_id not in already_seen:
                video_ids.append(video_id)
                already_seen.add(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for id in video_ids:
                        self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)

        # No "Next" link: this was the last results page; flush what we have.
        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
            for id in video_ids:
                self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)

        pagenum = pagenum + 1
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries.

    Accepts "gvsearch:Q", "gvsearchN:Q" and "gvsearchall:Q" queries and
    delegates each found video to the GoogleIE passed to __init__.
    """
    _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
    _MORE_PAGES_INDICATOR = r'<span>Next</span>'
    # Hard upper bound on requested results.
    _max_google_results = 1000

    def __init__(self, google_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        self._google_ie = google_ie

        # NOTE(review): the `def suitable(url):` header for this return is
        # elided from this excerpt.
        return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_initialize(self):
        self._google_ie.initialize()

    def _real_extract(self, query):
        """Parse the gvsearch prefix and dispatch to _download_n_results."""
        mobj = re.match(self._VALID_QUERY, query)
        # NOTE(review): the `if mobj is None:` guard and `return` are elided here.
        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        prefix, query = query.split(':')
        query = query.encode('utf-8')
        # Bare "gvsearch:" means a single result.
        self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
        # NOTE(review): the numeric-prefix parse and `if n <= 0:` guard are
        # elided here.
        self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
        elif n > self._max_google_results:
            self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
            n = self._max_google_results
            self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        already_seen = set()

        # NOTE(review): the pagination loop header is elided here.
        self.report_download_page(query, pagenum)
        result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
        request = urllib2.Request(result_url, None, std_headers)
        page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            video_id = mobj.group(1)
            if video_id not in already_seen:
                video_ids.append(video_id)
                already_seen.add(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for id in video_ids:
                        self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)

        # No "Next" link: last results page; flush what we have.
        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
            for id in video_ids:
                self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)

        pagenum = pagenum + 1
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries.

    Accepts "yvsearch:Q", "yvsearchN:Q" and "yvsearchall:Q" queries and
    delegates each found video to the YahooIE passed to __init__.
    """
    _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    # Hard upper bound on requested results.
    _max_yahoo_results = 1000

    def __init__(self, yahoo_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        self._yahoo_ie = yahoo_ie

        # NOTE(review): the `def suitable(url):` header for this return is
        # elided from this excerpt.
        return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_initialize(self):
        self._yahoo_ie.initialize()

    def _real_extract(self, query):
        """Parse the yvsearch prefix and dispatch to _download_n_results."""
        mobj = re.match(self._VALID_QUERY, query)
        # NOTE(review): the `if mobj is None:` guard and `return` are elided here.
        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        prefix, query = query.split(':')
        query = query.encode('utf-8')
        # Bare "yvsearch:" means a single result.
        self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
        # NOTE(review): the numeric-prefix parse and `if n <= 0:` guard are
        # elided here.
        self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
        elif n > self._max_yahoo_results:
            self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
            n = self._max_yahoo_results
            self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        already_seen = set()

        # NOTE(review): the pagination loop header is elided here.
        self.report_download_page(query, pagenum)
        result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
        request = urllib2.Request(result_url, None, std_headers)
        page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            video_id = mobj.group(1)
            if video_id not in already_seen:
                video_ids.append(video_id)
                already_seen.add(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for id in video_ids:
                        self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)

        # No "Next" link: last results page; flush what we have.
        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
            for id in video_ids:
                self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)

        pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists.

    Walks the paginated view_play_list pages, collects all video ids, then
    delegates each one to the YoutubeIE passed to __init__.
    """

    _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists)\?.*?p=|user/.*?/user/)([^&]+).*'
    _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
    _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'

    def __init__(self, youtube_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        self._youtube_ie = youtube_ie

        # NOTE(review): the `def suitable(url):` header for this return is
        # elided from this excerpt.
        return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_initialize(self):
        self._youtube_ie.initialize()

    def _real_extract(self, url):
        """Collect every video id in the playlist and extract each video."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `if mobj is None:` guard and `return` are elided here.
        self._downloader.trouble(u'ERROR: invalid url: %s' % url)

        # Download playlist pages
        playlist_id = mobj.group(1)

        # NOTE(review): the pagination loop header is elided here.
        self.report_download_page(playlist_id, pagenum)
        request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
        page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            # Keep page order; skip duplicates within the page.
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        video_ids.extend(ids_in_page)

        # No "Next" link means this was the last page.
        if re.search(self._MORE_PAGES_INDICATOR, page) is None:

        pagenum = pagenum + 1

        # playliststart is 1-based on the command line; convert to 0-based.
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        # NOTE(review): with the default playlistend == -1 this slice
        # silently drops the final playlist entry; verify the intended
        # semantics of "no end".
        video_ids = video_ids[playliststart:playlistend]

        for id in video_ids:
            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users.

    Fetches the user's GData feed, collects the video ids it references,
    then delegates each one to the YoutubeIE passed to __init__.
    """

    _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this.

    def __init__(self, youtube_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        self._youtube_ie = youtube_ie

        # NOTE(review): the `def suitable(url):` header for this return is
        # elided from this excerpt.
        return (re.match(YoutubeUserIE._VALID_URL, url) is not None)

    def report_download_page(self, username):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading page ' % (username))

    def _real_initialize(self):
        self._youtube_ie.initialize()

    def _real_extract(self, url):
        """Collect the user's video ids from the GData feed and extract each."""
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `if mobj is None:` guard and `return` are elided here.
        self._downloader.trouble(u'ERROR: invalid url: %s' % url)

        # Download user page
        username = mobj.group(1)

        self.report_download_page(username)
        request = urllib2.Request(self._TEMPLATE_URL % (username), None, std_headers)
        page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            # Keep feed order; skip duplicates.
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        video_ids.extend(ids_in_page)

        # playliststart is 1-based on the command line; convert to 0-based.
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        # NOTE(review): with the default playlistend == -1 this slice
        # silently drops the final video; verify the intended semantics.
        video_ids = video_ids[playliststart:playlistend]

        for id in video_ids:
            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
class PostProcessor(object):
	"""Base class for post-download processing steps.

	A PostProcessor is attached to a downloader through the downloader's
	add_post_processor() method ("mutual registration", the same scheme
	InfoExtractor objects use).  After every successful download the
	downloader walks its chain of PostProcessors, calling run() on each
	one — first with an initial argument, then with whatever the previous
	processor returned.  The chain stops as soon as a processor returns
	None, or when the end of the chain is reached.
	"""

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Sets the downloader for this PP."""
		self._downloader = downloader

	def run(self, information):
		"""Run the PostProcessor.

		"information" is a dictionary shaped like the ones InfoExtractors
		compose, plus one extra key, "filepath", naming the downloaded
		file.

		Returning None halts the postprocessing chain; returning an
		information dictionary (possibly the received one with some
		fields changed) passes it on to the next processor.  A
		PostProcessingError may also be raised and is handled by the
		downloader.
		"""
		# Default behaviour: hand the data through unchanged.
		return information
2083 ### MAIN PROGRAM ###
2084 if __name__ == '__main__':
2086 # Modules needed only when running the main program
2090 # Function to update the program file with the latest version from bitbucket.org
2091 def update_self(downloader, filename):
2092 # Note: downloader only used for options
2093 if not os.access (filename, os.W_OK):
2094 sys.exit('ERROR: no write permissions on %s' % filename)
2096 downloader.to_screen('Updating to latest stable version...')
2097 latest_url = 'http://github.com/rg3/youtube-dl/raw/master/LATEST_VERSION'
2098 latest_version = urllib.urlopen(latest_url).read().strip()
2099 prog_url = 'http://github.com/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
2100 newcontent = urllib.urlopen(prog_url).read()
2101 stream = open(filename, 'w')
2102 stream.write(newcontent)
2104 downloader.to_screen('Updated to version %s' % latest_version)
2106 # Parse command line
2107 parser = optparse.OptionParser(
2108 usage='Usage: %prog [options] url...',
2109 version='2010.10.24',
2110 conflict_handler='resolve',
2113 parser.add_option('-h', '--help',
2114 action='help', help='print this help text and exit')
2115 parser.add_option('-v', '--version',
2116 action='version', help='print program version and exit')
2117 parser.add_option('-U', '--update',
2118 action='store_true', dest='update_self', help='update this program to latest stable version')
2119 parser.add_option('-i', '--ignore-errors',
2120 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
2121 parser.add_option('-r', '--rate-limit',
2122 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
2123 parser.add_option('-R', '--retries',
2124 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
2125 parser.add_option('--playlist-start',
2126 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
2127 parser.add_option('--playlist-end',
2128 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
2130 authentication = optparse.OptionGroup(parser, 'Authentication Options')
2131 authentication.add_option('-u', '--username',
2132 dest='username', metavar='USERNAME', help='account username')
2133 authentication.add_option('-p', '--password',
2134 dest='password', metavar='PASSWORD', help='account password')
2135 authentication.add_option('-n', '--netrc',
2136 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
2137 parser.add_option_group(authentication)
2139 video_format = optparse.OptionGroup(parser, 'Video Format Options')
2140 video_format.add_option('-f', '--format',
2141 action='store', dest='format', metavar='FORMAT', help='video format code')
2142 video_format.add_option('-m', '--mobile-version',
2143 action='store_const', dest='format', help='alias for -f 17', const='17')
2144 video_format.add_option('--all-formats',
2145 action='store_const', dest='format', help='download all available video formats', const='-1')
2146 video_format.add_option('--max-quality',
2147 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
2148 video_format.add_option('-b', '--best-quality',
2149 action='store_true', dest='bestquality', help='download the best video quality (DEPRECATED)')
2150 parser.add_option_group(video_format)
2152 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
2153 verbosity.add_option('-q', '--quiet',
2154 action='store_true', dest='quiet', help='activates quiet mode', default=False)
2155 verbosity.add_option('-s', '--simulate',
2156 action='store_true', dest='simulate', help='do not download video', default=False)
2157 verbosity.add_option('-g', '--get-url',
2158 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
2159 verbosity.add_option('-e', '--get-title',
2160 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
2161 verbosity.add_option('--get-thumbnail',
2162 action='store_true', dest='getthumbnail', help='simulate, quiet but print thumbnail URL', default=False)
2163 verbosity.add_option('--get-description',
2164 action='store_true', dest='getdescription', help='simulate, quiet but print video description', default=False)
2165 verbosity.add_option('--no-progress',
2166 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
2167 parser.add_option_group(verbosity)
2169 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
2170 filesystem.add_option('-t', '--title',
2171 action='store_true', dest='usetitle', help='use title in file name', default=False)
2172 filesystem.add_option('-l', '--literal',
2173 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
2174 filesystem.add_option('-A', '--auto-number',
2175 action='store_true', dest='autonumber', help='number downloaded URLs starting from 00000', default=False)
2176 filesystem.add_option('-o', '--output',
2177 dest='outtmpl', metavar='TEMPLATE', help='output filename template')
2178 filesystem.add_option('-a', '--batch-file',
2179 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
2180 filesystem.add_option('-w', '--no-overwrites',
2181 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
2182 filesystem.add_option('-c', '--continue',
2183 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
2184 filesystem.add_option('--cookies',
2185 dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
2186 parser.add_option_group(filesystem)
2188 (opts, args) = parser.parse_args()
2190 # Open appropriate CookieJar
2191 if opts.cookiefile is None:
2192 jar = cookielib.CookieJar()
2195 jar = cookielib.MozillaCookieJar(opts.cookiefile)
2196 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
2198 except (IOError, OSError), err:
2199 sys.exit(u'ERROR: unable to open cookie file')
2201 # General configuration
2202 cookie_processor = urllib2.HTTPCookieProcessor(jar)
2203 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
2204 urllib2.install_opener(urllib2.build_opener(cookie_processor))
2205 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
2207 # Batch file verification
2209 if opts.batchfile is not None:
2211 if opts.batchfile == '-':
2214 batchfd = open(opts.batchfile, 'r')
2215 batchurls = batchfd.readlines()
2216 batchurls = [x.strip() for x in batchurls]
2217 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
2219 sys.exit(u'ERROR: batch file could not be read')
2220 all_urls = batchurls + args
2222 # Conflicting, missing and erroneous options
2223 if opts.bestquality:
2224 print >>sys.stderr, u'\nWARNING: -b/--best-quality IS DEPRECATED AS IT IS THE DEFAULT BEHAVIOR NOW\n'
2225 if opts.usenetrc and (opts.username is not None or opts.password is not None):
2226 parser.error(u'using .netrc conflicts with giving username/password')
2227 if opts.password is not None and opts.username is None:
2228 parser.error(u'account username missing')
2229 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
2230 parser.error(u'using output template conflicts with using title, literal title or auto number')
2231 if opts.usetitle and opts.useliteral:
2232 parser.error(u'using title conflicts with using literal title')
2233 if opts.username is not None and opts.password is None:
2234 opts.password = getpass.getpass(u'Type account password and press return:')
2235 if opts.ratelimit is not None:
2236 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
2237 if numeric_limit is None:
2238 parser.error(u'invalid rate limit specified')
2239 opts.ratelimit = numeric_limit
2240 if opts.retries is not None:
2242 opts.retries = long(opts.retries)
2243 except (TypeError, ValueError), err:
2244 parser.error(u'invalid retry count specified')
2246 opts.playliststart = long(opts.playliststart)
2247 if opts.playliststart <= 0:
2249 except (TypeError, ValueError), err:
2250 parser.error(u'invalid playlist start number specified')
2252 opts.playlistend = long(opts.playlistend)
2253 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
2255 except (TypeError, ValueError), err:
2256 parser.error(u'invalid playlist end number specified')
2258 # Information extractors
2259 youtube_ie = YoutubeIE()
2260 metacafe_ie = MetacafeIE(youtube_ie)
2261 dailymotion_ie = DailymotionIE()
2262 youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
2263 youtube_user_ie = YoutubeUserIE(youtube_ie)
2264 youtube_search_ie = YoutubeSearchIE(youtube_ie)
2265 google_ie = GoogleIE()
2266 google_search_ie = GoogleSearchIE(google_ie)
2267 photobucket_ie = PhotobucketIE()
2268 yahoo_ie = YahooIE()
2269 yahoo_search_ie = YahooSearchIE(yahoo_ie)
2270 generic_ie = GenericIE()
2273 fd = FileDownloader({
2274 'usenetrc': opts.usenetrc,
2275 'username': opts.username,
2276 'password': opts.password,
2277 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
2278 'forceurl': opts.geturl,
2279 'forcetitle': opts.gettitle,
2280 'forcethumbnail': opts.getthumbnail,
2281 'forcedescription': opts.getdescription,
2282 'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
2283 'format': opts.format,
2284 'format_limit': opts.format_limit,
2285 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
2286 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
2287 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
2288 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
2289 or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
2290 or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
2291 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
2292 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
2293 or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
2294 or u'%(id)s.%(ext)s'),
2295 'ignoreerrors': opts.ignoreerrors,
2296 'ratelimit': opts.ratelimit,
2297 'nooverwrites': opts.nooverwrites,
2298 'retries': opts.retries,
2299 'continuedl': opts.continue_dl,
2300 'noprogress': opts.noprogress,
2301 'playliststart': opts.playliststart,
2302 'playlistend': opts.playlistend,
2303 'logtostderr': opts.outtmpl == '-',
2305 fd.add_info_extractor(youtube_search_ie)
2306 fd.add_info_extractor(youtube_pl_ie)
2307 fd.add_info_extractor(youtube_user_ie)
2308 fd.add_info_extractor(metacafe_ie)
2309 fd.add_info_extractor(dailymotion_ie)
2310 fd.add_info_extractor(youtube_ie)
2311 fd.add_info_extractor(google_ie)
2312 fd.add_info_extractor(google_search_ie)
2313 fd.add_info_extractor(photobucket_ie)
2314 fd.add_info_extractor(yahoo_ie)
2315 fd.add_info_extractor(yahoo_search_ie)
2317 # This must come last since it's the
2318 # fallback if none of the others work
2319 fd.add_info_extractor(generic_ie)
2322 if opts.update_self:
2323 update_self(fd, sys.argv[0])
2326 if len(all_urls) < 1:
2327 if not opts.update_self:
2328 parser.error(u'you must provide at least one URL')
2331 retcode = fd.download(all_urls)
2333 # Dump cookie jar if requested
2334 if opts.cookiefile is not None:
2337 except (IOError, OSError), err:
2338 sys.exit(u'ERROR: unable to save cookie jar')
2342 except DownloadError:
2344 except SameFileError:
2345 sys.exit(u'ERROR: fixed output name but more than one file to download')
2346 except KeyboardInterrupt:
2347 sys.exit(u'\nERROR: Interrupted by user')