2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # Author: Benjamin Johnson
6 # License: Public domain code
23 # parse_qs was moved from the cgi module to the urlparse module recently.
25 from urlparse import parse_qs
27 from cgi import parse_qs
30 'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.2) Gecko/20100115 Firefox/3.6',
31 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
32 'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
33 'Accept-Language': 'en-us,en;q=0.5',
# Characters allowed in a "simplified" title: ASCII letters and digits only
# (decoded to unicode so later regex substitution stays in unicode space).
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    def yield_preferredencoding():
        # NOTE(review): error handling around this call (falling back when
        # the locale lookup fails) is elided from this view -- confirm.
        pref = locale.getpreferredencoding()
    # Only the first yielded value is needed (Python 2 generator protocol).
    return yield_preferredencoding().next()
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a Unicode character.

    This function receives a match object and is intended to be used with
    the re.sub() function.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in htmlentitydefs.name2codepoint:
        return unichr(htmlentitydefs.name2codepoint[entity])

    # Numeric character reference, e.g. "#160" or "#x2019".
    # NOTE(review): \d+ after the optional "x" does not match hex digits
    # a-f, so references like "&#x2f;" would fall through -- confirm intended.
    mobj = re.match(ur'(?u)#(x?\d+)', entity)
    numstr = mobj.group(1)
    if numstr.startswith(u'x'):
        # Prefix with "0" so long(numstr, 16) accepts the "0x..." form.
        numstr = u'0%s' % numstr
    return unichr(long(numstr, base))

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
80 def sanitize_title(utitle):
81 """Sanitizes a video title so it could be used as part of a filename."""
82 utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
83 return utitle.replace(unicode(os.sep), u'%')
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    # "-" conventionally maps to standard output; the guard checking for
    # that is elided from this view -- TODO confirm.
    return (sys.stdout, filename)
    stream = open(filename, open_mode)
    return (stream, filename)
    except (IOError, OSError), err:
        # In case of error, try to remove win32 forbidden chars
        filename = re.sub(ur'[<>:"\|\?\*]', u'#', filename)

        # An exception here should be caught in the caller
        stream = open(filename, open_mode)
        return (stream, filename)
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """

class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

class UnavailableFormatError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """

class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        # Number of bytes actually received.
        self.downloaded = downloaded
        # Number of bytes the server announced (Content-Length).
        self.expected = expected
class FileDownloader(object):
    """File Downloader class.

    File downloader objects are the ones responsible of downloading the
    actual video file and writing it to disk if the user has requested
    it, among some other tasks. In most cases there should be one per
    program. As, given a video URL, the downloader doesn't know how to
    extract all the needed information, task that InfoExtractors do, it
    has to pass the URL to one of them.

    For this, file downloader objects have a method that allows
    InfoExtractors to be registered in a given order. When it is passed
    a URL, the file downloader handles it to the first InfoExtractor it
    finds that reports being able to handle it. The InfoExtractor extracts
    all the information about the video or videos the URL refers to, and
    asks the FileDownloader to process the video information, possibly
    downloading the video.

    File downloaders accept a lot of parameters. In order not to saturate
    the object constructor with arguments, it receives a dictionary of
    options instead. These options are available through the params
    attribute for the InfoExtractors to use. The FileDownloader also
    registers itself as the downloader in charge for the InfoExtractors
    that are added to it, so this is a "mutual registration".

    Available options:

    username:      Username for authentication purposes.
    password:      Password for authentication purposes.
    usenetrc:      Use netrc for authentication instead.
    quiet:         Do not print messages to stdout.
    forceurl:      Force printing final URL.
    forcetitle:    Force printing title.
    simulate:      Do not download the video files.
    format:        Video format code.
    outtmpl:       Template for output names.
    ignoreerrors:  Do not stop on download errors.
    ratelimit:     Download speed limit, in bytes/sec.
    nooverwrites:  Prevent overwriting files.
    continuedl:    Try to continue downloads if possible.
    noprogress:    Do not print the progress bar.
    """

    # Exit status returned by download(); set to 1 by trouble() when errors
    # are being ignored.
    _download_retcode = None
    # Count of completed downloads, exposed to templates as the "ord" field.
    _num_downloads = None

    def __init__(self, params):
        """Create a FileDownloader object with the given options."""
        self._download_retcode = 0
        self._num_downloads = 0
def pmkdir(filename):
    """Create directory components in filename. Similar to Unix "mkdir -p"."""
    components = filename.split(os.sep)
    # Build each ancestor path (a, a/b, a/b/c ...) excluding the final
    # component, which is the file itself.
    aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
    aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
    for dir in aggregate:
        if not os.path.exists(dir):
            # directory creation call elided from this view

def format_bytes(bytes):
    """Format a byte count as a short human-readable string, e.g. '1.23M'."""
    if type(bytes) is str:
        # string input is converted to a number first (conversion elided)
    exponent = long(math.log(bytes, 1024.0))
    suffix = 'bkMGTPEZY'[exponent]
    converted = float(bytes) / float(1024**exponent)
    return '%.2f%s' % (converted, suffix)

def calc_percent(byte_counter, data_len):
    """Format download progress as a right-aligned percentage string."""
    # NOTE(review): the guard for data_len being None is elided -- confirm.
    return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))

def calc_eta(start, now, total, current):
    """Estimate remaining download time and format it as 'MM:SS'."""
    if current == 0 or dif < 0.001: # One millisecond
        # unknown ETA placeholder returned here (elided)
    rate = float(current) / dif
    eta = long((float(total) - float(current)) / rate)
    (eta_mins, eta_secs) = divmod(eta, 60)
    return '%02d:%02d' % (eta_mins, eta_secs)

def calc_speed(start, now, bytes):
    """Format the average transfer speed as a right-aligned 'N/s' string."""
    if bytes == 0 or dif < 0.001: # One millisecond
        return '%10s' % '---b/s'
    return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))

def best_block_size(elapsed_time, bytes):
    """Choose the next read block size from the last transfer's rate."""
    # Keep the next block within [half, double] of the last one.
    new_min = max(bytes / 2.0, 1.0)
    new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
    if elapsed_time < 0.001:
        # too fast to measure; a default is returned here (elided)
    rate = bytes / elapsed_time

def parse_bytes(bytestr):
    """Parse a string indicating a byte quantity into a long integer."""
    matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
    number = float(matchobj.group(1))
    # An empty suffix yields index 0 ('b'), i.e. multiplier 1024**0 == 1.
    multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
    return long(round(number * multiplier))
    """Verify a URL is valid and data could be downloaded. Return real data URL."""
    request = urllib2.Request(url, None, std_headers)
    data = urllib2.urlopen(request) # raises on network/HTTP failure

def add_info_extractor(self, ie):
    """Add an InfoExtractor object to the end of the list."""
    # Mutual registration: the IE also learns about this downloader.
    ie.set_downloader(self)

def add_post_processor(self, pp):
    """Add a PostProcessor object to the end of the chain."""
    # Mutual registration: the PP also learns about this downloader.
    pp.set_downloader(self)

def to_stdout(self, message, skip_eol=False, ignore_encoding_errors=False):
    """Print message to stdout if not in quiet mode."""
    if not self.params.get('quiet', False):
        # skip_eol selects u'' over u'\n'; the trailing comma suppresses
        # print's own newline.
        print (u'%s%s' % (message, [u'\n', u''][skip_eol])).encode(preferredencoding()),
    except (UnicodeEncodeError), err:
        if not ignore_encoding_errors:
            # the error is re-raised here when encoding errors matter (elided)

def to_stderr(self, message):
    """Print message to stderr."""
    print >>sys.stderr, message.encode(preferredencoding())

def fixed_template(self):
    """Checks if the output template is fixed (contains no %(...)s fields)."""
    return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
def trouble(self, message=None):
    """Determine action to take when a download problem appears.

    Depending on if the downloader has been configured to ignore
    download errors or not, this method may throw an exception or
    not when errors are found, after printing the message.
    """
    if message is not None:
        self.to_stderr(message)
    if not self.params.get('ignoreerrors', False):
        raise DownloadError(message)
    # Errors are being ignored: just remember a non-zero exit status.
    self._download_retcode = 1

def slow_down(self, start_time, byte_counter):
    """Sleep if the download speed is over the rate limit."""
    rate_limit = self.params.get('ratelimit', None)
    if rate_limit is None or byte_counter == 0:
        # no limit configured / nothing downloaded yet (early return elided)
    elapsed = now - start_time
    speed = float(byte_counter) / elapsed
    if speed > rate_limit:
        # Sleep just long enough that the average speed falls to the limit.
        time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)

def report_destination(self, filename):
    """Report destination filename."""
    self.to_stdout(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)

def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
    """Report download progress."""
    if self.params.get('noprogress', False):
        # progress reporting disabled (early return elided)
    # \r rewrites the same terminal line; skip_eol keeps the cursor on it.
    self.to_stdout(u'\r[download] %s of %s at %s ETA %s' %
            (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)

def report_resuming_byte(self, resume_len):
    """Report attempt to resume at given byte."""
    self.to_stdout(u'[download] Resuming download at byte %s' % resume_len)

def report_file_already_downloaded(self, file_name):
    """Report file has already been fully downloaded."""
    self.to_stdout(u'[download] %s has already been downloaded' % file_name)
    except (UnicodeEncodeError), err:
        # Fall back to a message that omits the (unencodable) file name.
        self.to_stdout(u'[download] The file has already been downloaded')

def report_unable_to_resume(self):
    """Report it was impossible to resume download."""
    self.to_stdout(u'[download] Unable to resume')

def report_finish(self):
    """Report download finished."""
    if self.params.get('noprogress', False):
        self.to_stdout(u'[download] Download completed')
def process_info(self, info_dict):
    """Process a single dictionary returned by an InfoExtractor."""
    # Do nothing else if in simulate mode
    if self.params.get('simulate', False):
        # Verify URL if it's an HTTP one
        if info_dict['url'].startswith('http'):
            self.verify_url(info_dict['url'].encode('utf-8')).decode('utf-8')
        except (OSError, IOError, urllib2.URLError, httplib.HTTPException, socket.error), err:
            raise UnavailableFormatError

    # Forced printings
    if self.params.get('forcetitle', False):
        print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
    if self.params.get('forceurl', False):
        print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
    if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
        print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
    if self.params.get('forcedescription', False) and 'description' in info_dict:
        print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')

    template_dict = dict(info_dict)
    # Extra template fields: Unix timestamp and zero-padded download ordinal.
    template_dict['epoch'] = unicode(long(time.time()))
    template_dict['ord'] = unicode('%05d' % self._num_downloads)
    filename = self.params['outtmpl'] % template_dict
    except (ValueError, KeyError), err:
        self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
    if self.params.get('nooverwrites', False) and os.path.exists(filename):
        self.to_stderr(u'WARNING: file exists: %s; skipping' % filename)

    self.pmkdir(filename)
    except (OSError, IOError), err:
        self.trouble('ERROR: unable to create directories: %s' % str(err))

    success = self._do_download(filename, info_dict['url'].encode('utf-8'))
    except (OSError, IOError), err:
        # Local I/O failure is treated as the format being unavailable --
        # TODO confirm this mapping is intended.
        raise UnavailableFormatError
    except (urllib2.URLError, httplib.HTTPException, socket.error), err:
        self.trouble('ERROR: unable to download video data: %s' % str(err))
    except (ContentTooShortError, ), err:
        self.trouble('ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))

    self.post_process(filename, info_dict)
    except (PostProcessingError), err:
        self.trouble('ERROR: postprocessing: %s' % str(err))
def download(self, url_list):
    """Download a given list of URLs."""
    # A fixed template yields a single filename, so several URLs would all
    # collide on the same output file.
    if len(url_list) > 1 and self.fixed_template():
        raise SameFileError(self.params['outtmpl'])

    suitable_found = False
    # Go to next InfoExtractor if not suitable
    if not ie.suitable(url):
        # continue to next IE (elided)

    # Suitable InfoExtractor found
    suitable_found = True

    # Extract information from URL and process it

    # Suitable InfoExtractor had been found; go to next URL

    if not suitable_found:
        self.trouble('ERROR: no suitable InfoExtractor: %s' % url)

    return self._download_retcode

def post_process(self, filename, ie_info):
    """Run the postprocessing chain on the given file."""
    # info is derived from ie_info above this line (copy elided); the final
    # filename is recorded for the postprocessors.
    info['filepath'] = filename
def _download_with_rtmpdump(self, filename, url):
    """Download an RTMP stream by shelling out to the rtmpdump tool."""
    self.report_destination(filename)

    # Check for rtmpdump first
    subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
    except (OSError, IOError):
        self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')

    # Download using rtmpdump. rtmpdump returns exit code 2 when
    # the connection was interrupted and resuming appears to be
    # possible. This is part of rtmpdump's normal usage, AFAIK.
    basic_args = ['rtmpdump', '-q', '-r', url, '-o', filename]
    # With continuedl, add -e (resume) and -k 1 on the first attempt too.
    retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
    while retval == 2 or retval == 1:
        self.to_stdout(u'\r[rtmpdump] %s bytes' % os.path.getsize(filename), skip_eol=True)
        time.sleep(2.0) # This seems to be needed
        # Retry with resume flags; add -k 1 only after an exit code of 1.
        retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
    # Success path (retval == 0) reports final size; other codes are errors.
    self.to_stdout(u'\r[rtmpdump] %s bytes' % os.path.getsize(filename))
    self.trouble('\nERROR: rtmpdump exited with code %d' % retval)
def _do_download(self, filename, url):
    """Download url to filename over HTTP, or delegate RTMP URLs."""
    # Attempt to download using rtmpdump
    if url.startswith('rtmp'):
        return self._download_with_rtmpdump(filename, url)

    # Two requests: one with a Range header for resuming, and a plain
    # fallback used when the ranged request is rejected.
    basic_request = urllib2.Request(url, None, std_headers)
    request = urllib2.Request(url, None, std_headers)

    # Establish possible resume length
    if os.path.isfile(filename):
        resume_len = os.path.getsize(filename)

    # Request parameters in case of being able to resume
    if self.params.get('continuedl', False) and resume_len != 0:
        self.report_resuming_byte(resume_len)
        request.add_header('Range','bytes=%d-' % resume_len)

    # Establish connection
    data = urllib2.urlopen(request)
    except (urllib2.HTTPError, ), err:
        if err.code != 416: # 416 is 'Requested range not satisfiable'
            # any other HTTP error propagates (re-raise elided)
        # Range rejected: retry the plain request and compare sizes.
        data = urllib2.urlopen(basic_request)
        content_length = data.info()['Content-Length']
        if content_length is not None and long(content_length) == resume_len:
            # Because the file had already been fully downloaded
            self.report_file_already_downloaded(filename)
            self._num_downloads += 1
        # Because the server didn't let us
        self.report_unable_to_resume()

    data_len = data.info().get('Content-length', None)
    data_len_str = self.format_bytes(data_len)

    data_block = data.read(block_size)
    data_block_len = len(data_block)
    if data_block_len == 0:
        # end of stream (loop break elided)
    byte_counter += data_block_len

    # Open file just in time
    (stream, filename) = sanitize_open(filename, open_mode)
    self.report_destination(filename)
    self._num_downloads += 1
    except (OSError, IOError), err:
        self.trouble('ERROR: unable to open for writing: %s' % str(err))
    stream.write(data_block)
    except (IOError, OSError), err:
        self.trouble('\nERROR: unable to write data: %s' % str(err))
    # Adapt the next read size to the measured transfer rate.
    block_size = self.best_block_size(after - before, data_block_len)

    # Progress message
    percent_str = self.calc_percent(byte_counter, data_len)
    eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
    speed_str = self.calc_speed(start, time.time(), byte_counter)
    self.report_progress(percent_str, data_len_str, speed_str, eta_str)

    # Apply rate limit
    self.slow_down(start, byte_counter)

    # NOTE(review): data_len is the raw header string here, hence the
    # str() conversion of byte_counter for the comparison.
    if data_len is not None and str(byte_counter) != data_len:
        raise ContentTooShortError(byte_counter, long(data_len))
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information from the video (or videos) the URL refers to. This
    information includes the real video URL, the video title and simplified
    title, author and others. The information is stored in a dictionary
    which is then passed to the FileDownloader. The FileDownloader
    processes this information possibly downloading the video to the file
    system, among other possible outcomes. The dictionaries must include
    the following fields:

    id:       Video identifier.
    url:      Final video URL.
    uploader: Nickname of the video uploader.
    title:    Literal title.
    stitle:   Simplified title.
    ext:      Video filename extension.
    format:   Video format.

    The following fields are optional. Their primary purpose is to allow
    youtube-dl to serve as the backend for a video search function, such
    as the one in youtube2mp3. They are only used when their respective
    forced printing functions are called:

    thumbnail:   Full URL to a video thumbnail image.
    description: One-line video description.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods, as well as the suitable() static method.
    Probably, they should also be instantiated and added to the main
    downloader.
    """

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self.set_downloader(downloader)

    """Receives a URL and returns True if suitable for this IE."""

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        self._real_initialize()

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Group 1: URL prefix (optional); group 2: the video id itself.
    _VALID_URL = r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?[\?#](?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
    _LANG_URL = r'http://uk.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    _available_formats = ['37', '22', '35', '18', '34', '5', '17', '13', None] # listed in order of priority for -b flag
    # Maps format code to filename extension (entries elided from this view).
    _video_extensions = {

    return (re.match(YoutubeIE._VALID_URL, url) is not None)

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_stdout(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_stdout(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_stdout(u'[youtube] Confirming age')

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_stdout(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_stdout(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available for the video."""
        self._downloader.to_stdout(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_stdout(u'[youtube] RTMP download detected')
def _real_initialize(self):
    """Set language, optionally log in, and confirm age on YouTube."""
    if self._downloader is None:
        # nothing to configure without a downloader (early return elided)

    downloader_params = self._downloader.params

    # Attempt to use provided username and password or .netrc data
    if downloader_params.get('username', None) is not None:
        username = downloader_params['username']
        password = downloader_params['password']
    elif downloader_params.get('usenetrc', False):
        info = netrc.netrc().authenticators(self._NETRC_MACHINE)
        # credentials are unpacked from info when present (elided)
        raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
    except (IOError, netrc.NetrcParseError), err:
        self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))

    # Set language (a language failure is only a warning, not fatal).
    request = urllib2.Request(self._LANG_URL, None, std_headers)
    urllib2.urlopen(request).read()
    except (urllib2.URLError, httplib.HTTPException, socket.error), err:
        self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))

    # No authentication to be performed
    # Log in: form fields below belong to a dict whose header is elided.
        'current_form': 'loginForm',
        'action_login': 'Log In',
        'username': username,
        'password': password,
    request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
    login_results = urllib2.urlopen(request).read()
    # If the login form is still present, the credentials were rejected.
    if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
        self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
    except (urllib2.URLError, httplib.HTTPException, socket.error), err:
        self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))

    # Confirm age: form field below belongs to a dict whose header is elided.
        'action_confirm': 'Confirm',
    request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
    self.report_age_confirmation()
    age_results = urllib2.urlopen(request).read()
    except (urllib2.URLError, httplib.HTTPException, socket.error), err:
        # Age confirmation failure is fatal, unlike the warnings above.
        self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
def _real_extract(self, url):
    """Extract video information for a single YouTube URL."""
    # Extract video id from URL
    mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
    video_id = mobj.group(2)

    # Downloader parameters
    if self._downloader is not None:
        params = self._downloader.params
        format_param = params.get('format', None)
        # '0' / '-1' are the "best quality" / "all formats" selectors; both
        # start from the current position in _available_formats.
        if format_param == '0':
            format_param = self._available_formats[quality_index]
        elif format_param == '-1':
            format_param = self._available_formats[quality_index]

    # Extension defaults to flv when the format code is unknown.
    video_extension = self._video_extensions.get(format_param, 'flv')

    # Try several 'el' variants of get_video_info until one returns a token.
    self.report_video_info_webpage_download(video_id)
    for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
        video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                % (video_id, el_type))
        request = urllib2.Request(video_info_url, None, std_headers)
        video_info_webpage = urllib2.urlopen(request).read()
        video_info = parse_qs(video_info_webpage)
        if 'token' in video_info:
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
    self.report_information_extraction(video_id)

    # "t" parameter not in video info
    if 'token' not in video_info:
        # Attempt to see if YouTube has issued an error message
        if 'reason' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract "t" parameter for unknown reason')
            # Dump the raw response to a file for bug reporting.
            stream = open('reportme-ydl-%s.dat' % time.time(), 'wb')
            stream.write(video_info_webpage)
        reason = urllib.unquote_plus(video_info['reason'][0])
        self._downloader.trouble(u'ERROR: YouTube said: %s' % reason.decode('utf-8'))

    token = urllib.unquote_plus(video_info['token'][0])
    video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s&eurl=&el=detailpage&ps=default&gl=US&hl=en' % (video_id, token)
    if format_param is not None:
        video_real_url = '%s&fmt=%s' % (video_real_url, format_param)

    # Check possible RTMP download
    if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
        self.report_rtmp_download()
        video_real_url = video_info['conn'][0]

    # uploader
    if 'author' not in video_info:
        self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
    video_uploader = urllib.unquote_plus(video_info['author'][0])

    # title
    if 'title' not in video_info:
        self._downloader.trouble(u'ERROR: unable to extract video title')
    video_title = urllib.unquote_plus(video_info['title'][0])
    video_title = video_title.decode('utf-8')
    video_title = sanitize_title(video_title)
    # Collapse every run of non-allowed characters into a single underscore.
    simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
    simple_title = simple_title.strip(ur'_')

    # thumbnail image
    if 'thumbnail_url' not in video_info:
        self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
    else: # don't panic if we can't find it
        video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

    # get video description
    video_description = 'No description available.' # we need something to pass to self._downloader
    # this requires an additional HTTP request and a little
    # more time, so don't do it unless absolutely necessary
    if self._downloader.params.get('forcedescription', False):
        video_page_url = 'http://www.youtube.com/watch?v=' + video_id
        request = urllib2.Request(video_page_url, None, std_headers)
        video_page_webpage = urllib2.urlopen(request).read()
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_page_webpage)
        video_description = mobj.group(1)
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            pass # don't panic if we can't find it

    # Process video information
    self._downloader.process_info({
        'id': video_id.decode('utf-8'),
        'url': video_real_url.decode('utf-8'),
        'uploader': video_uploader.decode('utf-8'),
        'title': video_title,
        'stitle': simple_title,
        'ext': video_extension.decode('utf-8'),
        'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
        'thumbnail': video_thumbnail.decode('utf-8'),
        'description': video_description.decode('utf-8'),

    # all-formats mode: advance through the format list until exhausted.
    if quality_index == len(self._available_formats):
    format_param = self._available_formats[quality_index]
    except UnavailableFormatError, err:
        if best_quality or all_formats:
            if quality_index == len(self._available_formats):
                # I don't ever expect this to happen
                self._downloader.trouble(u'ERROR: no known formats available for video')
            # Try next quality in the ordered list.
            self.report_unavailable_format(video_id, format_param)
            format_param = self._available_formats[quality_index]
        self._downloader.trouble('ERROR: format not available for video')
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    # Group 1: video id; group 2: simplified title from the URL path.
    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'

    def __init__(self, youtube_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        # Kept so "yt-" prefixed Metacafe ids can be delegated to YouTube.
        self._youtube_ie = youtube_ie

    return (re.match(MetacafeIE._VALID_URL, url) is not None)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_stdout(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_stdout(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_stdout(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_stdout(u'[metacafe] %s: Extracting information' % video_id)
def _real_initialize(self):
    """Fetch the family-filter disclaimer page and confirm age."""
    # Retrieve disclaimer
    request = urllib2.Request(self._DISCLAIMER, None, std_headers)
    self.report_disclaimer()
    disclaimer = urllib2.urlopen(request).read()
    except (urllib2.URLError, httplib.HTTPException, socket.error), err:
        self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))

    # Confirm age: form field below belongs to a dict whose header is elided.
        'submit': "Continue - I'm over 18",
    request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
    self.report_age_confirmation()
    disclaimer = urllib2.urlopen(request).read()
    except (urllib2.URLError, httplib.HTTPException, socket.error), err:
        self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
def _real_extract(self, url):
    """Extract video information for a single Metacafe URL."""
    # Extract id and simplified title from URL
    mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

    video_id = mobj.group(1)

    # Check if video comes from YouTube
    mobj2 = re.match(r'^yt-(.*)$', video_id)
    if mobj2 is not None:
        # Delegate to the YouTube extractor for yt-prefixed ids.
        self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))

    simple_title = mobj.group(2).decode('utf-8')
    video_extension = 'flv'

    # Retrieve video webpage to extract further information
    request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
    self.report_download_webpage(video_id)
    webpage = urllib2.urlopen(request).read()
    except (urllib2.URLError, httplib.HTTPException, socket.error), err:
        self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))

    # Extract URL, uploader and title from webpage
    self.report_extraction(video_id)
    mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        self._downloader.trouble(u'ERROR: unable to extract media URL')
    mediaURL = urllib.unquote(mobj.group(1))

    # The gdaKey query parameter appears to no longer be needed.
    #mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
    #	self._downloader.trouble(u'ERROR: unable to extract gdaKey')
    #gdaKey = mobj.group(1)
    #video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)

    video_url = mediaURL

    mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
    video_title = mobj.group(1).decode('utf-8')
    video_title = sanitize_title(video_title)

    mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
        self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
    video_uploader = mobj.group(1)

    # Process video information
    self._downloader.process_info({
        'id': video_id.decode('utf-8'),
        'url': video_url.decode('utf-8'),
        'uploader': video_uploader.decode('utf-8'),
        'title': video_title,
        'stitle': simple_title,
        'ext': video_extension.decode('utf-8'),

    except UnavailableFormatError:
        self._downloader.trouble(u'ERROR: format not available for video')
class GoogleIE(InfoExtractor):
    """Information extractor for video.google.com."""

    # Accepts the national Google Video domains; group 1 is the docid.
    _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    return (re.match(GoogleIE._VALID_URL, url) is not None)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_stdout(u'[video.google] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_stdout(u'[video.google] %s: Extracting information' % video_id)

    def _real_initialize(self):
1075 def _real_extract(self, url):
1076 # Extract id from URL
1077 mobj = re.match(self._VALID_URL, url)
1079 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1082 video_id = mobj.group(1)
1084 video_extension = 'mp4'
1086 # Retrieve video webpage to extract further information
1087 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1089 self.report_download_webpage(video_id)
1090 webpage = urllib2.urlopen(request).read()
1091 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1092 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1095 # Extract URL, uploader, and title from webpage
1096 self.report_extraction(video_id)
1097 mobj = re.search(r"download_url:'([^']+)'", webpage)
1099 video_extension = 'flv'
1100 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1102 self._downloader.trouble(u'ERROR: unable to extract media URL')
1104 mediaURL = urllib.unquote(mobj.group(1))
1105 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1106 mediaURL = mediaURL.replace('\\x26', '\x26')
1108 video_url = mediaURL
1110 mobj = re.search(r'<title>(.*)</title>', webpage)
1112 self._downloader.trouble(u'ERROR: unable to extract title')
1114 video_title = mobj.group(1).decode('utf-8')
1115 video_title = sanitize_title(video_title)
1116 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1118 # Extract video description
1119 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1121 self._downloader.trouble(u'ERROR: unable to extract video description')
1123 video_description = mobj.group(1).decode('utf-8')
1124 if not video_description:
1125 video_description = 'No description available.'
1127 # Extract video thumbnail
1128 if self._downloader.params.get('forcethumbnail', False):
1129 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1131 webpage = urllib2.urlopen(request).read()
1132 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1133 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1135 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1137 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1139 video_thumbnail = mobj.group(1)
1140 else: # we need something to pass to process_info
1141 video_thumbnail = ''
1145 # Process video information
1146 self._downloader.process_info({
1147 'id': video_id.decode('utf-8'),
1148 'url': video_url.decode('utf-8'),
1150 'title': video_title,
1151 'stitle': simple_title,
1152 'ext': video_extension.decode('utf-8'),
1155 except UnavailableFormatError:
1156 self._downloader.trouble(u'ERROR: format not available for video')
# NOTE(review): excerpt is missing interlinear lines ('try:' openers,
# 'if mobj is None:' guards, 'return' statements) — verify notes against
# the complete file.
1159 class PhotobucketIE(InfoExtractor):
1160 """Information extractor for photobucket.com."""
# group(1) captures the .flv filename from the 'current' query parameter.
1162 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1164 def __init__(self, downloader=None):
1165 InfoExtractor.__init__(self, downloader)
# Part of suitable(): handled iff the URL matches _VALID_URL.
1169 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1171 def report_download_webpage(self, video_id):
1172 """Report webpage download."""
1173 self._downloader.to_stdout(u'[photobucket] %s: Downloading webpage' % video_id)
1175 def report_extraction(self, video_id):
1176 """Report information extraction."""
1177 self._downloader.to_stdout(u'[photobucket] %s: Extracting information' % video_id)
# No initialization needed (body elided in excerpt).
1179 def _real_initialize(self):
1182 def _real_extract(self, url):
1183 # Extract id from URL
1184 mobj = re.match(self._VALID_URL, url)
1186 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1189 video_id = mobj.group(1)
# Photobucket only serves flash video here.
1191 video_extension = 'flv'
1193 # Retrieve video webpage to extract further information
1194 request = urllib2.Request(url)
1196 self.report_download_webpage(video_id)
1197 webpage = urllib2.urlopen(request).read()
1198 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1199 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1202 # Extract URL, uploader, and title from webpage
1203 self.report_extraction(video_id)
# The media URL is the 'file' parameter of the video_src <link> element.
1204 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1206 self._downloader.trouble(u'ERROR: unable to extract media URL')
1208 mediaURL = urllib.unquote(mobj.group(1))
1210 video_url = mediaURL
# One regex yields both title (group 1) and uploader (group 2).
1212 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1214 self._downloader.trouble(u'ERROR: unable to extract title')
1216 video_title = mobj.group(1).decode('utf-8')
1217 video_title = sanitize_title(video_title)
# Collapse runs of non-filename-safe characters into single underscores.
1218 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1220 video_uploader = mobj.group(2).decode('utf-8')
1223 # Process video information
1224 self._downloader.process_info({
1225 'id': video_id.decode('utf-8'),
1226 'url': video_url.decode('utf-8'),
1227 'uploader': video_uploader,
1228 'title': video_title,
1229 'stitle': simple_title,
1230 'ext': video_extension.decode('utf-8'),
1233 except UnavailableFormatError:
1234 self._downloader.trouble(u'ERROR: format not available for video')
# NOTE(review): excerpt is missing interlinear lines ('try:' openers,
# 'if mobj is None:' guards, 'return' statements) — verify notes against
# the complete file.
1237 class YahooIE(InfoExtractor):
1238 """Information extractor for video.yahoo.com."""
1240 # _VALID_URL matches all Yahoo! Video URLs
1241 # _VPAGE_URL matches only the extractable '/watch/' URLs
1242 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1243 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1245 def __init__(self, downloader=None):
1246 InfoExtractor.__init__(self, downloader)
# Part of suitable(): handled iff the URL matches _VALID_URL.
1250 return (re.match(YahooIE._VALID_URL, url) is not None)
1252 def report_download_webpage(self, video_id):
1253 """Report webpage download."""
1254 self._downloader.to_stdout(u'[video.yahoo] %s: Downloading webpage' % video_id)
1256 def report_extraction(self, video_id):
1257 """Report information extraction."""
1258 self._downloader.to_stdout(u'[video.yahoo] %s: Extracting information' % video_id)
# No initialization needed (body elided in excerpt).
1260 def _real_initialize(self):
1263 def _real_extract(self, url):
1264 # Extract ID from URL
1265 mobj = re.match(self._VALID_URL, url)
1267 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
# group(2) is the vid; group(1) (the watch id) is only needed to rebuild
# a canonical /watch/ URL below.
1270 video_id = mobj.group(2)
1271 video_extension = 'flv'
1273 # Rewrite valid but non-extractable URLs as
1274 # extractable English language /watch/ URLs
1275 if re.match(self._VPAGE_URL, url) is None:
1276 request = urllib2.Request(url)
1278 webpage = urllib2.urlopen(request).read()
1279 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1280 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1283 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1285 self._downloader.trouble(u'ERROR: Unable to extract id field')
1287 yahoo_id = mobj.group(1)
1289 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1291 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1293 yahoo_vid = mobj.group(1)
# Recurse once with the canonical /watch/ URL, which _VPAGE_URL matches.
1295 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1296 return self._real_extract(url)
1298 # Retrieve video webpage to extract further information
1299 request = urllib2.Request(url)
1301 self.report_download_webpage(video_id)
1302 webpage = urllib2.urlopen(request).read()
1303 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1304 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1307 # Extract uploader and title from webpage
1308 self.report_extraction(video_id)
1309 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1311 self._downloader.trouble(u'ERROR: unable to extract video title')
1313 video_title = mobj.group(1).decode('utf-8')
# Collapse runs of non-filename-safe characters into single underscores.
1314 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1316 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1318 self._downloader.trouble(u'ERROR: unable to extract video uploader')
# NOTE(review): group(1) here is the '(people|profile)' alternation, not the
# uploader name in group(2) — looks like a wrong group index; confirm.
1320 video_uploader = mobj.group(1).decode('utf-8')
1322 # Extract video thumbnail
1323 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1325 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1327 video_thumbnail = mobj.group(1).decode('utf-8')
1329 # Extract video description
1330 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1332 self._downloader.trouble(u'ERROR: unable to extract video description')
1334 video_description = mobj.group(1).decode('utf-8')
1335 if not video_description: video_description = 'No description available.'
1337 # Extract video height and width
# Height/width are required by the playlist request built below.
1338 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1340 self._downloader.trouble(u'ERROR: unable to extract video height')
1342 yv_video_height = mobj.group(1)
1344 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1346 self._downloader.trouble(u'ERROR: unable to extract video width')
1348 yv_video_width = mobj.group(1)
1350 # Retrieve video playlist to extract media URL
1351 # I'm not completely sure what all these options are, but we
1352 # seem to need most of them, otherwise the server sends a 401.
1353 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1354 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1355 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1356 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1357 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1359 self.report_download_webpage(video_id)
1360 webpage = urllib2.urlopen(request).read()
1361 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1362 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1365 # Extract media URL from playlist XML
1366 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1368 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1370 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
# Decode HTML entities (&amp; etc.) left in the playlist XML attribute.
1371 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1374 # Process video information
1375 self._downloader.process_info({
1376 'id': video_id.decode('utf-8'),
# NOTE(review): no 'url' key is visible in this dict, and 'thumbnail' and
# 'description' each appear TWICE below — in a dict literal the later
# entries win, so the earlier pair is dead. Likely an accidental
# duplication; confirm against the full file and remove one pair.
1378 'uploader': video_uploader,
1379 'title': video_title,
1380 'stitle': simple_title,
1381 'ext': video_extension.decode('utf-8'),
1382 'thumbnail': video_thumbnail.decode('utf-8'),
1383 'description': video_description,
1384 'thumbnail': video_thumbnail,
1385 'description': video_description,
1387 except UnavailableFormatError:
1388 self._downloader.trouble(u'ERROR: format not available for video')
# NOTE(review): excerpt is missing interlinear lines ('try:' openers,
# 'if mobj is None:' guards, 'return' statements, the 'suitable' method) —
# verify notes against the complete file.
1391 class GenericIE(InfoExtractor):
1392 """Generic last-resort information extractor."""
1394 def __init__(self, downloader=None):
1395 InfoExtractor.__init__(self, downloader)
1401 def report_download_webpage(self, video_id):
1402 """Report webpage download."""
# Warn loudly: this extractor only runs when no specialized IE matched.
1403 self._downloader.to_stdout(u'WARNING: Falling back on generic information extractor.')
1404 self._downloader.to_stdout(u'[generic] %s: Downloading webpage' % video_id)
1406 def report_extraction(self, video_id):
1407 """Report information extraction."""
1408 self._downloader.to_stdout(u'[generic] %s: Extracting information' % video_id)
# No initialization needed (body elided in excerpt).
1410 def _real_initialize(self):
1413 def _real_extract(self, url):
# Provisional id from the URL's last path component; replaced below once
# the real media URL is known.
1414 video_id = url.split('/')[-1]
1415 request = urllib2.Request(url)
1417 self.report_download_webpage(video_id)
1418 webpage = urllib2.urlopen(request).read()
1419 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1420 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1422 except ValueError, err:
1423 # since this is the last-resort InfoExtractor, if
1424 # this error is thrown, it'll be thrown here
1425 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1428 # Start with something easy: JW Player in SWFObject
1429 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1431 # Broaden the search a little bit
1432 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1434 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1437 # It's possible that one of the regexes
1438 # matched, but returned an empty group:
1439 if mobj.group(1) is None:
1440 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1443 video_url = urllib.unquote(mobj.group(1))
# Real id/extension come from the media URL's basename, e.g. 'clip.flv'.
1444 video_id = os.path.basename(video_url)
1446 # here's a fun little line of code for you:
1447 video_extension = os.path.splitext(video_id)[1][1:]
1448 video_id = os.path.splitext(video_id)[0]
1450 # it's tempting to parse this further, but you would
1451 # have to take into account all the variations like
1452 # Video Title - Site Name
1453 # Site Name | Video Title
1454 # Video Title - Tagline | Site Name
1455 # and so on and so forth; it's just not practical
1456 mobj = re.search(r'<title>(.*)</title>', webpage)
1458 self._downloader.trouble(u'ERROR: unable to extract title')
1460 video_title = mobj.group(1).decode('utf-8')
1461 video_title = sanitize_title(video_title)
# Collapse runs of non-filename-safe characters into single underscores.
1462 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1464 # video uploader is domain name
1465 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
# NOTE(review): this error message says 'title' but the failure here is the
# uploader/domain extraction — likely a copy-paste; confirm before changing
# the runtime string.
1467 self._downloader.trouble(u'ERROR: unable to extract title')
1469 video_uploader = mobj.group(1).decode('utf-8')
1472 # Process video information
1473 self._downloader.process_info({
1474 'id': video_id.decode('utf-8'),
1475 'url': video_url.decode('utf-8'),
1476 'uploader': video_uploader,
1477 'title': video_title,
1478 'stitle': simple_title,
1479 'ext': video_extension.decode('utf-8'),
1482 except UnavailableFormatError:
1483 self._downloader.trouble(u'ERROR: format not available for video')
# NOTE(review): excerpt is missing interlinear lines ('try:' openers,
# 'if mobj is None:'/'if prefix == ...' guards, 'return' statements,
# loop headers and initializers in _download_n_results) — verify notes
# against the complete file. This class is structurally parallel to
# GoogleSearchIE and YahooSearchIE; keep the three in sync when editing.
1486 class YoutubeSearchIE(InfoExtractor):
1487 """Information Extractor for YouTube search queries."""
# Query syntax: 'ytsearch:Q' (first result), 'ytsearchN:Q', 'ytsearchall:Q'.
1488 _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
1489 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
1490 _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
1491 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
# Hard cap on results per query, mirroring YouTube's own search limit.
1493 _max_youtube_results = 1000
1495 def __init__(self, youtube_ie, downloader=None):
1496 InfoExtractor.__init__(self, downloader)
# Delegate actual video extraction to the wrapped YoutubeIE instance.
1497 self._youtube_ie = youtube_ie
# Part of suitable(): handled iff the query matches _VALID_QUERY.
1501 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
1503 def report_download_page(self, query, pagenum):
1504 """Report attempt to download playlist page with given number."""
1505 query = query.decode(preferredencoding())
1506 self._downloader.to_stdout(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1508 def _real_initialize(self):
1509 self._youtube_ie.initialize()
1511 def _real_extract(self, query):
1512 mobj = re.match(self._VALID_QUERY, query)
1514 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
# Split 'ytsearchN' prefix from the search terms.
1517 prefix, query = query.split(':')
1519 query = query.encode('utf-8')
# Empty prefix => default to a single result.
1521 self._download_n_results(query, 1)
1523 elif prefix == 'all':
1524 self._download_n_results(query, self._max_youtube_results)
# Numeric prefix path: n = int(prefix) (conversion line elided in excerpt).
1530 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1532 elif n > self._max_youtube_results:
1533 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1534 n = self._max_youtube_results
1535 self._download_n_results(query, n)
1537 except ValueError: # parsing prefix as integer fails
1538 self._download_n_results(query, 1)
1541 def _download_n_results(self, query, n):
1542 """Downloads a specified number of results for a query"""
# De-duplicate ids across result pages.
1545 already_seen = set()
1549 self.report_download_page(query, pagenum)
1550 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1551 request = urllib2.Request(result_url, None, std_headers)
1553 page = urllib2.urlopen(request).read()
1554 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1555 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1558 # Extract video identifiers
1559 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
# Slice the matched href and pull the v= value, dropping the closing quote.
1560 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
1561 if video_id not in already_seen:
1562 video_ids.append(video_id)
1563 already_seen.add(video_id)
1564 if len(video_ids) == n:
1565 # Specified n videos reached
1566 for id in video_ids:
1567 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
# No "Next" link: last page reached; extract what we have and stop.
1570 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1571 for id in video_ids:
1572 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1575 pagenum = pagenum + 1
# NOTE(review): excerpt is missing interlinear lines ('try:' openers,
# guards, 'return' statements, loop headers) — verify notes against the
# complete file. Structurally parallel to YoutubeSearchIE/YahooSearchIE.
1577 class GoogleSearchIE(InfoExtractor):
1578 """Information Extractor for Google Video search queries."""
# Query syntax: 'gvsearch:Q' (first result), 'gvsearchN:Q', 'gvsearchall:Q'.
1579 _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
# 'start' is a result offset here, unlike the page number used by ytsearch.
1580 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1581 _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
1582 _MORE_PAGES_INDICATOR = r'<span>Next</span>'
1584 _max_google_results = 1000
1586 def __init__(self, google_ie, downloader=None):
1587 InfoExtractor.__init__(self, downloader)
# Delegate actual video extraction to the wrapped GoogleIE instance.
1588 self._google_ie = google_ie
# Part of suitable(): handled iff the query matches _VALID_QUERY.
1592 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
1594 def report_download_page(self, query, pagenum):
1595 """Report attempt to download playlist page with given number."""
1596 query = query.decode(preferredencoding())
1597 self._downloader.to_stdout(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1599 def _real_initialize(self):
1600 self._google_ie.initialize()
1602 def _real_extract(self, query):
1603 mobj = re.match(self._VALID_QUERY, query)
1605 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
# Split 'gvsearchN' prefix from the search terms.
1608 prefix, query = query.split(':')
1610 query = query.encode('utf-8')
# Empty prefix => default to a single result.
1612 self._download_n_results(query, 1)
1614 elif prefix == 'all':
1615 self._download_n_results(query, self._max_google_results)
# Numeric prefix path: n = int(prefix) (conversion line elided in excerpt).
1621 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1623 elif n > self._max_google_results:
1624 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1625 n = self._max_google_results
1626 self._download_n_results(query, n)
1628 except ValueError: # parsing prefix as integer fails
1629 self._download_n_results(query, 1)
1632 def _download_n_results(self, query, n):
1633 """Downloads a specified number of results for a query"""
# De-duplicate ids across result pages.
1636 already_seen = set()
1640 self.report_download_page(query, pagenum)
1641 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1642 request = urllib2.Request(result_url, None, std_headers)
1644 page = urllib2.urlopen(request).read()
1645 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1646 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1649 # Extract video identifiers
1650 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1651 video_id = mobj.group(1)
1652 if video_id not in already_seen:
1653 video_ids.append(video_id)
1654 already_seen.add(video_id)
1655 if len(video_ids) == n:
1656 # Specified n videos reached
1657 for id in video_ids:
1658 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
# No "Next" link: last page reached; extract what we have and stop.
1661 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1662 for id in video_ids:
1663 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
# NOTE(review): pagenum is incremented by 1 but _TEMPLATE_URL's 'start' is a
# result offset, not a page index — confirm whether this should step by the
# page size instead.
1666 pagenum = pagenum + 1
# NOTE(review): excerpt is missing interlinear lines ('try:' openers,
# guards, 'return' statements, loop headers) — verify notes against the
# complete file. Structurally parallel to YoutubeSearchIE/GoogleSearchIE.
1668 class YahooSearchIE(InfoExtractor):
1669 """Information Extractor for Yahoo! Video search queries."""
# Query syntax: 'yvsearch:Q' (first result), 'yvsearchN:Q', 'yvsearchall:Q'.
1670 _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
# 'o' is the result offset parameter for Yahoo! Video search.
1671 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1672 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1673 _MORE_PAGES_INDICATOR = r'\s*Next'
1675 _max_yahoo_results = 1000
1677 def __init__(self, yahoo_ie, downloader=None):
1678 InfoExtractor.__init__(self, downloader)
# Delegate actual video extraction to the wrapped YahooIE instance.
1679 self._yahoo_ie = yahoo_ie
# Part of suitable(): handled iff the query matches _VALID_QUERY.
1683 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
1685 def report_download_page(self, query, pagenum):
1686 """Report attempt to download playlist page with given number."""
1687 query = query.decode(preferredencoding())
1688 self._downloader.to_stdout(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1690 def _real_initialize(self):
1691 self._yahoo_ie.initialize()
1693 def _real_extract(self, query):
1694 mobj = re.match(self._VALID_QUERY, query)
1696 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
# Split 'yvsearchN' prefix from the search terms.
1699 prefix, query = query.split(':')
1701 query = query.encode('utf-8')
# Empty prefix => default to a single result.
1703 self._download_n_results(query, 1)
1705 elif prefix == 'all':
1706 self._download_n_results(query, self._max_yahoo_results)
# Numeric prefix path: n = int(prefix) (conversion line elided in excerpt).
1712 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1714 elif n > self._max_yahoo_results:
1715 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1716 n = self._max_yahoo_results
1717 self._download_n_results(query, n)
1719 except ValueError: # parsing prefix as integer fails
1720 self._download_n_results(query, 1)
1723 def _download_n_results(self, query, n):
1724 """Downloads a specified number of results for a query"""
# De-duplicate ids across result pages.
1727 already_seen = set()
1731 self.report_download_page(query, pagenum)
1732 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1733 request = urllib2.Request(result_url, None, std_headers)
1735 page = urllib2.urlopen(request).read()
1736 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1737 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1740 # Extract video identifiers
1741 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
# group(1) is a compound 'watchid/vid' pair, matching YahooIE's URL form.
1742 video_id = mobj.group(1)
1743 if video_id not in already_seen:
1744 video_ids.append(video_id)
1745 already_seen.add(video_id)
1746 if len(video_ids) == n:
1747 # Specified n videos reached
1748 for id in video_ids:
1749 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
# No "Next" link: last page reached; extract what we have and stop.
1752 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1753 for id in video_ids:
1754 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1757 pagenum = pagenum + 1
# NOTE(review): excerpt is missing interlinear lines ('try:' openers,
# guards, 'return'/'break' statements, loop headers and initializers) —
# verify notes against the complete file.
1759 class YoutubePlaylistIE(InfoExtractor):
1760 """Information Extractor for YouTube playlists."""
# NOTE(review): 'youtube.com' has an unescaped dot ('youtube.com' vs
# 'youtube\.com'), so e.g. 'youtubeXcom' would also match — harmless in
# practice but inconsistent with the escaped patterns elsewhere; confirm.
1762 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists)\?.*?p=|user/.*?/user/)([^&]+).*'
1763 _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
1764 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
1765 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1768 def __init__(self, youtube_ie, downloader=None):
1769 InfoExtractor.__init__(self, downloader)
# Delegate actual video extraction to the wrapped YoutubeIE instance.
1770 self._youtube_ie = youtube_ie
# Part of suitable(): handled iff the URL matches _VALID_URL.
1774 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
1776 def report_download_page(self, playlist_id, pagenum):
1777 """Report attempt to download playlist page with given number."""
1778 self._downloader.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1780 def _real_initialize(self):
1781 self._youtube_ie.initialize()
1783 def _real_extract(self, url):
1784 # Extract playlist id
1785 mobj = re.match(self._VALID_URL, url)
1787 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1790 # Download playlist pages
1791 playlist_id = mobj.group(1)
1796 self.report_download_page(playlist_id, pagenum)
1797 request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
1799 page = urllib2.urlopen(request).read()
1800 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1801 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1804 # Extract video identifiers
# ids_in_page keeps per-page order while de-duplicating within the page.
1806 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1807 if mobj.group(1) not in ids_in_page:
1808 ids_in_page.append(mobj.group(1))
1809 video_ids.extend(ids_in_page)
# No "Next" link: last playlist page reached (break elided in excerpt).
1811 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1813 pagenum = pagenum + 1
# Hand every collected id to the YouTube extractor, in playlist order.
1815 for id in video_ids:
1816 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
# NOTE(review): excerpt is missing interlinear lines ('try:' openers,
# guards, 'return' statements, list initializers) — verify notes against
# the complete file.
1819 class YoutubeUserIE(InfoExtractor):
1820 """Information Extractor for YouTube users."""
# NOTE(review): same unescaped dot in 'youtube.com' as YoutubePlaylistIE —
# confirm whether it should be 'youtube\.com' for consistency.
1822 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)'
# GData API feed listing a user's uploaded videos.
1823 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1824 _VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this.
1827 def __init__(self, youtube_ie, downloader=None):
1828 InfoExtractor.__init__(self, downloader)
# Delegate actual video extraction to the wrapped YoutubeIE instance.
1829 self._youtube_ie = youtube_ie
# Part of suitable(): handled iff the URL matches _VALID_URL.
1833 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
1835 def report_download_page(self, username):
1836 """Report attempt to download user page."""
1837 self._downloader.to_stdout(u'[youtube] user %s: Downloading page ' % (username))
1839 def _real_initialize(self):
1840 self._youtube_ie.initialize()
1842 def _real_extract(self, url):
1844 mobj = re.match(self._VALID_URL, url)
1846 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1849 # Download user page
1850 username = mobj.group(1)
1854 self.report_download_page(username)
1855 request = urllib2.Request(self._TEMPLATE_URL % (username), None, std_headers)
1857 page = urllib2.urlopen(request).read()
1858 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1859 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1862 # Extract video identifiers
# Single-page fetch only: unlike the playlist IE, there is no visible
# pagination loop here (the XXX above suggests the authors knew the
# indicator regex was fragile).
1865 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1866 if mobj.group(1) not in ids_in_page:
1867 ids_in_page.append(mobj.group(1))
1868 video_ids.extend(ids_in_page)
# Hand every collected id to the YouTube extractor.
1870 for id in video_ids:
1871 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1874 class PostProcessor(object):
1875 """Post Processor class.
1877 PostProcessor objects can be added to downloaders with their
1878 add_post_processor() method. When the downloader has finished a
1879 successful download, it will take its internal chain of PostProcessors
1880 and start calling the run() method on each one of them, first with
1881 an initial argument and then with the returned value of the previous
1884 The chain will be stopped if one of them ever returns None or the end
1885 of the chain is reached.
1887 PostProcessor objects follow a "mutual registration" process similar
1888 to InfoExtractor objects.
1893 def __init__(self, downloader=None):
# The owning FileDownloader; may be None until set_downloader() is called.
1894 self._downloader = downloader
1896 def set_downloader(self, downloader):
1897 """Sets the downloader for this PP."""
1898 self._downloader = downloader
1900 def run(self, information):
1901 """Run the PostProcessor.
1903 The "information" argument is a dictionary like the ones
1904 composed by InfoExtractors. The only difference is that this
1905 one has an extra field called "filepath" that points to the
1908 When this method returns None, the postprocessing chain is
1909 stopped. However, this method may return an information
1910 dictionary that will be passed to the next postprocessing
1911 object in the chain. It can be the one it received after
1912 changing some fields.
1914 In addition, this method may raise a PostProcessingError
1915 exception that will be taken into account by the downloader
# Base-class behavior: identity pass-through, so subclasses override run().
1918 return information # by default, do nothing
1920 ### MAIN PROGRAM ###
1921 if __name__ == '__main__':
1923 # Modules needed only when running the main program
1927 # Function to update the program file with the latest version from bitbucket.org
1928 def update_self(downloader, filename):
1929 # Note: downloader only used for options
1930 if not os.access (filename, os.W_OK):
1931 sys.exit('ERROR: no write permissions on %s' % filename)
1933 downloader.to_stdout('Updating to latest stable version...')
1934 latest_url = 'http://bitbucket.org/rg3/youtube-dl/raw/tip/LATEST_VERSION'
1935 latest_version = urllib.urlopen(latest_url).read().strip()
1936 prog_url = 'http://bitbucket.org/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
1937 newcontent = urllib.urlopen(prog_url).read()
1938 stream = open(filename, 'w')
1939 stream.write(newcontent)
1941 downloader.to_stdout('Updated to version %s' % latest_version)
1943 # General configuration
1944 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
1945 urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
1946 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
1948 # Parse command line
1949 parser = optparse.OptionParser(
1950 usage='Usage: %prog [options] url...',
1951 version='2010.04.04',
1952 conflict_handler='resolve',
1955 parser.add_option('-h', '--help',
1956 action='help', help='print this help text and exit')
1957 parser.add_option('-v', '--version',
1958 action='version', help='print program version and exit')
1959 parser.add_option('-U', '--update',
1960 action='store_true', dest='update_self', help='update this program to latest stable version')
1961 parser.add_option('-i', '--ignore-errors',
1962 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
1963 parser.add_option('-r', '--rate-limit',
1964 dest='ratelimit', metavar='L', help='download rate limit (e.g. 50k or 44.6m)')
1966 authentication = optparse.OptionGroup(parser, 'Authentication Options')
1967 authentication.add_option('-u', '--username',
1968 dest='username', metavar='UN', help='account username')
1969 authentication.add_option('-p', '--password',
1970 dest='password', metavar='PW', help='account password')
1971 authentication.add_option('-n', '--netrc',
1972 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
1973 parser.add_option_group(authentication)
1975 video_format = optparse.OptionGroup(parser, 'Video Format Options')
1976 video_format.add_option('-f', '--format',
1977 action='store', dest='format', metavar='FMT', help='video format code')
1978 video_format.add_option('-b', '--best-quality',
1979 action='store_const', dest='format', help='download the best quality video possible', const='0')
1980 video_format.add_option('-m', '--mobile-version',
1981 action='store_const', dest='format', help='alias for -f 17', const='17')
1982 video_format.add_option('-d', '--high-def',
1983 action='store_const', dest='format', help='alias for -f 22', const='22')
1984 video_format.add_option('--all-formats',
1985 action='store_const', dest='format', help='download all available video formats', const='-1')
1986 parser.add_option_group(video_format)
# Output verbosity and "simulate only" switches. Every option in this
# group is a plain boolean flag that defaults to off, so they are
# declared table-driven; declaration order fixes --help order.
verb_group = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
for flags, dest, text in (
		(('-q', '--quiet'), 'quiet', 'activates quiet mode'),
		(('-s', '--simulate'), 'simulate', 'do not download video'),
		(('-g', '--get-url'), 'geturl', 'simulate, quiet but print URL'),
		(('-e', '--get-title'), 'gettitle', 'simulate, quiet but print title'),
		(('--get-thumbnail',), 'getthumbnail', 'simulate, quiet but print thumbnail URL'),
		(('--get-description',), 'getdescription', 'simulate, quiet but print video description'),
		(('--no-progress',), 'noprogress', 'do not print progress bar'),
		):
	verb_group.add_option(*flags, action='store_true', dest=dest,
			default=False, help=text)
parser.add_option_group(verb_group)
# How downloaded files are named and how existing files are treated.
fs_group = optparse.OptionGroup(parser, 'Filesystem Options')
fs_group.add_option('-t', '--title',
		dest='usetitle', default=False, action='store_true',
		help='use title in file name')
fs_group.add_option('-l', '--literal',
		dest='useliteral', default=False, action='store_true',
		help='use literal title in file name')
fs_group.add_option('-o', '--output',
		dest='outtmpl', metavar='TPL', help='output filename template')
fs_group.add_option('-a', '--batch-file',
		dest='batchfile', metavar='F', help='file containing URLs to download')
fs_group.add_option('-w', '--no-overwrites',
		dest='nooverwrites', default=False, action='store_true',
		help='do not overwrite files')
fs_group.add_option('-c', '--continue',
		dest='continue_dl', default=False, action='store_true',
		help='resume partially downloaded files')
parser.add_option_group(fs_group)
(opts, args) = parser.parse_args()

# Batch file verification: one URL per line, blank lines ignored.
# NOTE(review): the extracted text showed an unconditional sys.exit and no
# IOError guard; the guard is restored here to match the error message.
# The file handle is also closed explicitly (the original leaked it), and
# batchurls is initialized so it is defined even without --batch-file.
batchurls = []
if opts.batchfile is not None:
	try:
		batchfd = open(opts.batchfile, 'r')
		try:
			batchurls = [x.strip() for x in batchfd.readlines()]
		finally:
			batchfd.close()
		batchurls = [x for x in batchurls if len(x) > 0]
	except IOError:
		sys.exit(u'ERROR: batch file could not be read')
all_urls = batchurls + args
# Conflicting, missing and erroneous options: reject impossible
# combinations before any network or filesystem work happens.
have_username = opts.username is not None
have_password = opts.password is not None
if opts.usenetrc and (have_username or have_password):
	parser.error(u'using .netrc conflicts with giving username/password')
if have_password and not have_username:
	parser.error(u'account username missing')
if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
	parser.error(u'using output template conflicts with using title or literal title')
if opts.usetitle and opts.useliteral:
	parser.error(u'using title conflicts with using literal title')

# A username without a password means we prompt for it interactively.
if have_username and not have_password:
	opts.password = getpass.getpass(u'Type account password and press return:')

# Convert the human-readable rate limit (e.g. "50k") to bytes/second.
if opts.ratelimit is not None:
	parsed_limit = FileDownloader.parse_bytes(opts.ratelimit)
	if parsed_limit is None:
		parser.error(u'invalid rate limit specified')
	opts.ratelimit = parsed_limit
# Information extractors: one instance per supported site/service.
# Some extractors delegate to others (e.g. Metacafe and the search/
# playlist extractors reuse the YouTube extractor), hence the sharing.
youtube_ie = YoutubeIE()
metacafe_ie = MetacafeIE(youtube_ie)
youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
youtube_user_ie = YoutubeUserIE(youtube_ie)
youtube_search_ie = YoutubeSearchIE(youtube_ie)
google_ie = GoogleIE()
google_search_ie = GoogleSearchIE(google_ie)
photobucket_ie = PhotobucketIE()
yahoo_ie = YahooIE()
yahoo_search_ie = YahooSearchIE(yahoo_ie)
generic_ie = GenericIE()

# File downloader configuration. NOTE(review): the closing '})' was
# missing in the extracted text and is restored here.
fd = FileDownloader({
	'usenetrc': opts.usenetrc,
	'username': opts.username,
	'password': opts.password,
	# Any of the "get and print" switches implies quiet + simulate.
	'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
	'forceurl': opts.geturl,
	'forcetitle': opts.gettitle,
	'forcethumbnail': opts.getthumbnail,
	'forcedescription': opts.getdescription,
	'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
	'format': opts.format,
	# and/or chain picks the first applicable template: an explicit -o
	# wins, then --all-formats variants, then title-based names.
	'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
		or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
		or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
		or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
		or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
		or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
		or u'%(id)s.%(ext)s'),
	'ignoreerrors': opts.ignoreerrors,
	'ratelimit': opts.ratelimit,
	'nooverwrites': opts.nooverwrites,
	'continuedl': opts.continue_dl,
	'noprogress': opts.noprogress,
	})

# Register the extractors. Registration order is significant: more
# specific matchers are tried before more general ones.
for extractor in (youtube_search_ie, youtube_pl_ie, youtube_user_ie,
		metacafe_ie, youtube_ie, google_ie, google_search_ie,
		photobucket_ie, yahoo_ie, yahoo_search_ie):
	fd.add_info_extractor(extractor)

# This must come last since it's the
# fallback if none of the others work
fd.add_info_extractor(generic_ie)
# Update this program to the latest released version, if requested.
if opts.update_self:
	update_self(fd, sys.argv[0])

# With no URLs at all: a plain invocation is an error, but a pure
# --update-self run is complete at this point and exits cleanly.
# NOTE(review): the 'else: sys.exit()' branch and the final
# sys.exit(retcode) were missing from the extracted text; without them a
# bare --update-self would fall through into fd.download([]) and the
# download return code would be silently discarded.
if len(all_urls) < 1:
	if not opts.update_self:
		parser.error(u'you must provide at least one URL')
	else:
		sys.exit()

retcode = fd.download(all_urls)
sys.exit(retcode)
2116 except DownloadError:
2118 except SameFileError:
2119 sys.exit(u'ERROR: fixed output name but more than one file to download')
2120 except KeyboardInterrupt:
2121 sys.exit(u'\nERROR: Interrupted by user')