2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # Author: Benjamin Johnson
6 # License: Public domain code
23 # parse_qs was moved from the cgi module to the urlparse module recently.
25 from urlparse import parse_qs
27 from cgi import parse_qs
30 'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.2) Gecko/20100115 Firefox/3.6',
31 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
32 'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
33 'Accept-Language': 'en-us,en;q=0.5',
# Characters permitted in a "simplified" title: ASCII letters and digits only
# (everything else is collapsed to '_' by the extractors).
# NOTE(review): str.decode() on these constants is Python 2 only.
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    # NOTE(review): interior lines of this helper are missing from this
    # view (presumably a try/except fallback and a 'while True: yield'
    # loop) -- do not assume this excerpt is complete.
    def yield_preferredencoding():
        pref = locale.getpreferredencoding()
    # .next() pulls the first value produced by the generator.
    return yield_preferredencoding().next()
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a Unicode character.

    This function receives a match object and is intended to be used with
    the re.sub() function.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in htmlentitydefs.name2codepoint:
        return unichr(htmlentitydefs.name2codepoint[entity])

    # Numeric character reference (&#NNN; or &#xNNN;)
    # NOTE(review): the `mobj is not None` guard and the base selection
    # (10 vs. 16) are on lines not visible in this view.
    # NOTE(review): `\d` matches decimal digits only, so hex entities
    # containing a-f (e.g. &#xAF;) will not fully match this pattern --
    # verify against upstream before relying on it.
    mobj = re.match(ur'(?u)#(x?\d+)', entity)
        numstr = mobj.group(1)
        if numstr.startswith(u'x'):
            # Prefix with '0' so long(numstr, 16) accepts the '0x...' form.
            numstr = u'0%s' % numstr
        return unichr(long(numstr, base))

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
80 def sanitize_title(utitle):
81 """Sanitizes a video title so it could be used as part of a filename."""
82 utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
83 return utitle.replace(unicode(os.sep), u'%')
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    # NOTE(review): the guard selecting stdout (presumably for the
    # filename u'-') and the `try:` opener are not visible in this view.
        return (sys.stdout, filename)
        stream = open(filename, open_mode)
        return (stream, filename)
    except (IOError, OSError), err:
        # In case of error, try to remove win32 forbidden chars
        filename = re.sub(ur'[<>:"\|\?\*]', u'#', filename)

        # An exception here should be caught in the caller
        stream = open(filename, open_mode)
        return (stream, filename)
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
class UnavailableFormatError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    # Both arguments are byte counts, kept so the caller can build a
    # meaningful error report.
    def __init__(self, downloaded, expected):
        self.downloaded = downloaded
        self.expected = expected
class FileDownloader(object):
    """File Downloader class.

    File downloader objects are the ones responsible of downloading the
    actual video file and writing it to disk if the user has requested
    it, among some other tasks. In most cases there should be one per
    program. As, given a video URL, the downloader doesn't know how to
    extract all the needed information, task that InfoExtractors do, it
    has to pass the URL to one of them.

    For this, file downloader objects have a method that allows
    InfoExtractors to be registered in a given order. When it is passed
    a URL, the file downloader handles it to the first InfoExtractor it
    finds that reports being able to handle it. The InfoExtractor extracts
    all the information about the video or videos the URL refers to, and
    asks the FileDownloader to process the video information, possibly
    downloading the video.

    File downloaders accept a lot of parameters. In order not to saturate
    the object constructor with arguments, it receives a dictionary of
    options instead. These options are available through the params
    attribute for the InfoExtractors to use. The FileDownloader also
    registers itself as the downloader in charge for the InfoExtractors
    that are added to it, so this is a "mutual registration".

    Available options:

    username: Username for authentication purposes.
    password: Password for authentication purposes.
    usenetrc: Use netrc for authentication instead.
    quiet: Do not print messages to stdout.
    forceurl: Force printing final URL.
    forcetitle: Force printing title.
    simulate: Do not download the video files.
    format: Video format code.
    outtmpl: Template for output names.
    ignoreerrors: Do not stop on download errors.
    ratelimit: Download speed limit, in bytes/sec.
    nooverwrites: Prevent overwriting files.
    continuedl: Try to continue downloads if possible.
    noprogress: Do not print the progress bar.
    """

    # NOTE(review): this excerpt is a lossy sample of the original class;
    # many control-flow lines (guards, returns, try: openers, loop
    # headers) are missing. Comments below mark only the larger gaps.

    # Per-instance state, reset in __init__: process exit code and number
    # of files downloaded so far (the latter feeds the %(ord)s template).
    _download_retcode = None
    _num_downloads = None

    def __init__(self, params):
        """Create a FileDownloader object with the given options."""
        # NOTE(review): initialization of the InfoExtractor/PostProcessor
        # lists and the storage of `params` on self are not visible here.
        self._download_retcode = 0
        self._num_downloads = 0

    def pmkdir(filename):
        """Create directory components in filename. Similar to Unix "mkdir -p"."""
        # NOTE(review): takes no `self`; presumably wrapped as a
        # staticmethod by a decorator line not visible in this view.
        components = filename.split(os.sep)
        aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
        aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
        for dir in aggregate:
            if not os.path.exists(dir):
                # NOTE(review): the directory-creation call is not visible here.

    def format_bytes(bytes):
        # Render a byte count like '1.23M' using 1024-based units.
        if type(bytes) is str:
            # NOTE(review): the handling of string input is not visible here.
        exponent = long(math.log(bytes, 1024.0))
        suffix = 'bkMGTPEZY'[exponent]
        converted = float(bytes) / float(1024**exponent)
        return '%.2f%s' % (converted, suffix)

    def calc_percent(byte_counter, data_len):
        # Right-aligned 6-char percentage string, e.g. ' 42.0%'.
        # NOTE(review): the data_len-is-None guard is not visible here.
        return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))

    def calc_eta(start, now, total, current):
        # Estimate time remaining, rendered as 'MM:SS'.
        # NOTE(review): the computation of `dif` (presumably now - start)
        # is on a line not visible in this view.
        if current == 0 or dif < 0.001: # One millisecond
        rate = float(current) / dif
        eta = long((float(total) - float(current)) / rate)
        (eta_mins, eta_secs) = divmod(eta, 60)
        return '%02d:%02d' % (eta_mins, eta_secs)

    def calc_speed(start, now, bytes):
        # Right-aligned 10-char speed string, e.g. '  1.23Mb/s'.
        # NOTE(review): the computation of `dif` is not visible here.
        if bytes == 0 or dif < 0.001: # One millisecond
            return '%10s' % '---b/s'
        return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))

    def best_block_size(elapsed_time, bytes):
        # Adapt the next read size to the throughput of the last block.
        new_min = max(bytes / 2.0, 1.0)
        new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
        if elapsed_time < 0.001:
        rate = bytes / elapsed_time
        # NOTE(review): the clamping of `rate` into [new_min, new_max] and
        # the return statements are not visible in this view.

    def parse_bytes(bytestr):
        """Parse a string indicating a byte quantity into a long integer."""
        matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
        # NOTE(review): the matchobj-is-None guard is not visible here.
        number = float(matchobj.group(1))
        # Empty suffix maps to index 0 ('b'), i.e. multiplier 1.
        multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
        return long(round(number * multiplier))

        # NOTE(review): the enclosing `def verify_url(...)` line and the
        # code consuming `data` are not visible in this view.
        """Verify a URL is valid and data could be downloaded. Return real data URL."""
        request = urllib2.Request(url, None, std_headers)
        data = urllib2.urlopen(request)

    def add_info_extractor(self, ie):
        """Add an InfoExtractor object to the end of the list."""
        # NOTE(review): the append to the internal list is not visible.
        # Mutual registration: the IE learns which downloader owns it.
        ie.set_downloader(self)

    def add_post_processor(self, pp):
        """Add a PostProcessor object to the end of the chain."""
        # NOTE(review): the append to the internal chain is not visible.
        pp.set_downloader(self)

    def to_stdout(self, message, skip_eol=False, ignore_encoding_errors=False):
        """Print message to stdout if not in quiet mode."""
        # NOTE(review): the `try:` opener for the except below is not
        # visible in this view.
        if not self.params.get('quiet', False):
            print (u'%s%s' % (message, [u'\n', u''][skip_eol])).encode(preferredencoding()),
        except (UnicodeEncodeError), err:
            if not ignore_encoding_errors:
                # NOTE(review): the re-raise is not visible here.

    def to_stderr(self, message):
        """Print message to stderr."""
        print >>sys.stderr, message.encode(preferredencoding())

    def fixed_template(self):
        """Checks if the output template is fixed."""
        # True when outtmpl contains no %(field)s substitutions, i.e. it
        # can only ever name one file.
        return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)

    def trouble(self, message=None):
        """Determine action to take when a download problem appears.

        Depending on if the downloader has been configured to ignore
        download errors or not, this method may throw an exception or
        not when errors are found, after printing the message.
        """
        if message is not None:
            self.to_stderr(message)
        if not self.params.get('ignoreerrors', False):
            raise DownloadError(message)
        # Remember the failure so download() can report a nonzero code.
        self._download_retcode = 1

    def slow_down(self, start_time, byte_counter):
        """Sleep if the download speed is over the rate limit."""
        rate_limit = self.params.get('ratelimit', None)
        if rate_limit is None or byte_counter == 0:
        # NOTE(review): the early return above and the assignment of `now`
        # are on lines not visible in this view.
        elapsed = now - start_time
        speed = float(byte_counter) / elapsed
        if speed > rate_limit:
            # Sleep just long enough to fall back under the limit.
            time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)

    def report_destination(self, filename):
        """Report destination filename."""
        self.to_stdout(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)

    def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
        """Report download progress."""
        if self.params.get('noprogress', False):
            # NOTE(review): the early return is not visible in this view.
        self.to_stdout(u'\r[download] %s of %s at %s ETA %s' %
                (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)

    def report_resuming_byte(self, resume_len):
        """Report attempt to resume at given byte."""
        self.to_stdout(u'[download] Resuming download at byte %s' % resume_len)

    def report_file_already_downloaded(self, file_name):
        """Report file has already been fully downloaded."""
        # NOTE(review): the `try:` opener is not visible in this view.
            self.to_stdout(u'[download] %s has already been downloaded' % file_name)
        except (UnicodeEncodeError), err:
            # Fall back to a filename-free message for unencodable names.
            self.to_stdout(u'[download] The file has already been downloaded')

    def report_unable_to_resume(self):
        """Report it was impossible to resume download."""
        self.to_stdout(u'[download] Unable to resume')

    def report_finish(self):
        """Report download finished."""
        if self.params.get('noprogress', False):
            self.to_stdout(u'[download] Download completed')
        # NOTE(review): the else branch (terminating the progress line) is
        # not visible in this view.

    def process_info(self, info_dict):
        """Process a single dictionary returned by an InfoExtractor."""
        # Do nothing else if in simulate mode
        if self.params.get('simulate', False):
            # Verify URL if it's an HTTP one
            if info_dict['url'].startswith('http'):
                # NOTE(review): `try:` opener not visible here.
                    self.verify_url(info_dict['url'].encode('utf-8')).decode('utf-8')
                except (OSError, IOError, urllib2.URLError, httplib.HTTPException, socket.error), err:
                    raise UnavailableFormatError

            # Forced printings
            if self.params.get('forcetitle', False):
                print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
            if self.params.get('forceurl', False):
                print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
            if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
                print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
            if self.params.get('forcedescription', False) and 'description' in info_dict:
                print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
            # NOTE(review): the return ending simulate mode is not visible.

        # Build the output filename from the template.
        # NOTE(review): the `try:` opener is not visible here.
            template_dict = dict(info_dict)
            template_dict['epoch'] = unicode(long(time.time()))
            template_dict['ord'] = unicode('%05d' % self._num_downloads)
            filename = self.params['outtmpl'] % template_dict
        except (ValueError, KeyError), err:
            self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
        if self.params.get('nooverwrites', False) and os.path.exists(filename):
            self.to_stderr(u'WARNING: file exists: %s; skipping' % filename)
            # NOTE(review): the return is not visible here.

        # NOTE(review): `try:` opener not visible here.
            self.pmkdir(filename)
        except (OSError, IOError), err:
            self.trouble('ERROR: unable to create directories: %s' % str(err))

        # NOTE(review): `try:` opener not visible here.
            success = self._do_download(filename, info_dict['url'].encode('utf-8'))
        except (OSError, IOError), err:
            raise UnavailableFormatError
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self.trouble('ERROR: unable to download video data: %s' % str(err))
        except (ContentTooShortError, ), err:
            self.trouble('ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))

        # NOTE(review): the `if success:` / `try:` lines guarding
        # postprocessing are not visible here.
                self.post_process(filename, info_dict)
            except (PostProcessingError), err:
                self.trouble('ERROR: postprocessing: %s' % str(err))

    def download(self, url_list):
        """Download a given list of URLs."""
        # A fixed (substitution-free) template can only name one file.
        if len(url_list) > 1 and self.fixed_template():
            raise SameFileError(self.params['outtmpl'])

        # NOTE(review): the `for url in url_list:` / `for ie in ...:` loop
        # headers and several flow-control lines (continue/break and the
        # ie.extract(url) call) are not visible in this view.
            suitable_found = False
                # Go to next InfoExtractor if not suitable
                if not ie.suitable(url):

                # Suitable InfoExtractor found
                suitable_found = True

                # Extract information from URL and process it

                # Suitable InfoExtractor had been found; go to next URL

            if not suitable_found:
                self.trouble('ERROR: no suitable InfoExtractor: %s' % url)

        return self._download_retcode

    def post_process(self, filename, ie_info):
        """Run the postprocessing chain on the given file."""
        # NOTE(review): the copy of ie_info into `info` and the loop over
        # the post-processor chain are not visible in this view.
        info['filepath'] = filename

    def _download_with_rtmpdump(self, filename, url):
        # Download an RTMP stream by shelling out to the rtmpdump tool.
        self.report_destination(filename)

        # Check for rtmpdump first
        # NOTE(review): `try:` opener not visible here.
            subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
        except (OSError, IOError):
            self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')

        # Download using rtmpdump. rtmpdump returns exit code 2 when
        # the connection was interrupted and resuming appears to be
        # possible. This is part of rtmpdump's normal usage, AFAIK.
        basic_args = ['rtmpdump', '-q', '-r', url, '-o', filename]
        retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
        while retval == 2 or retval == 1:
            self.to_stdout(u'\r[rtmpdump] %s bytes' % os.path.getsize(filename), skip_eol=True)
            time.sleep(2.0) # This seems to be needed
            # Pass '-k 1' only on a code-1 retry; '-e' always resumes.
            retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
        # NOTE(review): the `if retval == 0:` / else structure around the
        # two lines below is not visible in this view.
            self.to_stdout(u'\r[rtmpdump] %s bytes' % os.path.getsize(filename))
            self.trouble('\nERROR: rtmpdump exited with code %d' % retval)

    def _do_download(self, filename, url):
        # Download `url` into `filename` over HTTP, resuming when
        # possible; RTMP URLs are delegated to rtmpdump.
        # Attempt to download using rtmpdump
        if url.startswith('rtmp'):
            return self._download_with_rtmpdump(filename, url)

        # NOTE(review): initialization of the stream handle, open mode and
        # resume bookkeeping is not fully visible in this view.
        basic_request = urllib2.Request(url, None, std_headers)
        request = urllib2.Request(url, None, std_headers)

        # Establish possible resume length
        if os.path.isfile(filename):
            resume_len = os.path.getsize(filename)
        # NOTE(review): the else branch (resume_len of zero) is not visible.

        # Request parameters in case of being able to resume
        if self.params.get('continuedl', False) and resume_len != 0:
            self.report_resuming_byte(resume_len)
            request.add_header('Range','bytes=%d-' % resume_len)

        # Establish connection
        # NOTE(review): `try:` opener not visible here.
            data = urllib2.urlopen(request)
        except (urllib2.HTTPError, ), err:
            if err.code != 416: # 416 is 'Requested range not satisfiable'
                # NOTE(review): the re-raise is not visible here.
            # Unable to resume: retry the plain request from scratch.
            data = urllib2.urlopen(basic_request)
            content_length = data.info()['Content-Length']
            if content_length is not None and long(content_length) == resume_len:
                # Because the file had already been fully downloaded
                self.report_file_already_downloaded(filename)
                # Because the server didn't let us
                self.report_unable_to_resume()

        data_len = data.info().get('Content-length', None)
        data_len_str = self.format_bytes(data_len)
        # NOTE(review): byte_counter/block_size initialization, the
        # download loop header and the before/after timestamps are not
        # visible in this view.
            data_block = data.read(block_size)
            data_block_len = len(data_block)
            if data_block_len == 0:
            byte_counter += data_block_len

            # Open file just in time
            # NOTE(review): the `stream is None` guard and `try:` opener
            # are not visible here.
                (stream, filename) = sanitize_open(filename, open_mode)
                self.report_destination(filename)
                self._num_downloads += 1
            except (OSError, IOError), err:
                self.trouble('ERROR: unable to open for writing: %s' % str(err))
            stream.write(data_block)
            block_size = self.best_block_size(after - before, data_block_len)

            # Progress message
            percent_str = self.calc_percent(byte_counter, data_len)
            eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
            speed_str = self.calc_speed(start, time.time(), byte_counter)
            self.report_progress(percent_str, data_len_str, speed_str, eta_str)

            # Apply rate limit
            self.slow_down(start, byte_counter)

        # Content-Length is compared as a string against the counter here.
        if data_len is not None and str(byte_counter) != data_len:
            raise ContentTooShortError(byte_counter, long(data_len))
        # NOTE(review): the final success return is not visible here.
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information from the video (or videos) the URL refers to. This
    information includes the real video URL, the video title and simplified
    title, author and others. The information is stored in a dictionary
    which is then passed to the FileDownloader. The FileDownloader
    processes this information possibly downloading the video to the file
    system, among other possible outcomes. The dictionaries must include
    the following fields:

    id: Video identifier.
    url: Final video URL.
    uploader: Nickname of the video uploader.
    title: Literal title.
    stitle: Simplified title.
    ext: Video filename extension.
    format: Video format.

    The following fields are optional. Their primary purpose is to allow
    youtube-dl to serve as the backend for a video search function, such
    as the one in youtube2mp3. They are only used when their respective
    forced printing functions are called:

    thumbnail: Full URL to a video thumbnail image.
    description: One-line video description.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods, as well as the suitable() static method.
    Probably, they should also be instantiated and added to the main
    downloader.
    """

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        # NOTE(review): an initialization line (presumably a readiness
        # flag) is not visible in this view.
        self.set_downloader(downloader)

    # NOTE(review): the decorator/def lines of the static suitable()
    # method are not visible in this view.
        """Receives a URL and returns True if suitable for this IE."""

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # NOTE(review): the guard around this call (to run it only once)
        # is not visible in this view.
        self._real_initialize()

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        # NOTE(review): the initialize() call is not visible in this view.
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # NOTE(review): this excerpt is a lossy sample of the original class;
    # many control-flow lines (guards, returns, try: openers, loop
    # headers) are missing. Comments below mark only the larger gaps.

    _VALID_URL = r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?[\?#](?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
    _LANG_URL = r'http://uk.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    _available_formats = ['37', '22', '35', '18', '34', '5', '17', '13', None] # listed in order of priority for -b flag
    # NOTE(review): the entries and closing brace of _video_extensions
    # are not visible in this view.
    _video_extensions = {

    # NOTE(review): the decorator/def lines of the static suitable()
    # method are not visible in this view.
        return (re.match(YoutubeIE._VALID_URL, url) is not None)

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_stdout(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_stdout(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_stdout(u'[youtube] Confirming age')

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_stdout(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_stdout(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self._downloader.to_stdout(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_stdout(u'[youtube] RTMP download detected')

    def _real_initialize(self):
        # Set interface language, then optionally log in and confirm age.
        if self._downloader is None:
            # NOTE(review): the early return is not visible here.

        # NOTE(review): the username/password initialization preceding
        # this lookup is not visible here.
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            # NOTE(review): the `try:` opener and the unpacking of `info`
            # are not visible here.
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))

        # Set language
        request = urllib2.Request(self._LANG_URL, None, std_headers)
        # NOTE(review): the `try:` opener and report_lang() call are not
        # visible here.
            urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))

        # No authentication to be performed
        # NOTE(review): the guard returning early when no username is set
        # is not visible here.

        # Log in
        # NOTE(review): the `login_form = {` opener, one form field and
        # the closing brace are not visible here.
            'current_form': 'loginForm',
            'action_login': 'Log In',
            'username': username,
            'password': password,
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
        # NOTE(review): the `try:` opener and report_login() call are not
        # visible here.
            login_results = urllib2.urlopen(request).read()
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))

        # Confirm age
        # NOTE(review): the `age_form = {` opener, sibling fields and the
        # closing brace are not visible here.
            'action_confirm': 'Confirm',
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
        # NOTE(review): the `try:` opener is not visible here.
            self.report_age_confirmation()
            age_results = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))

    def _real_extract(self, url):
        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `mobj is None` guard is not visible here.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(2)

        # Downloader parameters
        # NOTE(review): initialization of best_quality/all_formats/
        # format_param/quality_index is not visible here.
        if self._downloader is not None:
            params = self._downloader.params
            format_param = params.get('format', None)
            if format_param == '0':
                format_param = self._available_formats[quality_index]
            elif format_param == '-1':
                format_param = self._available_formats[quality_index]

        # NOTE(review): the retry loop header wrapping the rest of this
        # method is not visible here.
            video_extension = self._video_extensions.get(format_param, 'flv')

            # Get video info
            self.report_video_info_webpage_download(video_id)
            for el_type in ['embedded', 'detailpage', 'vevo']:
                video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s&el=%s&ps=default&eurl=&gl=US&hl=en'
                        % (video_id, el_type))
                request = urllib2.Request(video_info_url, None, std_headers)
                # NOTE(review): `try:` opener and the loop break are not
                # visible here.
                    video_info_webpage = urllib2.urlopen(request).read()
                    video_info = parse_qs(video_info_webpage)
                    if 'token' in video_info:
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                    self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
            self.report_information_extraction(video_id)

            # "token" parameter
            if 'token' not in video_info:
                # Attempt to see if YouTube has issued an error message
                if 'reason' not in video_info:
                    self._downloader.trouble(u'ERROR: unable to extract "t" parameter for unknown reason')
                    # Dump the raw response for a bug report.
                    stream = open('reportme-ydl-%s.dat' % time.time(), 'wb')
                    stream.write(video_info_webpage)
                # NOTE(review): the stream.close() and else branch lines
                # are not visible here.
                    reason = urllib.unquote_plus(video_info['reason'][0])
                    self._downloader.trouble(u'ERROR: YouTube said: %s' % reason.decode('utf-8'))
            token = urllib.unquote_plus(video_info['token'][0])
            video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s&eurl=&el=detailpage&ps=default&gl=US&hl=en' % (video_id, token)
            if format_param is not None:
                video_real_url = '%s&fmt=%s' % (video_real_url, format_param)

            # Check possible RTMP download
            if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
                self.report_rtmp_download()
                video_real_url = video_info['conn'][0]

            # uploader
            if 'author' not in video_info:
                self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            video_uploader = urllib.unquote_plus(video_info['author'][0])

            # title
            if 'title' not in video_info:
                self._downloader.trouble(u'ERROR: unable to extract video title')
            video_title = urllib.unquote_plus(video_info['title'][0])
            video_title = video_title.decode('utf-8')
            video_title = sanitize_title(video_title)

            # simplified title
            simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
            simple_title = simple_title.strip(ur'_')

            # thumbnail image
            if 'thumbnail_url' not in video_info:
                self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            # NOTE(review): the fallback assignment of video_thumbnail is
            # not visible here.
            else: # don't panic if we can't find it
                video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

            # get video description
            video_description = 'No description available.' # we need something to pass to self._downloader
            # this requires an additional HTTP request and a little
            # more time, so don't do it unless absolutely necessary
            if self._downloader.params.get('forcedescription', False):
                video_page_url = 'http://www.youtube.com/watch?v=' + video_id
                request = urllib2.Request(video_page_url, None, std_headers)
                # NOTE(review): `try:` opener and the `mobj is not None`
                # guard are not visible here.
                    video_page_webpage = urllib2.urlopen(request).read()
                    mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_page_webpage)
                        video_description = mobj.group(1)
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                    pass # don't panic if we can't find it

                # Process video information
                self._downloader.process_info({
                    'id': video_id.decode('utf-8'),
                    'url': video_real_url.decode('utf-8'),
                    'uploader': video_uploader.decode('utf-8'),
                    'title': video_title,
                    'stitle': simple_title,
                    'ext': video_extension.decode('utf-8'),
                    'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                    'thumbnail': video_thumbnail.decode('utf-8'),
                    'description': video_description.decode('utf-8'),
                # NOTE(review): the dict closing brace and the all-formats
                # advancement logic around the lines below are not visible.
                if quality_index == len(self._available_formats) - 1:
                    format_param = self._available_formats[quality_index]
                    if format_param == None:
            except UnavailableFormatError, err:
                if best_quality or all_formats:
                    if quality_index == len(self._available_formats) - 1:
                        # I don't ever expect this to happen
                        self._downloader.trouble(u'ERROR: no known formats available for video')
                    # Requested format failed; advance to the next one.
                    self.report_unavailable_format(video_id, format_param)
                    format_param = self._available_formats[quality_index]
                    if format_param == None:
                    self._downloader.trouble('ERROR: format not available for video')
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    # NOTE(review): this excerpt is a lossy sample of the original class;
    # several control-flow lines (guards, returns, try: openers) are
    # missing. Comments below mark only the larger gaps.

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'

    def __init__(self, youtube_ie, downloader=None):
        """Constructor. Keeps a YoutubeIE to delegate yt-prefixed ids to."""
        InfoExtractor.__init__(self, downloader)
        self._youtube_ie = youtube_ie

    # NOTE(review): the decorator/def lines of the static suitable()
    # method are not visible in this view.
        return (re.match(MetacafeIE._VALID_URL, url) is not None)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_stdout(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_stdout(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_stdout(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_stdout(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        # Retrieve disclaimer
        request = urllib2.Request(self._DISCLAIMER, None, std_headers)
        # NOTE(review): `try:` opener not visible here.
            self.report_disclaimer()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))

        # Confirm age
        # NOTE(review): the `disclaimer_form = {` opener, sibling fields
        # and the closing brace are not visible here.
            'submit': "Continue - I'm over 18",
        request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
        # NOTE(review): `try:` opener not visible here.
            self.report_age_confirmation()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `mobj is None` guard is not visible here.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # Delegate to the YouTube extractor for yt-prefixed ids.
            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))

        simple_title = mobj.group(2).decode('utf-8')
        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
        # NOTE(review): `try:` opener not visible here.
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        # NOTE(review): the `mobj is None` guard is not visible here.
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = urllib.unquote(mobj.group(1))

        #mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
        # self._downloader.trouble(u'ERROR: unable to extract gdaKey')
        #gdaKey = mobj.group(1)
        #video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)

        video_url = mediaURL

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        # NOTE(review): the `mobj is None` guard is not visible here.
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)

        mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
        # NOTE(review): the `mobj is None` guard is not visible here.
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # Process video information
        # NOTE(review): the `try:` opener, the 'format' entry and the dict
        # closing brace are not visible here.
        self._downloader.process_info({
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension.decode('utf-8'),
        except UnavailableFormatError:
            self._downloader.trouble(u'ERROR: format not available for video')
class GoogleIE(InfoExtractor):
    """Information extractor for video.google.com."""

    _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    # NOTE(review): the decorator/def lines of the static suitable()
    # method are not visible in this view.
        return (re.match(GoogleIE._VALID_URL, url) is not None)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_stdout(u'[video.google] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_stdout(u'[video.google] %s: Extracting information' % video_id)

    def _real_initialize(self):
        # NOTE(review): the body of this method is not visible in this
        # view.
1076 def _real_extract(self, url):
1077 # Extract id from URL
1078 mobj = re.match(self._VALID_URL, url)
1080 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1083 video_id = mobj.group(1)
1085 video_extension = 'mp4'
1087 # Retrieve video webpage to extract further information
1088 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1090 self.report_download_webpage(video_id)
1091 webpage = urllib2.urlopen(request).read()
1092 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1093 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1096 # Extract URL, uploader, and title from webpage
1097 self.report_extraction(video_id)
1098 mobj = re.search(r"download_url:'([^']+)'", webpage)
1100 video_extension = 'flv'
1101 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1103 self._downloader.trouble(u'ERROR: unable to extract media URL')
1105 mediaURL = urllib.unquote(mobj.group(1))
1106 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1107 mediaURL = mediaURL.replace('\\x26', '\x26')
1109 video_url = mediaURL
1111 mobj = re.search(r'<title>(.*)</title>', webpage)
1113 self._downloader.trouble(u'ERROR: unable to extract title')
1115 video_title = mobj.group(1).decode('utf-8')
1116 video_title = sanitize_title(video_title)
1117 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1119 # Extract video description
1120 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1122 self._downloader.trouble(u'ERROR: unable to extract video description')
1124 video_description = mobj.group(1).decode('utf-8')
1125 if not video_description:
1126 video_description = 'No description available.'
1128 # Extract video thumbnail
1129 if self._downloader.params.get('forcethumbnail', False):
1130 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1132 webpage = urllib2.urlopen(request).read()
1133 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1134 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1136 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1138 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1140 video_thumbnail = mobj.group(1)
1141 else: # we need something to pass to process_info
1142 video_thumbnail = ''
1146 # Process video information
1147 self._downloader.process_info({
1148 'id': video_id.decode('utf-8'),
1149 'url': video_url.decode('utf-8'),
1151 'title': video_title,
1152 'stitle': simple_title,
1153 'ext': video_extension.decode('utf-8'),
1156 except UnavailableFormatError:
1157 self._downloader.trouble(u'ERROR: format not available for video')
# NOTE(review): elided/flattened excerpt -- guard lines ("if mobj is None:",
# "try:", "return") are not all visible. Code kept verbatim; comments added.
class PhotobucketIE(InfoExtractor):
"""Information extractor for photobucket.com."""
# Only direct-flv 'current=...flv' URLs are supported; group(1) is the flv name.
_VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
def __init__(self, downloader=None):
InfoExtractor.__init__(self, downloader)
# suitable(): class-level URL test used by the downloader's dispatch.
return (re.match(PhotobucketIE._VALID_URL, url) is not None)
def report_download_webpage(self, video_id):
"""Report webpage download."""
self._downloader.to_stdout(u'[photobucket] %s: Downloading webpage' % video_id)
def report_extraction(self, video_id):
"""Report information extraction."""
self._downloader.to_stdout(u'[photobucket] %s: Extracting information' % video_id)
def _real_initialize(self):
def _real_extract(self, url):
# Extract id from URL
mobj = re.match(self._VALID_URL, url)
self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
video_id = mobj.group(1)
video_extension = 'flv'
# Retrieve video webpage to extract further information
request = urllib2.Request(url)
self.report_download_webpage(video_id)
webpage = urllib2.urlopen(request).read()
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
# Extract URL, uploader, and title from webpage
self.report_extraction(video_id)
# Media URL lives in the page's video_src <link> tag.
mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
self._downloader.trouble(u'ERROR: unable to extract media URL')
mediaURL = urllib.unquote(mobj.group(1))
video_url = mediaURL
# <title> carries both the video title (group 1) and uploader (group 2).
mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
self._downloader.trouble(u'ERROR: unable to extract title')
video_title = mobj.group(1).decode('utf-8')
video_title = sanitize_title(video_title)
# Filename-safe title: every run of non-alphanumerics becomes '_'.
simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
video_uploader = mobj.group(2).decode('utf-8')
# Process video information
self._downloader.process_info({
'id': video_id.decode('utf-8'),
'url': video_url.decode('utf-8'),
'uploader': video_uploader,
'title': video_title,
'stitle': simple_title,
'ext': video_extension.decode('utf-8'),
except UnavailableFormatError:
self._downloader.trouble(u'ERROR: format not available for video')
1238 class YahooIE(InfoExtractor):
1239 """Information extractor for video.yahoo.com."""
1241 # _VALID_URL matches all Yahoo! Video URLs
1242 # _VPAGE_URL matches only the extractable '/watch/' URLs
1243 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1244 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1246 def __init__(self, downloader=None):
1247 InfoExtractor.__init__(self, downloader)
1251 return (re.match(YahooIE._VALID_URL, url) is not None)
1253 def report_download_webpage(self, video_id):
1254 """Report webpage download."""
1255 self._downloader.to_stdout(u'[video.yahoo] %s: Downloading webpage' % video_id)
1257 def report_extraction(self, video_id):
1258 """Report information extraction."""
1259 self._downloader.to_stdout(u'[video.yahoo] %s: Extracting information' % video_id)
1261 def _real_initialize(self):
1264 def _real_extract(self, url):
1265 # Extract ID from URL
1266 mobj = re.match(self._VALID_URL, url)
1268 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1271 video_id = mobj.group(2)
1272 video_extension = 'flv'
1274 # Rewrite valid but non-extractable URLs as
1275 # extractable English language /watch/ URLs
1276 if re.match(self._VPAGE_URL, url) is None:
1277 request = urllib2.Request(url)
1279 webpage = urllib2.urlopen(request).read()
1280 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1281 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1284 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1286 self._downloader.trouble(u'ERROR: Unable to extract id field')
1288 yahoo_id = mobj.group(1)
1290 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1292 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1294 yahoo_vid = mobj.group(1)
1296 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1297 return self._real_extract(url)
1299 # Retrieve video webpage to extract further information
1300 request = urllib2.Request(url)
1302 self.report_download_webpage(video_id)
1303 webpage = urllib2.urlopen(request).read()
1304 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1305 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1308 # Extract uploader and title from webpage
1309 self.report_extraction(video_id)
1310 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1312 self._downloader.trouble(u'ERROR: unable to extract video title')
1314 video_title = mobj.group(1).decode('utf-8')
1315 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1317 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1319 self._downloader.trouble(u'ERROR: unable to extract video uploader')
1321 video_uploader = mobj.group(1).decode('utf-8')
1323 # Extract video thumbnail
1324 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1326 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1328 video_thumbnail = mobj.group(1).decode('utf-8')
1330 # Extract video description
1331 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1333 self._downloader.trouble(u'ERROR: unable to extract video description')
1335 video_description = mobj.group(1).decode('utf-8')
1336 if not video_description: video_description = 'No description available.'
1338 # Extract video height and width
1339 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1341 self._downloader.trouble(u'ERROR: unable to extract video height')
1343 yv_video_height = mobj.group(1)
1345 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1347 self._downloader.trouble(u'ERROR: unable to extract video width')
1349 yv_video_width = mobj.group(1)
1351 # Retrieve video playlist to extract media URL
1352 # I'm not completely sure what all these options are, but we
1353 # seem to need most of them, otherwise the server sends a 401.
1354 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1355 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1356 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1357 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1358 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1360 self.report_download_webpage(video_id)
1361 webpage = urllib2.urlopen(request).read()
1362 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1363 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1366 # Extract media URL from playlist XML
1367 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1369 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1371 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1372 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1375 # Process video information
1376 self._downloader.process_info({
1377 'id': video_id.decode('utf-8'),
1379 'uploader': video_uploader,
1380 'title': video_title,
1381 'stitle': simple_title,
1382 'ext': video_extension.decode('utf-8'),
1383 'thumbnail': video_thumbnail.decode('utf-8'),
1384 'description': video_description,
1385 'thumbnail': video_thumbnail,
1386 'description': video_description,
1388 except UnavailableFormatError:
1389 self._downloader.trouble(u'ERROR: format not available for video')
1392 class GenericIE(InfoExtractor):
1393 """Generic last-resort information extractor."""
1395 def __init__(self, downloader=None):
1396 InfoExtractor.__init__(self, downloader)
1402 def report_download_webpage(self, video_id):
1403 """Report webpage download."""
1404 self._downloader.to_stdout(u'WARNING: Falling back on generic information extractor.')
1405 self._downloader.to_stdout(u'[generic] %s: Downloading webpage' % video_id)
1407 def report_extraction(self, video_id):
1408 """Report information extraction."""
1409 self._downloader.to_stdout(u'[generic] %s: Extracting information' % video_id)
1411 def _real_initialize(self):
1414 def _real_extract(self, url):
1415 video_id = url.split('/')[-1]
1416 request = urllib2.Request(url)
1418 self.report_download_webpage(video_id)
1419 webpage = urllib2.urlopen(request).read()
1420 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1421 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1423 except ValueError, err:
1424 # since this is the last-resort InfoExtractor, if
1425 # this error is thrown, it'll be thrown here
1426 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1429 # Start with something easy: JW Player in SWFObject
1430 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1432 # Broaden the search a little bit
1433 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1435 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1438 # It's possible that one of the regexes
1439 # matched, but returned an empty group:
1440 if mobj.group(1) is None:
1441 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1444 video_url = urllib.unquote(mobj.group(1))
1445 video_id = os.path.basename(video_url)
1447 # here's a fun little line of code for you:
1448 video_extension = os.path.splitext(video_id)[1][1:]
1449 video_id = os.path.splitext(video_id)[0]
1451 # it's tempting to parse this further, but you would
1452 # have to take into account all the variations like
1453 # Video Title - Site Name
1454 # Site Name | Video Title
1455 # Video Title - Tagline | Site Name
1456 # and so on and so forth; it's just not practical
1457 mobj = re.search(r'<title>(.*)</title>', webpage)
1459 self._downloader.trouble(u'ERROR: unable to extract title')
1461 video_title = mobj.group(1).decode('utf-8')
1462 video_title = sanitize_title(video_title)
1463 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1465 # video uploader is domain name
1466 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1468 self._downloader.trouble(u'ERROR: unable to extract title')
1470 video_uploader = mobj.group(1).decode('utf-8')
1473 # Process video information
1474 self._downloader.process_info({
1475 'id': video_id.decode('utf-8'),
1476 'url': video_url.decode('utf-8'),
1477 'uploader': video_uploader,
1478 'title': video_title,
1479 'stitle': simple_title,
1480 'ext': video_extension.decode('utf-8'),
1483 except UnavailableFormatError:
1484 self._downloader.trouble(u'ERROR: format not available for video')
# NOTE(review): elided/flattened excerpt -- "if mobj is None:", "try:",
# loop headers and "return" lines from the full file are not all visible.
# Code kept verbatim; comments added.
class YoutubeSearchIE(InfoExtractor):
"""Information Extractor for YouTube search queries."""
# Queries look like 'ytsearch:foo', 'ytsearch5:foo' or 'ytsearchall:foo'.
_VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
_TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
_VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
_MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
_max_youtube_results = 1000
def __init__(self, youtube_ie, downloader=None):
InfoExtractor.__init__(self, downloader)
# Delegates the actual per-video extraction to a YoutubeIE instance.
self._youtube_ie = youtube_ie
# suitable(): class-level query test used by the downloader's dispatch.
return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
def report_download_page(self, query, pagenum):
"""Report attempt to download playlist page with given number."""
query = query.decode(preferredencoding())
self._downloader.to_stdout(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
def _real_initialize(self):
self._youtube_ie.initialize()
def _real_extract(self, query):
mobj = re.match(self._VALID_QUERY, query)
self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
# Split 'ytsearchN' prefix from the search terms.
prefix, query = query.split(':')
query = query.encode('utf-8')
# Empty prefix => default to a single result.
self._download_n_results(query, 1)
elif prefix == 'all':
self._download_n_results(query, self._max_youtube_results)
self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
elif n > self._max_youtube_results:
self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
n = self._max_youtube_results
self._download_n_results(query, n)
except ValueError: # parsing prefix as integer fails
self._download_n_results(query, 1)
def _download_n_results(self, query, n):
"""Downloads a specified number of results for a query"""
already_seen = set()
self.report_download_page(query, pagenum)
result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
request = urllib2.Request(result_url, None, std_headers)
page = urllib2.urlopen(request).read()
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
# Extract video identifiers
for mobj in re.finditer(self._VIDEO_INDICATOR, page):
# Slice the matched href and peel the id out of 'watch?v=ID"'.
video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
if video_id not in already_seen:
video_ids.append(video_id)
already_seen.add(video_id)
if len(video_ids) == n:
# Specified n videos reached
for id in video_ids:
self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
# No "Next" link => last results page; flush what we collected.
if re.search(self._MORE_PAGES_INDICATOR, page) is None:
for id in video_ids:
self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
pagenum = pagenum + 1
# NOTE(review): elided/flattened excerpt -- guard/loop/"return" lines from the
# full file are not all visible. Code kept verbatim; comments added.
class GoogleSearchIE(InfoExtractor):
"""Information Extractor for Google Video search queries."""
# Queries look like 'gvsearch:foo', 'gvsearch5:foo' or 'gvsearchall:foo'.
_VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
# NOTE(review): 'start=%s' is fed the page counter below -- if Google expects
# a result OFFSET rather than a page number, pagination may be off; confirm.
_TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
_VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
_MORE_PAGES_INDICATOR = r'<span>Next</span>'
_max_google_results = 1000
def __init__(self, google_ie, downloader=None):
InfoExtractor.__init__(self, downloader)
# Delegates the actual per-video extraction to a GoogleIE instance.
self._google_ie = google_ie
# suitable(): class-level query test used by the downloader's dispatch.
return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
def report_download_page(self, query, pagenum):
"""Report attempt to download playlist page with given number."""
query = query.decode(preferredencoding())
self._downloader.to_stdout(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
def _real_initialize(self):
self._google_ie.initialize()
def _real_extract(self, query):
mobj = re.match(self._VALID_QUERY, query)
self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
# Split 'gvsearchN' prefix from the search terms.
prefix, query = query.split(':')
query = query.encode('utf-8')
# Empty prefix => default to a single result.
self._download_n_results(query, 1)
elif prefix == 'all':
self._download_n_results(query, self._max_google_results)
self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
elif n > self._max_google_results:
self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
n = self._max_google_results
self._download_n_results(query, n)
except ValueError: # parsing prefix as integer fails
self._download_n_results(query, 1)
def _download_n_results(self, query, n):
"""Downloads a specified number of results for a query"""
already_seen = set()
self.report_download_page(query, pagenum)
result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
request = urllib2.Request(result_url, None, std_headers)
page = urllib2.urlopen(request).read()
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
# Extract video identifiers
for mobj in re.finditer(self._VIDEO_INDICATOR, page):
video_id = mobj.group(1)
if video_id not in already_seen:
video_ids.append(video_id)
already_seen.add(video_id)
if len(video_ids) == n:
# Specified n videos reached
for id in video_ids:
self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
# No "Next" link => last results page; flush what we collected.
if re.search(self._MORE_PAGES_INDICATOR, page) is None:
for id in video_ids:
self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
pagenum = pagenum + 1
# NOTE(review): elided/flattened excerpt -- guard/loop/"return" lines from the
# full file are not all visible. Code kept verbatim; comments added.
class YahooSearchIE(InfoExtractor):
"""Information Extractor for Yahoo! Video search queries."""
# Queries look like 'yvsearch:foo', 'yvsearch5:foo' or 'yvsearchall:foo'.
_VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
# NOTE(review): 'o=%s' is fed the page counter below -- if Yahoo expects a
# result OFFSET rather than a page number, pagination may be off; confirm.
_TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
# Captures the 'id/vid' pair used to build /watch/ URLs.
_VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
_MORE_PAGES_INDICATOR = r'\s*Next'
_max_yahoo_results = 1000
def __init__(self, yahoo_ie, downloader=None):
InfoExtractor.__init__(self, downloader)
# Delegates the actual per-video extraction to a YahooIE instance.
self._yahoo_ie = yahoo_ie
# suitable(): class-level query test used by the downloader's dispatch.
return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
def report_download_page(self, query, pagenum):
"""Report attempt to download playlist page with given number."""
query = query.decode(preferredencoding())
self._downloader.to_stdout(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
def _real_initialize(self):
self._yahoo_ie.initialize()
def _real_extract(self, query):
mobj = re.match(self._VALID_QUERY, query)
self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
# Split 'yvsearchN' prefix from the search terms.
prefix, query = query.split(':')
query = query.encode('utf-8')
# Empty prefix => default to a single result.
self._download_n_results(query, 1)
elif prefix == 'all':
self._download_n_results(query, self._max_yahoo_results)
self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
elif n > self._max_yahoo_results:
self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
n = self._max_yahoo_results
self._download_n_results(query, n)
except ValueError: # parsing prefix as integer fails
self._download_n_results(query, 1)
def _download_n_results(self, query, n):
"""Downloads a specified number of results for a query"""
already_seen = set()
self.report_download_page(query, pagenum)
result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
request = urllib2.Request(result_url, None, std_headers)
page = urllib2.urlopen(request).read()
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
# Extract video identifiers
for mobj in re.finditer(self._VIDEO_INDICATOR, page):
video_id = mobj.group(1)
if video_id not in already_seen:
video_ids.append(video_id)
already_seen.add(video_id)
if len(video_ids) == n:
# Specified n videos reached
for id in video_ids:
self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
# No "Next" link => last results page; flush what we collected.
if re.search(self._MORE_PAGES_INDICATOR, page) is None:
for id in video_ids:
self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
pagenum = pagenum + 1
1760 class YoutubePlaylistIE(InfoExtractor):
1761 """Information Extractor for YouTube playlists."""
1763 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists)\?.*?p=|user/.*?/user/)([^&]+).*'
1764 _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
1765 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
1766 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1769 def __init__(self, youtube_ie, downloader=None):
1770 InfoExtractor.__init__(self, downloader)
1771 self._youtube_ie = youtube_ie
1775 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
1777 def report_download_page(self, playlist_id, pagenum):
1778 """Report attempt to download playlist page with given number."""
1779 self._downloader.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1781 def _real_initialize(self):
1782 self._youtube_ie.initialize()
1784 def _real_extract(self, url):
1785 # Extract playlist id
1786 mobj = re.match(self._VALID_URL, url)
1788 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1791 # Download playlist pages
1792 playlist_id = mobj.group(1)
1797 self.report_download_page(playlist_id, pagenum)
1798 request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
1800 page = urllib2.urlopen(request).read()
1801 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1802 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1805 # Extract video identifiers
1807 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1808 if mobj.group(1) not in ids_in_page:
1809 ids_in_page.append(mobj.group(1))
1810 video_ids.extend(ids_in_page)
1812 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1814 pagenum = pagenum + 1
1816 for id in video_ids:
1817 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1820 class YoutubeUserIE(InfoExtractor):
1821 """Information Extractor for YouTube users."""
1823 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)'
1824 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1825 _VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this.
1828 def __init__(self, youtube_ie, downloader=None):
1829 InfoExtractor.__init__(self, downloader)
1830 self._youtube_ie = youtube_ie
1834 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
1836 def report_download_page(self, username):
1837 """Report attempt to download user page."""
1838 self._downloader.to_stdout(u'[youtube] user %s: Downloading page ' % (username))
1840 def _real_initialize(self):
1841 self._youtube_ie.initialize()
1843 def _real_extract(self, url):
1845 mobj = re.match(self._VALID_URL, url)
1847 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1850 # Download user page
1851 username = mobj.group(1)
1855 self.report_download_page(username)
1856 request = urllib2.Request(self._TEMPLATE_URL % (username), None, std_headers)
1858 page = urllib2.urlopen(request).read()
1859 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1860 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1863 # Extract video identifiers
1866 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1867 if mobj.group(1) not in ids_in_page:
1868 ids_in_page.append(mobj.group(1))
1869 video_ids.extend(ids_in_page)
1871 for id in video_ids:
1872 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
# NOTE(review): the class and method docstrings below continue on lines elided
# from this excerpt (their closing quotes are not visible). Code kept verbatim.
class PostProcessor(object):
"""Post Processor class.
PostProcessor objects can be added to downloaders with their
add_post_processor() method. When the downloader has finished a
successful download, it will take its internal chain of PostProcessors
and start calling the run() method on each one of them, first with
an initial argument and then with the returned value of the previous
The chain will be stopped if one of them ever returns None or the end
of the chain is reached.
PostProcessor objects follow a "mutual registration" process similar
to InfoExtractor objects.
def __init__(self, downloader=None):
# _downloader may be None until set_downloader() is called.
self._downloader = downloader
def set_downloader(self, downloader):
"""Sets the downloader for this PP."""
self._downloader = downloader
def run(self, information):
"""Run the PostProcessor.
The "information" argument is a dictionary like the ones
composed by InfoExtractors. The only difference is that this
one has an extra field called "filepath" that points to the
When this method returns None, the postprocessing chain is
stopped. However, this method may return an information
dictionary that will be passed to the next postprocessing
object in the chain. It can be the one it received after
changing some fields.
In addition, this method may raise a PostProcessingError
exception that will be taken into account by the downloader
# Base implementation is a no-op pass-through; subclasses override.
return information # by default, do nothing
1921 ### MAIN PROGRAM ###
1922 if __name__ == '__main__':
1924 # Modules needed only when running the main program
1928 # Function to update the program file with the latest version from bitbucket.org
1929 def update_self(downloader, filename):
1930 # Note: downloader only used for options
1931 if not os.access (filename, os.W_OK):
1932 sys.exit('ERROR: no write permissions on %s' % filename)
1934 downloader.to_stdout('Updating to latest stable version...')
1935 latest_url = 'http://bitbucket.org/rg3/youtube-dl/raw/tip/LATEST_VERSION'
1936 latest_version = urllib.urlopen(latest_url).read().strip()
1937 prog_url = 'http://bitbucket.org/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
1938 newcontent = urllib.urlopen(prog_url).read()
1939 stream = open(filename, 'w')
1940 stream.write(newcontent)
1942 downloader.to_stdout('Updated to version %s' % latest_version)
1944 # General configuration
1945 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
1946 urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
1947 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
1949 # Parse command line
1950 parser = optparse.OptionParser(
1951 usage='Usage: %prog [options] url...',
1952 version='2010.04.04',
1953 conflict_handler='resolve',
1956 parser.add_option('-h', '--help',
1957 action='help', help='print this help text and exit')
1958 parser.add_option('-v', '--version',
1959 action='version', help='print program version and exit')
1960 parser.add_option('-U', '--update',
1961 action='store_true', dest='update_self', help='update this program to latest stable version')
1962 parser.add_option('-i', '--ignore-errors',
1963 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
1964 parser.add_option('-r', '--rate-limit',
1965 dest='ratelimit', metavar='L', help='download rate limit (e.g. 50k or 44.6m)')
# ---- Command-line option groups, registered on the optparse `parser`
# ---- object (constructed on lines above this chunk).

# Authentication: explicit -u/-p credentials, or -n to use .netrc data.
authentication = optparse.OptionGroup(parser, 'Authentication Options')
authentication.add_option('-u', '--username',
		dest='username', metavar='UN', help='account username')
authentication.add_option('-p', '--password',
		dest='password', metavar='PW', help='account password')
authentication.add_option('-n', '--netrc',
		action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
parser.add_option_group(authentication)

# Video format: -b/-m/-d/--all-formats are store_const aliases that all
# write a sentinel string into the same dest ('format'):
# '0' = best quality, '17' = mobile, '22' = high definition,
# '-1' = download every available format.
video_format = optparse.OptionGroup(parser, 'Video Format Options')
video_format.add_option('-f', '--format',
		action='store', dest='format', metavar='FMT', help='video format code')
video_format.add_option('-b', '--best-quality',
		action='store_const', dest='format', help='download the best quality video possible', const='0')
video_format.add_option('-m', '--mobile-version',
		action='store_const', dest='format', help='alias for -f 17', const='17')
video_format.add_option('-d', '--high-def',
		action='store_const', dest='format', help='alias for -f 22', const='22')
video_format.add_option('--all-formats',
		action='store_const', dest='format', help='download all available video formats', const='-1')
parser.add_option_group(video_format)

# Verbosity / simulation: each --get-* flag prints one piece of metadata;
# the FileDownloader parameters built later combine them so that any
# --get-* flag also implies quiet and simulate.
verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
verbosity.add_option('-q', '--quiet',
		action='store_true', dest='quiet', help='activates quiet mode', default=False)
verbosity.add_option('-s', '--simulate',
		action='store_true', dest='simulate', help='do not download video', default=False)
verbosity.add_option('-g', '--get-url',
		action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
verbosity.add_option('-e', '--get-title',
		action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
verbosity.add_option('--get-thumbnail',
		action='store_true', dest='getthumbnail', help='simulate, quiet but print thumbnail URL', default=False)
verbosity.add_option('--get-description',
		action='store_true', dest='getdescription', help='simulate, quiet but print video description', default=False)
verbosity.add_option('--no-progress',
		action='store_true', dest='noprogress', help='do not print progress bar', default=False)
parser.add_option_group(verbosity)

# Filesystem: output naming (-t/-l/-o conflicts are rejected after
# parsing), batch input file, and overwrite/resume behaviour.
filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
filesystem.add_option('-t', '--title',
		action='store_true', dest='usetitle', help='use title in file name', default=False)
filesystem.add_option('-l', '--literal',
		action='store_true', dest='useliteral', help='use literal title in file name', default=False)
filesystem.add_option('-o', '--output',
		dest='outtmpl', metavar='TPL', help='output filename template')
filesystem.add_option('-a', '--batch-file',
		dest='batchfile', metavar='F', help='file containing URLs to download')
filesystem.add_option('-w', '--no-overwrites',
		action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
filesystem.add_option('-c', '--continue',
		action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
parser.add_option_group(filesystem)
(opts, args) = parser.parse_args()

# Batch file verification: read one URL per line, strip whitespace and
# drop empty lines, then prepend to the positional URLs.
# NOTE(review): the try:/except IOError: lines wrapping this read are
# elided from this view -- the bare sys.exit below is the error branch
# of that handler; confirm against the full file before editing.
if opts.batchfile is not None:
batchurls = open(opts.batchfile, 'r').readlines()
batchurls = [x.strip() for x in batchurls]
batchurls = [x for x in batchurls if len(x) > 0]
sys.exit(u'ERROR: batch file could not be read')
all_urls = batchurls + args

# Conflicting, missing and erroneous options
if opts.usenetrc and (opts.username is not None or opts.password is not None):
	parser.error(u'using .netrc conflicts with giving username/password')
if opts.password is not None and opts.username is None:
	parser.error(u'account username missing')
if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
	parser.error(u'using output template conflicts with using title or literal title')
if opts.usetitle and opts.useliteral:
	parser.error(u'using title conflicts with using literal title')
if opts.username is not None and opts.password is None:
	# Username given without password: prompt interactively instead of
	# treating it as an error.
	opts.password = getpass.getpass(u'Type account password and press return:')
if opts.ratelimit is not None:
	# FileDownloader.parse_bytes returns None when the rate-limit string
	# cannot be parsed; store the numeric value back onto opts.
	numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
	if numeric_limit is None:
		parser.error(u'invalid rate limit specified')
	opts.ratelimit = numeric_limit
# Information extractors.  The playlist/user/search extractors and
# MetacafeIE take the shared youtube_ie instance as a constructor
# argument; likewise each search IE wraps its site's IE.
youtube_ie = YoutubeIE()
metacafe_ie = MetacafeIE(youtube_ie)
youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
youtube_user_ie = YoutubeUserIE(youtube_ie)
youtube_search_ie = YoutubeSearchIE(youtube_ie)
google_ie = GoogleIE()
google_search_ie = GoogleSearchIE(google_ie)
photobucket_ie = PhotobucketIE()
yahoo_ie = YahooIE()
yahoo_search_ie = YahooSearchIE(yahoo_ie)
generic_ie = GenericIE()
# File downloader, configured from the parsed options.
# NOTE(review): the closing '})' of this FileDownloader(...) call sits
# on a line elided from this view; confirm against the full file.
fd = FileDownloader({
	'usenetrc': opts.usenetrc,
	'username': opts.username,
	'password': opts.password,
	# Any --get-* flag implies quiet output...
	'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
	'forceurl': opts.geturl,
	'forcetitle': opts.gettitle,
	'forcethumbnail': opts.getthumbnail,
	'forcedescription': opts.getdescription,
	# ...and also implies simulation (no actual download).
	'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
	'format': opts.format,
	# Output filename template: an explicit -o wins (decoded with the
	# locale's preferred encoding, see preferredencoding()); otherwise
	# pick a default that embeds the format code when --all-formats
	# ('-1') is active and the sanitized (-t) or literal (-l) title when
	# requested.  The and/or chain is the pre-ternary Python idiom: the
	# first truthy branch wins.
	# NOTE(review): an explicitly empty template (-o '') is falsy and
	# would fall through to the defaults -- confirm this is acceptable.
	'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
		or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
		or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
		or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
		or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
		or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
		or u'%(id)s.%(ext)s'),
	'ignoreerrors': opts.ignoreerrors,
	'ratelimit': opts.ratelimit,
	'nooverwrites': opts.nooverwrites,
	'continuedl': opts.continue_dl,
	'noprogress': opts.noprogress,
# Register extractors.  NOTE: they appear to be tried in registration
# order (the pre-existing comment below says GenericIE must come last),
# so the more specific YouTube variants precede plain YoutubeIE.
fd.add_info_extractor(youtube_search_ie)
fd.add_info_extractor(youtube_pl_ie)
fd.add_info_extractor(youtube_user_ie)
fd.add_info_extractor(metacafe_ie)
fd.add_info_extractor(youtube_ie)
fd.add_info_extractor(google_ie)
fd.add_info_extractor(google_search_ie)
fd.add_info_extractor(photobucket_ie)
fd.add_info_extractor(yahoo_ie)
fd.add_info_extractor(yahoo_search_ie)

# This must come last since it's the
# fallback if none of the others work
fd.add_info_extractor(generic_ie)
# Dispatch.  NOTE(review): several lines are elided from this view --
# the enclosing try: these except clauses belong to starts above the
# chunk, the DownloadError handler body is missing, and the final exit
# with `retcode` follows below.  Confirm against the full file.
if opts.update_self:
	# Self-update mode: update_self (defined elsewhere) receives the
	# downloader and this script's own path.
	update_self(fd, sys.argv[0])

# Without URLs there is nothing to do, unless --update was the sole
# purpose of the invocation.
if len(all_urls) < 1:
	if not opts.update_self:
		parser.error(u'you must provide at least one URL')

retcode = fd.download(all_urls)

except DownloadError:
except SameFileError:
	# A fixed output name (-o without templating) cannot receive more
	# than one download.
	sys.exit(u'ERROR: fixed output name but more than one file to download')
except KeyboardInterrupt:
	sys.exit(u'\nERROR: Interrupted by user')