2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # Author: Benjamin Johnson
6 # License: Public domain code
23 # parse_qs was moved from the cgi module to the urlparse module recently.
25 from urlparse import parse_qs
27 from cgi import parse_qs
30 'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.2) Gecko/20100115 Firefox/3.6',
31 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
32 'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
33 'Accept-Language': 'en-us,en;q=0.5',
36 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
38 def preferredencoding():
39 """Get preferred encoding.
41 Returns the best encoding scheme for the system, based on
42 locale.getpreferredencoding() and some further tweaks.
44 def yield_preferredencoding():
46 pref = locale.getpreferredencoding()
52 return yield_preferredencoding().next()
54 def htmlentity_transform(matchobj):
55 """Transforms an HTML entity to a Unicode character.
57 This function receives a match object and is intended to be used with
58 the re.sub() function.
60 entity = matchobj.group(1)
62 # Known non-numeric HTML entity
63 if entity in htmlentitydefs.name2codepoint:
64 return unichr(htmlentitydefs.name2codepoint[entity])
67 mobj = re.match(ur'(?u)#(x?\d+)', entity)
69 numstr = mobj.group(1)
70 if numstr.startswith(u'x'):
72 numstr = u'0%s' % numstr
75 return unichr(long(numstr, base))
77 # Unknown entity in name, return its literal representation
78 return (u'&%s;' % entity)
80 def sanitize_title(utitle):
81 """Sanitizes a video title so it could be used as part of a filename."""
82 utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
83 return utitle.replace(unicode(os.sep), u'%')
85 def sanitize_open(filename, open_mode):
86 """Try to open the given filename, and slightly tweak it if this fails.
88 Attempts to open the given filename. If this fails, it tries to change
89 the filename slightly, step by step, until it's either able to open it
90 or it fails and raises a final exception, like the standard open()
93 It returns the tuple (stream, definitive_file_name).
97 return (sys.stdout, filename)
98 stream = open(filename, open_mode)
99 return (stream, filename)
100 except (IOError, OSError), err:
101 # In case of error, try to remove win32 forbidden chars
102 filename = re.sub(ur'[<>:"\|\?\*]', u'#', filename)
104 # An exception here should be caught in the caller
105 stream = open(filename, open_mode)
106 return (stream, filename)
class DownloadError(Exception):
    """Raised when downloading fails.

    FileDownloader objects throw this exception when they are not
    configured to continue on errors; it carries the appropriate
    error message.
    """
    pass
class SameFileError(Exception):
    """Raised when several downloads would collide on one filename.

    FileDownloader objects throw this exception when they detect that
    multiple files would have to be written to the same path on disk.
    """
    pass
class PostProcessingError(Exception):
    """Raised when a postprocessing step fails.

    A PostProcessor's .run() method may raise this exception to signal
    that the postprocessing task went wrong.
    """
    pass
class UnavailableFormatError(Exception):
    """Raised when a video is requested in a format it does not offer."""
    pass
class ContentTooShortError(Exception):
    """Raised when the downloaded data is smaller than announced.

    FileDownloader objects raise this exception when the file they
    fetched is shorter than the size the server reported first, which
    usually indicates the connection was interrupted.
    """

    def __init__(self, downloaded, expected):
        # Both counts are in bytes.
        self.downloaded = downloaded
        self.expected = expected
157 class FileDownloader(object):
158 """File Downloader class.
160 File downloader objects are the ones responsible of downloading the
161 actual video file and writing it to disk if the user has requested
162 it, among some other tasks. In most cases there should be one per
163 program. As, given a video URL, the downloader doesn't know how to
164 extract all the needed information, task that InfoExtractors do, it
165 has to pass the URL to one of them.
167 For this, file downloader objects have a method that allows
168 InfoExtractors to be registered in a given order. When it is passed
169 a URL, the file downloader handles it to the first InfoExtractor it
170 finds that reports being able to handle it. The InfoExtractor extracts
171 all the information about the video or videos the URL refers to, and
172 asks the FileDownloader to process the video information, possibly
173 downloading the video.
175 File downloaders accept a lot of parameters. In order not to saturate
176 the object constructor with arguments, it receives a dictionary of
177 options instead. These options are available through the params
178 attribute for the InfoExtractors to use. The FileDownloader also
179 registers itself as the downloader in charge for the InfoExtractors
180 that are added to it, so this is a "mutual registration".
184 username: Username for authentication purposes.
185 password: Password for authentication purposes.
186 usenetrc: Use netrc for authentication instead.
187 quiet: Do not print messages to stdout.
188 forceurl: Force printing final URL.
189 forcetitle: Force printing title.
190 simulate: Do not download the video files.
191 format: Video format code.
192 outtmpl: Template for output names.
193 ignoreerrors: Do not stop on download errors.
194 ratelimit: Download speed limit, in bytes/sec.
195 nooverwrites: Prevent overwriting files.
196 continuedl: Try to continue downloads if possible.
197 noprogress: Do not print the progress bar.
203 _download_retcode = None
204 _num_downloads = None
206 def __init__(self, params):
207 """Create a FileDownloader object with the given options."""
210 self._download_retcode = 0
211 self._num_downloads = 0
215 def pmkdir(filename):
216 """Create directory components in filename. Similar to Unix "mkdir -p"."""
217 components = filename.split(os.sep)
218 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
219 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
220 for dir in aggregate:
221 if not os.path.exists(dir):
225 def format_bytes(bytes):
228 if type(bytes) is str:
233 exponent = long(math.log(bytes, 1024.0))
234 suffix = 'bkMGTPEZY'[exponent]
235 converted = float(bytes) / float(1024**exponent)
236 return '%.2f%s' % (converted, suffix)
239 def calc_percent(byte_counter, data_len):
242 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
245 def calc_eta(start, now, total, current):
249 if current == 0 or dif < 0.001: # One millisecond
251 rate = float(current) / dif
252 eta = long((float(total) - float(current)) / rate)
253 (eta_mins, eta_secs) = divmod(eta, 60)
256 return '%02d:%02d' % (eta_mins, eta_secs)
259 def calc_speed(start, now, bytes):
261 if bytes == 0 or dif < 0.001: # One millisecond
262 return '%10s' % '---b/s'
263 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
266 def best_block_size(elapsed_time, bytes):
267 new_min = max(bytes / 2.0, 1.0)
268 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
269 if elapsed_time < 0.001:
271 rate = bytes / elapsed_time
279 def parse_bytes(bytestr):
280 """Parse a string indicating a byte quantity into a long integer."""
281 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
284 number = float(matchobj.group(1))
285 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
286 return long(round(number * multiplier))
290 """Verify a URL is valid and data could be downloaded. Return real data URL."""
291 request = urllib2.Request(url, None, std_headers)
292 data = urllib2.urlopen(request)
298 def add_info_extractor(self, ie):
299 """Add an InfoExtractor object to the end of the list."""
301 ie.set_downloader(self)
303 def add_post_processor(self, pp):
304 """Add a PostProcessor object to the end of the chain."""
306 pp.set_downloader(self)
308 def to_stdout(self, message, skip_eol=False, ignore_encoding_errors=False):
309 """Print message to stdout if not in quiet mode."""
311 if not self.params.get('quiet', False):
312 print (u'%s%s' % (message, [u'\n', u''][skip_eol])).encode(preferredencoding()),
314 except (UnicodeEncodeError), err:
315 if not ignore_encoding_errors:
def to_stderr(self, message):
    """Write message (plus a trailing newline) to standard error."""
    # Encode with the locale's preferred encoding so non-ASCII titles
    # survive the terminal.
    encoded = message.encode(preferredencoding())
    print >>sys.stderr, encoded
322 def fixed_template(self):
323 """Checks if the output template is fixed."""
324 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
def trouble(self, message=None):
    """React to a download problem.

    Prints the message (if any) to stderr.  When the downloader is
    configured to ignore errors it only records a non-zero return
    code; otherwise it raises a DownloadError.
    """
    if message is not None:
        self.to_stderr(message)
    if self.params.get('ignoreerrors', False):
        self._download_retcode = 1
    else:
        raise DownloadError(message)
339 def slow_down(self, start_time, byte_counter):
340 """Sleep if the download speed is over the rate limit."""
341 rate_limit = self.params.get('ratelimit', None)
342 if rate_limit is None or byte_counter == 0:
345 elapsed = now - start_time
348 speed = float(byte_counter) / elapsed
349 if speed > rate_limit:
350 time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
def report_destination(self, filename):
    """Announce the file the video will be saved to."""
    msg = u'[download] Destination: %s' % filename
    # Encoding problems in the filename must not abort the download.
    self.to_stdout(msg, ignore_encoding_errors=True)
356 def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
357 """Report download progress."""
358 if self.params.get('noprogress', False):
360 self.to_stdout(u'\r[download] %s of %s at %s ETA %s' %
361 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
def report_resuming_byte(self, resume_len):
    """Report attempt to resume at given byte."""
    self.to_stdout(u'[download] Resuming download at byte %s' % resume_len)
367 def report_file_already_downloaded(self, file_name):
368 """Report file has already been fully downloaded."""
370 self.to_stdout(u'[download] %s has already been downloaded' % file_name)
371 except (UnicodeEncodeError), err:
372 self.to_stdout(u'[download] The file has already been downloaded')
def report_unable_to_resume(self):
    """Announce that the download could not be resumed."""
    message = u'[download] Unable to resume'
    self.to_stdout(message)
378 def report_finish(self):
379 """Report download finished."""
380 if self.params.get('noprogress', False):
381 self.to_stdout(u'[download] Download completed')
385 def process_info(self, info_dict):
386 """Process a single dictionary returned by an InfoExtractor."""
387 # Do nothing else if in simulate mode
388 if self.params.get('simulate', False):
389 # Verify URL if it's an HTTP one
390 if info_dict['url'].startswith('http'):
392 self.verify_url(info_dict['url'].encode('utf-8')).decode('utf-8')
393 except (OSError, IOError, urllib2.URLError, httplib.HTTPException, socket.error), err:
394 raise UnavailableFormatError
397 if self.params.get('forcetitle', False):
398 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
399 if self.params.get('forceurl', False):
400 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
401 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
402 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
403 if self.params.get('forcedescription', False) and 'description' in info_dict:
404 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
409 template_dict = dict(info_dict)
410 template_dict['epoch'] = unicode(long(time.time()))
411 template_dict['ord'] = unicode('%05d' % self._num_downloads)
412 filename = self.params['outtmpl'] % template_dict
413 except (ValueError, KeyError), err:
414 self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
415 if self.params.get('nooverwrites', False) and os.path.exists(filename):
416 self.to_stderr(u'WARNING: file exists: %s; skipping' % filename)
420 self.pmkdir(filename)
421 except (OSError, IOError), err:
422 self.trouble('ERROR: unable to create directories: %s' % str(err))
426 success = self._do_download(filename, info_dict['url'].encode('utf-8'))
427 except (OSError, IOError), err:
428 raise UnavailableFormatError
429 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
430 self.trouble('ERROR: unable to download video data: %s' % str(err))
432 except (ContentTooShortError, ), err:
433 self.trouble('ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
438 self.post_process(filename, info_dict)
439 except (PostProcessingError), err:
440 self.trouble('ERROR: postprocessing: %s' % str(err))
443 def download(self, url_list):
444 """Download a given list of URLs."""
445 if len(url_list) > 1 and self.fixed_template():
446 raise SameFileError(self.params['outtmpl'])
449 suitable_found = False
451 # Go to next InfoExtractor if not suitable
452 if not ie.suitable(url):
455 # Suitable InfoExtractor found
456 suitable_found = True
458 # Extract information from URL and process it
461 # Suitable InfoExtractor had been found; go to next URL
464 if not suitable_found:
465 self.trouble('ERROR: no suitable InfoExtractor: %s' % url)
467 return self._download_retcode
469 def post_process(self, filename, ie_info):
470 """Run the postprocessing chain on the given file."""
472 info['filepath'] = filename
478 def _download_with_rtmpdump(self, filename, url):
479 self.report_destination(filename)
481 # Check for rtmpdump first
483 subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
484 except (OSError, IOError):
485 self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
488 # Download using rtmpdump. rtmpdump returns exit code 2 when
# the connection was interrupted and resuming appears to be
490 # possible. This is part of rtmpdump's normal usage, AFAIK.
491 basic_args = ['rtmpdump', '-q', '-r', url, '-o', filename]
492 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
493 while retval == 2 or retval == 1:
494 self.to_stdout(u'\r[rtmpdump] %s bytes' % os.path.getsize(filename), skip_eol=True)
495 time.sleep(2.0) # This seems to be needed
496 retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
498 self.to_stdout(u'\r[rtmpdump] %s bytes' % os.path.getsize(filename))
501 self.trouble('\nERROR: rtmpdump exited with code %d' % retval)
504 def _do_download(self, filename, url):
505 # Attempt to download using rtmpdump
506 if url.startswith('rtmp'):
507 return self._download_with_rtmpdump(filename, url)
511 basic_request = urllib2.Request(url, None, std_headers)
512 request = urllib2.Request(url, None, std_headers)
514 # Establish possible resume length
515 if os.path.isfile(filename):
516 resume_len = os.path.getsize(filename)
520 # Request parameters in case of being able to resume
521 if self.params.get('continuedl', False) and resume_len != 0:
522 self.report_resuming_byte(resume_len)
523 request.add_header('Range','bytes=%d-' % resume_len)
526 # Establish connection
528 data = urllib2.urlopen(request)
529 except (urllib2.HTTPError, ), err:
530 if err.code != 416: # 416 is 'Requested range not satisfiable'
533 data = urllib2.urlopen(basic_request)
534 content_length = data.info()['Content-Length']
536 if content_length is not None and long(content_length) == resume_len:
537 # Because the file had already been fully downloaded
538 self.report_file_already_downloaded(filename)
541 # Because the server didn't let us
542 self.report_unable_to_resume()
545 data_len = data.info().get('Content-length', None)
546 data_len_str = self.format_bytes(data_len)
553 data_block = data.read(block_size)
555 data_block_len = len(data_block)
556 if data_block_len == 0:
558 byte_counter += data_block_len
560 # Open file just in time
563 (stream, filename) = sanitize_open(filename, open_mode)
564 self.report_destination(filename)
565 self._num_downloads += 1
566 except (OSError, IOError), err:
567 self.trouble('ERROR: unable to open for writing: %s' % str(err))
569 stream.write(data_block)
570 block_size = self.best_block_size(after - before, data_block_len)
573 percent_str = self.calc_percent(byte_counter, data_len)
574 eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
575 speed_str = self.calc_speed(start, time.time(), byte_counter)
576 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
579 self.slow_down(start, byte_counter)
582 if data_len is not None and str(byte_counter) != data_len:
583 raise ContentTooShortError(byte_counter, long(data_len))
586 class InfoExtractor(object):
587 """Information Extractor class.
589 Information extractors are the classes that, given a URL, extract
590 information from the video (or videos) the URL refers to. This
591 information includes the real video URL, the video title and simplified
592 title, author and others. The information is stored in a dictionary
593 which is then passed to the FileDownloader. The FileDownloader
594 processes this information possibly downloading the video to the file
595 system, among other possible outcomes. The dictionaries must include
596 the following fields:
598 id: Video identifier.
599 url: Final video URL.
600 uploader: Nickname of the video uploader.
601 title: Literal title.
602 stitle: Simplified title.
603 ext: Video filename extension.
604 format: Video format.
606 The following fields are optional. Their primary purpose is to allow
607 youtube-dl to serve as the backend for a video search function, such
608 as the one in youtube2mp3. They are only used when their respective
609 forced printing functions are called:
611 thumbnail: Full URL to a video thumbnail image.
612 description: One-line video description.
614 Subclasses of this one should re-define the _real_initialize() and
615 _real_extract() methods, as well as the suitable() static method.
616 Probably, they should also be instantiated and added to the main
623 def __init__(self, downloader=None):
624 """Constructor. Receives an optional downloader."""
626 self.set_downloader(downloader)
630 """Receives a URL and returns True if suitable for this IE."""
633 def initialize(self):
634 """Initializes an instance (authentication, etc)."""
636 self._real_initialize()
639 def extract(self, url):
640 """Extracts URL information and returns it in list of dicts."""
642 return self._real_extract(url)
def set_downloader(self, downloader):
    """Sets the downloader for this IE."""
    # Stored so the report_* helpers and error paths can reach the
    # FileDownloader's output/trouble methods.
    self._downloader = downloader
648 def _real_initialize(self):
649 """Real initialization process. Redefine in subclasses."""
652 def _real_extract(self, url):
653 """Real extraction process. Redefine in subclasses."""
656 class YoutubeIE(InfoExtractor):
657 """Information extractor for youtube.com."""
659 _VALID_URL = r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?[\?#](?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
660 _LANG_URL = r'http://uk.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
661 _LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
662 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
663 _NETRC_MACHINE = 'youtube'
664 _available_formats = ['37', '22', '35', '18', '34', '5', '17', '13', None] # listed in order of priority for -b flag
665 _video_extensions = {
675 return (re.match(YoutubeIE._VALID_URL, url) is not None)
def report_lang(self):
    """Announce the attempt to set the interface language."""
    msg = u'[youtube] Setting language'
    self._downloader.to_stdout(msg)
def report_login(self):
    """Announce the login attempt."""
    msg = u'[youtube] Logging in'
    self._downloader.to_stdout(msg)
def report_age_confirmation(self):
    """Announce the age-confirmation attempt."""
    msg = u'[youtube] Confirming age'
    self._downloader.to_stdout(msg)
def report_video_info_webpage_download(self, video_id):
    """Announce the download of the video info webpage."""
    msg = u'[youtube] %s: Downloading video info webpage' % video_id
    self._downloader.to_stdout(msg)
def report_information_extraction(self, video_id):
    """Announce the start of video information extraction."""
    msg = u'[youtube] %s: Extracting video information' % video_id
    self._downloader.to_stdout(msg)
def report_unavailable_format(self, video_id, format):
    """Report that the requested format is not available for this video."""
    self._downloader.to_stdout(u'[youtube] %s: Format %s not available' % (video_id, format))
def report_rtmp_download(self):
    """Announce that the download will use the RTMP protocol."""
    msg = u'[youtube] RTMP download detected'
    self._downloader.to_stdout(msg)
705 def _real_initialize(self):
706 if self._downloader is None:
711 downloader_params = self._downloader.params
713 # Attempt to use provided username and password or .netrc data
714 if downloader_params.get('username', None) is not None:
715 username = downloader_params['username']
716 password = downloader_params['password']
717 elif downloader_params.get('usenetrc', False):
719 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
724 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
725 except (IOError, netrc.NetrcParseError), err:
726 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
730 request = urllib2.Request(self._LANG_URL, None, std_headers)
733 urllib2.urlopen(request).read()
734 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
735 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
738 # No authentication to be performed
744 'current_form': 'loginForm',
746 'action_login': 'Log In',
747 'username': username,
748 'password': password,
750 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
753 login_results = urllib2.urlopen(request).read()
754 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
755 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
757 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
758 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
764 'action_confirm': 'Confirm',
766 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
768 self.report_age_confirmation()
769 age_results = urllib2.urlopen(request).read()
770 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
771 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
774 def _real_extract(self, url):
775 # Extract video id from URL
776 mobj = re.match(self._VALID_URL, url)
778 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
780 video_id = mobj.group(2)
782 # Downloader parameters
787 if self._downloader is not None:
788 params = self._downloader.params
789 format_param = params.get('format', None)
790 if format_param == '0':
791 format_param = self._available_formats[quality_index]
793 elif format_param == '-1':
794 format_param = self._available_formats[quality_index]
799 video_extension = self._video_extensions.get(format_param, 'flv')
802 self.report_video_info_webpage_download(video_id)
803 for el_type in ['embedded', 'detailpage', 'vevo']:
804 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s&el=%s&ps=default&eurl=&gl=US&hl=en'
805 % (video_id, el_type))
806 request = urllib2.Request(video_info_url, None, std_headers)
808 video_info_webpage = urllib2.urlopen(request).read()
809 video_info = parse_qs(video_info_webpage)
810 if 'token' in video_info:
812 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
813 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
815 self.report_information_extraction(video_id)
818 if 'token' not in video_info:
819 # Attempt to see if YouTube has issued an error message
820 if 'reason' not in video_info:
821 self._downloader.trouble(u'ERROR: unable to extract "t" parameter for unknown reason')
822 stream = open('reportme-ydl-%s.dat' % time.time(), 'wb')
823 stream.write(video_info_webpage)
826 reason = urllib.unquote_plus(video_info['reason'][0])
827 self._downloader.trouble(u'ERROR: YouTube said: %s' % reason.decode('utf-8'))
829 token = urllib.unquote_plus(video_info['token'][0])
830 video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s&eurl=&el=detailpage&ps=default&gl=US&hl=en' % (video_id, token)
831 if format_param is not None:
832 video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
834 # Check possible RTMP download
835 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
836 self.report_rtmp_download()
837 video_real_url = video_info['conn'][0]
840 if 'author' not in video_info:
841 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
843 video_uploader = urllib.unquote_plus(video_info['author'][0])
846 if 'title' not in video_info:
847 self._downloader.trouble(u'ERROR: unable to extract video title')
849 video_title = urllib.unquote_plus(video_info['title'][0])
850 video_title = video_title.decode('utf-8')
851 video_title = sanitize_title(video_title)
854 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
855 simple_title = simple_title.strip(ur'_')
858 if 'thumbnail_url' not in video_info:
859 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
861 else: # don't panic if we can't find it
862 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
864 # get video description
865 video_description = 'No description available.' # we need something to pass to self._downloader
866 # this requires an additional HTTP request and a little
867 # more time, so don't do it unless absolutely necessary
868 if self._downloader.params.get('forcedescription', False):
869 video_page_url = 'http://www.youtube.com/watch?v=' + video_id
870 request = urllib2.Request(video_page_url, None, std_headers)
872 video_page_webpage = urllib2.urlopen(request).read()
873 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_page_webpage)
875 video_description = mobj.group(1)
876 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
877 pass # don't panic if we can't find it
880 # Process video information
881 self._downloader.process_info({
882 'id': video_id.decode('utf-8'),
883 'url': video_real_url.decode('utf-8'),
884 'uploader': video_uploader.decode('utf-8'),
885 'title': video_title,
886 'stitle': simple_title,
887 'ext': video_extension.decode('utf-8'),
888 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
889 'thumbnail': video_thumbnail.decode('utf-8'),
890 'description': video_description.decode('utf-8'),
894 if quality_index == len(self._available_formats):
899 format_param = self._available_formats[quality_index]
903 except UnavailableFormatError, err:
904 if best_quality or all_formats:
905 if quality_index == len(self._available_formats):
906 # I don't ever expect this to happen
908 self._downloader.trouble(u'ERROR: no known formats available for video')
911 self.report_unavailable_format(video_id, format_param)
913 format_param = self._available_formats[quality_index]
916 self._downloader.trouble('ERROR: format not available for video')
920 class MetacafeIE(InfoExtractor):
921 """Information Extractor for metacafe.com."""
923 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
924 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
925 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
def __init__(self, youtube_ie, downloader=None):
    """Keep the YoutubeIE used for yt- videos and register the downloader."""
    # Metacafe pages can embed YouTube videos; those are delegated to
    # this YoutubeIE instance.
    self._youtube_ie = youtube_ie
    InfoExtractor.__init__(self, downloader)
934 return (re.match(MetacafeIE._VALID_URL, url) is not None)
def report_disclaimer(self):
    """Announce retrieval of the disclaimer page."""
    msg = u'[metacafe] Retrieving disclaimer'
    self._downloader.to_stdout(msg)
def report_age_confirmation(self):
    """Announce the age-confirmation attempt."""
    msg = u'[metacafe] Confirming age'
    self._downloader.to_stdout(msg)
def report_download_webpage(self, video_id):
    """Announce the download of the video's webpage."""
    msg = u'[metacafe] %s: Downloading webpage' % video_id
    self._downloader.to_stdout(msg)
def report_extraction(self, video_id):
    """Announce the start of information extraction."""
    msg = u'[metacafe] %s: Extracting information' % video_id
    self._downloader.to_stdout(msg)
952 def _real_initialize(self):
953 # Retrieve disclaimer
954 request = urllib2.Request(self._DISCLAIMER, None, std_headers)
956 self.report_disclaimer()
957 disclaimer = urllib2.urlopen(request).read()
958 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
959 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
965 'submit': "Continue - I'm over 18",
967 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
969 self.report_age_confirmation()
970 disclaimer = urllib2.urlopen(request).read()
971 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
972 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
975 def _real_extract(self, url):
976 # Extract id and simplified title from URL
977 mobj = re.match(self._VALID_URL, url)
979 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
982 video_id = mobj.group(1)
984 # Check if video comes from YouTube
985 mobj2 = re.match(r'^yt-(.*)$', video_id)
986 if mobj2 is not None:
987 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
990 simple_title = mobj.group(2).decode('utf-8')
991 video_extension = 'flv'
993 # Retrieve video webpage to extract further information
994 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
996 self.report_download_webpage(video_id)
997 webpage = urllib2.urlopen(request).read()
998 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
999 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1002 # Extract URL, uploader and title from webpage
1003 self.report_extraction(video_id)
1004 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1006 self._downloader.trouble(u'ERROR: unable to extract media URL')
1008 mediaURL = urllib.unquote(mobj.group(1))
1010 #mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1012 # self._downloader.trouble(u'ERROR: unable to extract gdaKey')
1014 #gdaKey = mobj.group(1)
1016 #video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1018 video_url = mediaURL
1020 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1022 self._downloader.trouble(u'ERROR: unable to extract title')
1024 video_title = mobj.group(1).decode('utf-8')
1025 video_title = sanitize_title(video_title)
1027 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1029 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1031 video_uploader = mobj.group(1)
1034 # Process video information
1035 self._downloader.process_info({
1036 'id': video_id.decode('utf-8'),
1037 'url': video_url.decode('utf-8'),
1038 'uploader': video_uploader.decode('utf-8'),
1039 'title': video_title,
1040 'stitle': simple_title,
1041 'ext': video_extension.decode('utf-8'),
1044 except UnavailableFormatError:
1045 self._downloader.trouble(u'ERROR: format not available for video')
1048 class GoogleIE(InfoExtractor):
1049 """Information extractor for video.google.com."""
1051 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
def __init__(self, downloader=None):
    """Constructor. Receives an optional downloader."""
    InfoExtractor.__init__(self, downloader)
1058 return (re.match(GoogleIE._VALID_URL, url) is not None)
1060 def report_download_webpage(self, video_id):
1061 """Report webpage download."""
1062 self._downloader.to_stdout(u'[video.google] %s: Downloading webpage' % video_id)
1064 def report_extraction(self, video_id):
1065 """Report information extraction."""
1066 self._downloader.to_stdout(u'[video.google] %s: Extracting information' % video_id)
1068 def _real_initialize(self):
1071 def _real_extract(self, url):
1072 # Extract id from URL
1073 mobj = re.match(self._VALID_URL, url)
1075 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1078 video_id = mobj.group(1)
1080 video_extension = 'mp4'
1082 # Retrieve video webpage to extract further information
1083 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1085 self.report_download_webpage(video_id)
1086 webpage = urllib2.urlopen(request).read()
1087 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1088 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1091 # Extract URL, uploader, and title from webpage
1092 self.report_extraction(video_id)
1093 mobj = re.search(r"download_url:'([^']+)'", webpage)
1095 video_extension = 'flv'
1096 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1098 self._downloader.trouble(u'ERROR: unable to extract media URL')
1100 mediaURL = urllib.unquote(mobj.group(1))
1101 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1102 mediaURL = mediaURL.replace('\\x26', '\x26')
1104 video_url = mediaURL
1106 mobj = re.search(r'<title>(.*)</title>', webpage)
1108 self._downloader.trouble(u'ERROR: unable to extract title')
1110 video_title = mobj.group(1).decode('utf-8')
1111 video_title = sanitize_title(video_title)
1112 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1114 # Extract video description
1115 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1117 self._downloader.trouble(u'ERROR: unable to extract video description')
1119 video_description = mobj.group(1).decode('utf-8')
1120 if not video_description:
1121 video_description = 'No description available.'
1123 # Extract video thumbnail
1124 if self._downloader.params.get('forcethumbnail', False):
1125 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1127 webpage = urllib2.urlopen(request).read()
1128 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1129 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1131 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1133 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1135 video_thumbnail = mobj.group(1)
1136 else: # we need something to pass to process_info
1137 video_thumbnail = ''
1141 # Process video information
1142 self._downloader.process_info({
1143 'id': video_id.decode('utf-8'),
1144 'url': video_url.decode('utf-8'),
1146 'title': video_title,
1147 'stitle': simple_title,
1148 'ext': video_extension.decode('utf-8'),
1151 except UnavailableFormatError:
1152 self._downloader.trouble(u'ERROR: format not available for video')
# Extractor for photobucket.com flv links: the video id is the .flv filename
# taken from the 'current=' query parameter, the media URL comes from the
# page's <link rel="video_src"> tag, and title/uploader come from <title>.
# NOTE(review): lossy extraction -- `try:` headers and `if mobj is None:`
# guards are missing from view; code below is left byte-identical.
1155 class PhotobucketIE(InfoExtractor):
1156 """Information extractor for photobucket.com."""
# group(1) captures the .flv filename from the 'current=' query parameter.
1158 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1160 def __init__(self, downloader=None):
1161 InfoExtractor.__init__(self, downloader)
# URL test used by the dispatcher (enclosing `def` not visible here).
1165 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1167 def report_download_webpage(self, video_id):
1168 """Report webpage download."""
1169 self._downloader.to_stdout(u'[photobucket] %s: Downloading webpage' % video_id)
1171 def report_extraction(self, video_id):
1172 """Report information extraction."""
1173 self._downloader.to_stdout(u'[photobucket] %s: Extracting information' % video_id)
1175 def _real_initialize(self):
1178 def _real_extract(self, url):
1179 # Extract id from URL
1180 mobj = re.match(self._VALID_URL, url)
1182 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1185 video_id = mobj.group(1)
# _VALID_URL only accepts .flv, so the extension is fixed.
1187 video_extension = 'flv'
1189 # Retrieve video webpage to extract further information
1190 request = urllib2.Request(url)
1192 self.report_download_webpage(video_id)
1193 webpage = urllib2.urlopen(request).read()
1194 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1195 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1198 # Extract URL, uploader, and title from webpage
1199 self.report_extraction(video_id)
# The playable URL is the 'file=' parameter inside the video_src <link> tag.
1200 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1202 self._downloader.trouble(u'ERROR: unable to extract media URL')
1204 mediaURL = urllib.unquote(mobj.group(1))
1206 video_url = mediaURL
# One regex yields both title (group 1) and uploader (group 2).
1208 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1210 self._downloader.trouble(u'ERROR: unable to extract title')
1212 video_title = mobj.group(1).decode('utf-8')
1213 video_title = sanitize_title(video_title)
# Filesystem-safe title: non-alphanumerics collapsed to '_'.
1214 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1216 video_uploader = mobj.group(2).decode('utf-8')
1219 # Process video information
1220 self._downloader.process_info({
1221 'id': video_id.decode('utf-8'),
1222 'url': video_url.decode('utf-8'),
1223 'uploader': video_uploader,
1224 'title': video_title,
1225 'stitle': simple_title,
1226 'ext': video_extension.decode('utf-8'),
1229 except UnavailableFormatError:
1230 self._downloader.trouble(u'ERROR: format not available for video')
# Extractor for video.yahoo.com: non-/watch/ URLs are first rewritten to the
# canonical /watch/ form (via a recursive _real_extract call), then the watch
# page supplies metadata and a playlist XML request supplies the media URL.
# NOTE(review): lossy extraction -- `try:` headers and `if mobj is None:`
# guards are missing from view; code below is left byte-identical.
1233 class YahooIE(InfoExtractor):
1234 """Information extractor for video.yahoo.com."""
1236 # _VALID_URL matches all Yahoo! Video URLs
1237 # _VPAGE_URL matches only the extractable '/watch/' URLs
1238 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1239 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1241 def __init__(self, downloader=None):
1242 InfoExtractor.__init__(self, downloader)
# URL test used by the dispatcher (enclosing `def` not visible here).
1246 return (re.match(YahooIE._VALID_URL, url) is not None)
1248 def report_download_webpage(self, video_id):
1249 """Report webpage download."""
1250 self._downloader.to_stdout(u'[video.yahoo] %s: Downloading webpage' % video_id)
1252 def report_extraction(self, video_id):
1253 """Report information extraction."""
1254 self._downloader.to_stdout(u'[video.yahoo] %s: Extracting information' % video_id)
1256 def _real_initialize(self):
1259 def _real_extract(self, url):
1260 # Extract ID from URL
1261 mobj = re.match(self._VALID_URL, url)
1263 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
# group(2) is the video id; group(1) is the watch/network page id.
1266 video_id = mobj.group(2)
1267 video_extension = 'flv'
1269 # Rewrite valid but non-extractable URLs as
1270 # extractable English language /watch/ URLs
1271 if re.match(self._VPAGE_URL, url) is None:
1272 request = urllib2.Request(url)
1274 webpage = urllib2.urlopen(request).read()
1275 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1276 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1279 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1281 self._downloader.trouble(u'ERROR: Unable to extract id field')
1283 yahoo_id = mobj.group(1)
1285 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1287 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1289 yahoo_vid = mobj.group(1)
# Recurse once with the canonical /watch/ URL (which matches _VPAGE_URL).
1291 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1292 return self._real_extract(url)
1294 # Retrieve video webpage to extract further information
1295 request = urllib2.Request(url)
1297 self.report_download_webpage(video_id)
1298 webpage = urllib2.urlopen(request).read()
1299 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1300 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1303 # Extract uploader and title from webpage
1304 self.report_extraction(video_id)
1305 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1307 self._downloader.trouble(u'ERROR: unable to extract video title')
1309 video_title = mobj.group(1).decode('utf-8')
# Filesystem-safe title: non-alphanumerics collapsed to '_'.
1310 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1312 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1314 self._downloader.trouble(u'ERROR: unable to extract video uploader')
# NOTE(review): group(1) of the regex above is the literal 'people'/'profile'
# path segment; the uploader name is group(2). This line looks like a bug --
# it records "people"/"profile" as the uploader. Confirm and switch to group(2).
1316 video_uploader = mobj.group(1).decode('utf-8')
1318 # Extract video thumbnail
1319 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1321 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1323 video_thumbnail = mobj.group(1).decode('utf-8')
1325 # Extract video description
1326 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1328 self._downloader.trouble(u'ERROR: unable to extract video description')
1330 video_description = mobj.group(1).decode('utf-8')
1331 if not video_description: video_description = 'No description available.'
1333 # Extract video height and width
# The player page carries the dimensions the playlist request needs below.
1334 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1336 self._downloader.trouble(u'ERROR: unable to extract video height')
1338 yv_video_height = mobj.group(1)
1340 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1342 self._downloader.trouble(u'ERROR: unable to extract video width')
1344 yv_video_width = mobj.group(1)
1346 # Retrieve video playlist to extract media URL
1347 # I'm not completely sure what all these options are, but we
1348 # seem to need most of them, otherwise the server sends a 401.
1349 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1350 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1351 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1352 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1353 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1355 self.report_download_webpage(video_id)
1356 webpage = urllib2.urlopen(request).read()
1357 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1358 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1361 # Extract media URL from playlist XML
# APP is the host part, FULLPATH the absolute path; the '/?' tolerates a
# doubled leading slash in the attribute value.
1362 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1364 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1366 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
# The playlist XML entity-encodes '&' etc.; undo that here.
1367 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1370 # Process video information
1371 self._downloader.process_info({
1372 'id': video_id.decode('utf-8'),
1374 'uploader': video_uploader,
1375 'title': video_title,
1376 'stitle': simple_title,
1377 'ext': video_extension.decode('utf-8'),
# NOTE(review): 'thumbnail' and 'description' appear twice in this dict
# literal; the later entries win, so the .decode('utf-8') versions directly
# below are dead. Drop one pair (keep the decoded one).
1378 'thumbnail': video_thumbnail.decode('utf-8'),
1379 'description': video_description,
1380 'thumbnail': video_thumbnail,
1381 'description': video_description,
1383 except UnavailableFormatError:
1384 self._downloader.trouble(u'ERROR: format not available for video')
# Last-resort extractor: downloads an arbitrary page and scrapes it for a
# 'file='/'source=' style direct media URL (JW Player / SWFObject embeds),
# deriving title from <title> and "uploader" from the URL's domain name.
# NOTE(review): lossy extraction -- `try:` headers, `if mobj is None:` guards
# and `return` lines are missing from view; code below is left byte-identical.
1387 class GenericIE(InfoExtractor):
1388 """Generic last-resort information extractor."""
1390 def __init__(self, downloader=None):
1391 InfoExtractor.__init__(self, downloader)
1397 def report_download_webpage(self, video_id):
1398 """Report webpage download."""
# Generic extraction is unreliable, so loudly warn before trying it.
1399 self._downloader.to_stdout(u'WARNING: Falling back on generic information extractor.')
1400 self._downloader.to_stdout(u'[generic] %s: Downloading webpage' % video_id)
1402 def report_extraction(self, video_id):
1403 """Report information extraction."""
1404 self._downloader.to_stdout(u'[generic] %s: Extracting information' % video_id)
1406 def _real_initialize(self):
1409 def _real_extract(self, url):
# Provisional id: last path component; replaced once the media URL is known.
1410 video_id = url.split('/')[-1]
1411 request = urllib2.Request(url)
1413 self.report_download_webpage(video_id)
1414 webpage = urllib2.urlopen(request).read()
1415 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1416 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1418 except ValueError, err:
1419 # since this is the last-resort InfoExtractor, if
1420 # this error is thrown, it'll be thrown here
1421 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1424 # Start with something easy: JW Player in SWFObject
1425 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1427 # Broaden the search a little bit
1428 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1430 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1433 # It's possible that one of the regexes
1434 # matched, but returned an empty group:
1435 if mobj.group(1) is None:
1436 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1439 video_url = urllib.unquote(mobj.group(1))
1440 video_id = os.path.basename(video_url)
1442 # here's a fun little line of code for you:
# Split the basename into id (stem) and extension (suffix without the dot).
1443 video_extension = os.path.splitext(video_id)[1][1:]
1444 video_id = os.path.splitext(video_id)[0]
1446 # it's tempting to parse this further, but you would
1447 # have to take into account all the variations like
1448 # Video Title - Site Name
1449 # Site Name | Video Title
1450 # Video Title - Tagline | Site Name
1451 # and so on and so forth; it's just not practical
1452 mobj = re.search(r'<title>(.*)</title>', webpage)
1454 self._downloader.trouble(u'ERROR: unable to extract title')
1456 video_title = mobj.group(1).decode('utf-8')
1457 video_title = sanitize_title(video_title)
# Filesystem-safe title: non-alphanumerics collapsed to '_'.
1458 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1460 # video uploader is domain name
1461 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
# NOTE(review): this error message says 'title' but the failure here is the
# uploader/domain match -- should read 'unable to extract uploader'.
1463 self._downloader.trouble(u'ERROR: unable to extract title')
1465 video_uploader = mobj.group(1).decode('utf-8')
1468 # Process video information
1469 self._downloader.process_info({
1470 'id': video_id.decode('utf-8'),
1471 'url': video_url.decode('utf-8'),
1472 'uploader': video_uploader,
1473 'title': video_title,
1474 'stitle': simple_title,
1475 'ext': video_extension.decode('utf-8'),
1478 except UnavailableFormatError:
1479 self._downloader.trouble(u'ERROR: format not available for video')
# Handles 'ytsearch[N|all]:QUERY' pseudo-URLs: scrapes YouTube result pages
# for video ids and delegates each one to the wrapped YoutubeIE instance.
# NOTE(review): lossy extraction -- `try:` headers, guards, `return`s and
# some loop headers are missing from view; code below is left byte-identical.
1482 class YoutubeSearchIE(InfoExtractor):
1483 """Information Extractor for YouTube search queries."""
# 'ytsearch:Q' (first result), 'ytsearchN:Q' (N results), 'ytsearchall:Q'.
1484 _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
1485 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
1486 _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
1487 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
# Cap for 'ytsearchall' and for oversized explicit N.
1489 _max_youtube_results = 1000
1491 def __init__(self, youtube_ie, downloader=None):
1492 InfoExtractor.__init__(self, downloader)
# Mutual registration: actual extraction is delegated to this YoutubeIE.
1493 self._youtube_ie = youtube_ie
1497 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
1499 def report_download_page(self, query, pagenum):
1500 """Report attempt to download playlist page with given number."""
1501 query = query.decode(preferredencoding())
1502 self._downloader.to_stdout(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1504 def _real_initialize(self):
1505 self._youtube_ie.initialize()
1507 def _real_extract(self, query):
1508 mobj = re.match(self._VALID_QUERY, query)
1510 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
# NOTE(review): split(':') would also split on colons inside the query text;
# split(':', 1) would be safer -- TODO confirm queries never contain ':'.
1513 prefix, query = query.split(':')
1515 query = query.encode('utf-8')
1517 self._download_n_results(query, 1)
1519 elif prefix == 'all':
1520 self._download_n_results(query, self._max_youtube_results)
1526 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1528 elif n > self._max_youtube_results:
1529 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1530 n = self._max_youtube_results
1531 self._download_n_results(query, n)
1533 except ValueError: # parsing prefix as integer fails
# Non-numeric prefix falls back to a single result.
1534 self._download_n_results(query, 1)
1537 def _download_n_results(self, query, n):
1538 """Downloads a specified number of results for a query"""
# Dedup set: result pages repeat ids (thumbnail + title links).
1541 already_seen = set()
1545 self.report_download_page(query, pagenum)
1546 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1547 request = urllib2.Request(result_url, None, std_headers)
1549 page = urllib2.urlopen(request).read()
1550 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1551 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1554 # Extract video identifiers
1555 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
# Slice out 'href="/watch?v=ID"' and keep ID: split on '=' -> third piece,
# then drop the trailing quote. Brittle but matches _VIDEO_INDICATOR exactly.
1556 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
1557 if video_id not in already_seen:
1558 video_ids.append(video_id)
1559 already_seen.add(video_id)
1560 if len(video_ids) == n:
1561 # Specified n videos reached
1562 for id in video_ids:
1563 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
# No "Next" link means the last result page; flush whatever was collected.
1566 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1567 for id in video_ids:
1568 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1571 pagenum = pagenum + 1
# Handles 'gvsearch[N|all]:QUERY' pseudo-URLs: scrapes Google Video search
# result pages for docids and delegates each to the wrapped GoogleIE.
# Structure mirrors YoutubeSearchIE above.
# NOTE(review): lossy extraction -- `try:` headers, guards, `return`s and
# some loop headers are missing from view; code below is left byte-identical.
1573 class GoogleSearchIE(InfoExtractor):
1574 """Information Extractor for Google Video search queries."""
1575 _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
# NOTE(review): Google's 'start' parameter is a result offset, yet pagenum is
# substituted into it below -- successive "pages" may overlap. TODO confirm.
1576 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1577 _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
1578 _MORE_PAGES_INDICATOR = r'<span>Next</span>'
1580 _max_google_results = 1000
1582 def __init__(self, google_ie, downloader=None):
1583 InfoExtractor.__init__(self, downloader)
# Mutual registration: actual extraction is delegated to this GoogleIE.
1584 self._google_ie = google_ie
1588 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
1590 def report_download_page(self, query, pagenum):
1591 """Report attempt to download playlist page with given number."""
1592 query = query.decode(preferredencoding())
1593 self._downloader.to_stdout(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1595 def _real_initialize(self):
1596 self._google_ie.initialize()
1598 def _real_extract(self, query):
1599 mobj = re.match(self._VALID_QUERY, query)
1601 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1604 prefix, query = query.split(':')
1606 query = query.encode('utf-8')
1608 self._download_n_results(query, 1)
1610 elif prefix == 'all':
1611 self._download_n_results(query, self._max_google_results)
1617 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1619 elif n > self._max_google_results:
1620 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1621 n = self._max_google_results
1622 self._download_n_results(query, n)
1624 except ValueError: # parsing prefix as integer fails
1625 self._download_n_results(query, 1)
1628 def _download_n_results(self, query, n):
1629 """Downloads a specified number of results for a query"""
# Dedup set: the same docid can appear more than once per result page.
1632 already_seen = set()
1636 self.report_download_page(query, pagenum)
1637 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1638 request = urllib2.Request(result_url, None, std_headers)
1640 page = urllib2.urlopen(request).read()
1641 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1642 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1645 # Extract video identifiers
1646 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1647 video_id = mobj.group(1)
1648 if video_id not in already_seen:
1649 video_ids.append(video_id)
1650 already_seen.add(video_id)
1651 if len(video_ids) == n:
1652 # Specified n videos reached
1653 for id in video_ids:
1654 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
# No "Next" marker: last page reached, flush whatever was collected.
1657 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1658 for id in video_ids:
1659 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1662 pagenum = pagenum + 1
# Handles 'yvsearch[N|all]:QUERY' pseudo-URLs: scrapes Yahoo! Video search
# result pages for 'id/vid' pairs and delegates each to the wrapped YahooIE.
# Structure mirrors YoutubeSearchIE/GoogleSearchIE above.
# NOTE(review): lossy extraction -- `try:` headers, guards, `return`s and
# some loop headers are missing from view; code below is left byte-identical.
1664 class YahooSearchIE(InfoExtractor):
1665 """Information Extractor for Yahoo! Video search queries."""
1666 _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
# NOTE(review): Yahoo's 'o' parameter is substituted with pagenum below --
# confirm it is a page number and not a result offset.
1667 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
# group(1) is the combined 'pageid/videoid' path used to rebuild watch URLs.
1668 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1669 _MORE_PAGES_INDICATOR = r'\s*Next'
1671 _max_yahoo_results = 1000
1673 def __init__(self, yahoo_ie, downloader=None):
1674 InfoExtractor.__init__(self, downloader)
# Mutual registration: actual extraction is delegated to this YahooIE.
1675 self._yahoo_ie = yahoo_ie
1679 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
1681 def report_download_page(self, query, pagenum):
1682 """Report attempt to download playlist page with given number."""
1683 query = query.decode(preferredencoding())
1684 self._downloader.to_stdout(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1686 def _real_initialize(self):
1687 self._yahoo_ie.initialize()
1689 def _real_extract(self, query):
1690 mobj = re.match(self._VALID_QUERY, query)
1692 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1695 prefix, query = query.split(':')
1697 query = query.encode('utf-8')
1699 self._download_n_results(query, 1)
1701 elif prefix == 'all':
1702 self._download_n_results(query, self._max_yahoo_results)
1708 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1710 elif n > self._max_yahoo_results:
1711 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1712 n = self._max_yahoo_results
1713 self._download_n_results(query, n)
1715 except ValueError: # parsing prefix as integer fails
1716 self._download_n_results(query, 1)
1719 def _download_n_results(self, query, n):
1720 """Downloads a specified number of results for a query"""
# Dedup set: the same watch path can appear more than once per result page.
1723 already_seen = set()
1727 self.report_download_page(query, pagenum)
1728 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1729 request = urllib2.Request(result_url, None, std_headers)
1731 page = urllib2.urlopen(request).read()
1732 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1733 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1736 # Extract video identifiers
1737 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1738 video_id = mobj.group(1)
1739 if video_id not in already_seen:
1740 video_ids.append(video_id)
1741 already_seen.add(video_id)
1742 if len(video_ids) == n:
1743 # Specified n videos reached
1744 for id in video_ids:
1745 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
# No "Next" marker: last page reached, flush whatever was collected.
1748 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1749 for id in video_ids:
1750 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1753 pagenum = pagenum + 1
# Expands a YouTube playlist (or user favorites) URL into individual watch
# URLs by paging through view_play_list and delegating each video id to the
# wrapped YoutubeIE.
# NOTE(review): lossy extraction -- `try:` headers, guards, `break`/`return`
# lines and loop headers are missing from view; code is left byte-identical.
1755 class YoutubePlaylistIE(InfoExtractor):
1756 """Information Extractor for YouTube playlists."""
# NOTE(review): the dot in 'youtube.com' is unescaped here, so it matches any
# character (e.g. 'youtubeXcom'); should be r'youtube\.com'. group(1) is the
# playlist id.
1758 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists)\?.*?p=|user/.*?/user/)([^&]+).*'
1759 _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
1760 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
1761 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1764 def __init__(self, youtube_ie, downloader=None):
1765 InfoExtractor.__init__(self, downloader)
# Mutual registration: actual extraction is delegated to this YoutubeIE.
1766 self._youtube_ie = youtube_ie
1770 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
1772 def report_download_page(self, playlist_id, pagenum):
1773 """Report attempt to download playlist page with given number."""
1774 self._downloader.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1776 def _real_initialize(self):
1777 self._youtube_ie.initialize()
1779 def _real_extract(self, url):
1780 # Extract playlist id
1781 mobj = re.match(self._VALID_URL, url)
1783 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1786 # Download playlist pages
1787 playlist_id = mobj.group(1)
1792 self.report_download_page(playlist_id, pagenum)
1793 request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
1795 page = urllib2.urlopen(request).read()
1796 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1797 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1800 # Extract video identifiers
# Per-page dedup list (the page links each video several times), merged into
# the overall video_ids in document order.
1802 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1803 if mobj.group(1) not in ids_in_page:
1804 ids_in_page.append(mobj.group(1))
1805 video_ids.extend(ids_in_page)
# No "Next" link: final playlist page, stop paging.
1807 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1809 pagenum = pagenum + 1
# Delegate each collected id to the YouTube extractor.
1811 for id in video_ids:
1812 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
# Expands a YouTube user URL into individual watch URLs by reading the
# user's GData feed and delegating each video id to the wrapped YoutubeIE.
# NOTE(review): lossy extraction -- `try:` headers, guards and some setup
# lines are missing from view; code below is left byte-identical.
1815 class YoutubeUserIE(InfoExtractor):
1816 """Information Extractor for YouTube users."""
# NOTE(review): unescaped dot in 'youtube.com' (matches any character);
# should be r'youtube\.com'. group(1) is the username.
1818 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)'
1819 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
# Greedy '(.*)' likely grabs more than the bare video id (original author
# flagged it too).
1820 _VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this.
1823 def __init__(self, youtube_ie, downloader=None):
1824 InfoExtractor.__init__(self, downloader)
# Mutual registration: actual extraction is delegated to this YoutubeIE.
1825 self._youtube_ie = youtube_ie
1829 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
1831 def report_download_page(self, username):
1832 """Report attempt to download user page."""
1833 self._downloader.to_stdout(u'[youtube] user %s: Downloading page ' % (username))
1835 def _real_initialize(self):
1836 self._youtube_ie.initialize()
1838 def _real_extract(self, url):
1840 mobj = re.match(self._VALID_URL, url)
1842 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1845 # Download user page
1846 username = mobj.group(1)
1850 self.report_download_page(username)
1851 request = urllib2.Request(self._TEMPLATE_URL % (username), None, std_headers)
1853 page = urllib2.urlopen(request).read()
1854 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1855 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1858 # Extract video identifiers
# Dedup within the feed, preserving first-seen order.
1861 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1862 if mobj.group(1) not in ids_in_page:
1863 ids_in_page.append(mobj.group(1))
1864 video_ids.extend(ids_in_page)
# Delegate each collected id to the YouTube extractor.
1866 for id in video_ids:
1867 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
class PostProcessor(object):
    """Base class for post-processing steps.

    PostProcessor objects can be added to downloaders with their
    add_post_processor() method. When the downloader has finished a
    successful download, it will take its internal chain of PostProcessors
    and start calling the run() method on each one of them, first with
    an initial argument and then with the returned value of the previous
    one.

    The chain will be stopped if one of them ever returns None or the end
    of the chain is reached.

    PostProcessor objects follow a "mutual registration" process similar
    to InfoExtractor objects.
    """

    def __init__(self, downloader=None):
        # Downloader this PP is attached to; may be set later via set_downloader().
        self._downloader = downloader

    def set_downloader(self, downloader):
        """Sets the downloader for this PP."""
        self._downloader = downloader

    def run(self, information):
        """Run the PostProcessor.

        The "information" argument is a dictionary like the ones
        composed by InfoExtractors. The only difference is that this
        one has an extra field called "filepath" that points to the
        downloaded file.

        When this method returns None, the postprocessing chain is
        stopped. However, this method may return an information
        dictionary that will be passed to the next postprocessing
        object in the chain. It can be the one it received after
        changing some fields.

        In addition, this method may raise a PostProcessingError
        exception that will be taken into account by the downloader.
        """
        return information  # by default, do nothing
1916 ### MAIN PROGRAM ###
1917 if __name__ == '__main__':
1919 # Modules needed only when running the main program
1923 # Function to update the program file with the latest version from bitbucket.org
1924 def update_self(downloader, filename):
1925 # Note: downloader only used for options
1926 if not os.access (filename, os.W_OK):
1927 sys.exit('ERROR: no write permissions on %s' % filename)
1929 downloader.to_stdout('Updating to latest stable version...')
1930 latest_url = 'http://bitbucket.org/rg3/youtube-dl/raw/tip/LATEST_VERSION'
1931 latest_version = urllib.urlopen(latest_url).read().strip()
1932 prog_url = 'http://bitbucket.org/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
1933 newcontent = urllib.urlopen(prog_url).read()
1934 stream = open(filename, 'w')
1935 stream.write(newcontent)
1937 downloader.to_stdout('Updated to version %s' % latest_version)
1939 # General configuration
1940 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
1941 urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
1942 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
1944 # Parse command line
1945 parser = optparse.OptionParser(
1946 usage='Usage: %prog [options] url...',
1947 version='2010.04.04',
1948 conflict_handler='resolve',
1951 parser.add_option('-h', '--help',
1952 action='help', help='print this help text and exit')
1953 parser.add_option('-v', '--version',
1954 action='version', help='print program version and exit')
1955 parser.add_option('-U', '--update',
1956 action='store_true', dest='update_self', help='update this program to latest stable version')
1957 parser.add_option('-i', '--ignore-errors',
1958 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
1959 parser.add_option('-r', '--rate-limit',
1960 dest='ratelimit', metavar='L', help='download rate limit (e.g. 50k or 44.6m)')
1962 authentication = optparse.OptionGroup(parser, 'Authentication Options')
1963 authentication.add_option('-u', '--username',
1964 dest='username', metavar='UN', help='account username')
# ---------------------------------------------------------------------------
# Command-line option definitions and main download dispatch (optparse era).
#
# NOTE(review): this chunk is an elided listing — each line is prefixed with
# its original file line number, and the numbering gaps show that some lines
# are not visible here: the try/except presumably wrapping the batch-file
# read (see the bare sys.exit below), the closing '})' of the FileDownloader
# config dict, the 'try:' before fd.download and the final exit with retcode,
# and the body of the 'except DownloadError:' handler. Comments describe only
# what is shown; confirm the elided structure against the full file.
# ---------------------------------------------------------------------------
# Remainder of the "Authentication" option group (group object created above
# this chunk). -n/--netrc is checked for conflicts with -u/-p further down.
1965 authentication.add_option('-p', '--password',
1966 dest='password', metavar='PW', help='account password')
1967 authentication.add_option('-n', '--netrc',
1968 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
1969 parser.add_option_group(authentication)
# "Video Format Options": -b/-m/-d/--all-formats are all aliases that store a
# constant into the same dest ('format') as -f, so the last one given wins.
1971 video_format = optparse.OptionGroup(parser, 'Video Format Options')
1972 video_format.add_option('-f', '--format',
1973 action='store', dest='format', metavar='FMT', help='video format code')
1974 video_format.add_option('-b', '--best-quality',
1975 action='store_const', dest='format', help='download the best quality video possible', const='0')
1976 video_format.add_option('-m', '--mobile-version',
1977 action='store_const', dest='format', help='alias for -f 17', const='17')
1978 video_format.add_option('-d', '--high-def',
1979 action='store_const', dest='format', help='alias for -f 22', const='22')
1980 video_format.add_option('--all-formats',
1981 action='store_const', dest='format', help='download all available video formats', const='-1')
1982 parser.add_option_group(video_format)
# "Verbosity / Simulation Options": every --get-* flag both simulates and
# silences normal output (enforced in the FileDownloader config below).
1984 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
1985 verbosity.add_option('-q', '--quiet',
1986 action='store_true', dest='quiet', help='activates quiet mode', default=False)
1987 verbosity.add_option('-s', '--simulate',
1988 action='store_true', dest='simulate', help='do not download video', default=False)
1989 verbosity.add_option('-g', '--get-url',
1990 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
1991 verbosity.add_option('-e', '--get-title',
1992 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
1993 verbosity.add_option('--get-thumbnail',
1994 action='store_true', dest='getthumbnail', help='simulate, quiet but print thumbnail URL', default=False)
1995 verbosity.add_option('--get-description',
1996 action='store_true', dest='getdescription', help='simulate, quiet but print video description', default=False)
1997 verbosity.add_option('--no-progress',
1998 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
1999 parser.add_option_group(verbosity)
# "Filesystem Options": output naming (-t/-l/-o are mutually constrained,
# validated below), batch input, overwrite and resume behaviour.
2001 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
2002 filesystem.add_option('-t', '--title',
2003 action='store_true', dest='usetitle', help='use title in file name', default=False)
2004 filesystem.add_option('-l', '--literal',
2005 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
2006 filesystem.add_option('-o', '--output',
2007 dest='outtmpl', metavar='TPL', help='output filename template')
2008 filesystem.add_option('-a', '--batch-file',
2009 dest='batchfile', metavar='F', help='file containing URLs to download')
2010 filesystem.add_option('-w', '--no-overwrites',
2011 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
2012 filesystem.add_option('-c', '--continue',
2013 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
2014 parser.add_option_group(filesystem)
2016 (opts, args) = parser.parse_args()
2018 # Batch file verification
# Read one URL per line from the batch file, stripping whitespace and
# dropping empty lines. NOTE(review): the sys.exit below is unguarded in
# this listing — presumably the read sits in a try/except IOError whose
# lines fall in the numbering gap (2021/2025); confirm in the full file.
# The file handle is also never closed explicitly here.
2020 if opts.batchfile is not None:
2022 batchurls = open(opts.batchfile, 'r').readlines()
2023 batchurls = [x.strip() for x in batchurls]
2024 batchurls = [x for x in batchurls if len(x) > 0]
2026 sys.exit(u'ERROR: batch file could not be read')
2027 all_urls = batchurls + args
2029 # Conflicting, missing and erroneous options
# parser.error() prints the message and exits; these checks encode the
# pairwise exclusions between .netrc, explicit credentials, -o, -t and -l.
2030 if opts.usenetrc and (opts.username is not None or opts.password is not None):
2031 parser.error(u'using .netrc conflicts with giving username/password')
2032 if opts.password is not None and opts.username is None:
2033 parser.error(u'account username missing')
2034 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
2035 parser.error(u'using output template conflicts with using title or literal title')
2036 if opts.usetitle and opts.useliteral:
2037 parser.error(u'using title conflicts with using literal title')
# Username without password: prompt interactively (not echoed) via getpass.
2038 if opts.username is not None and opts.password is None:
2039 opts.password = getpass.getpass(u'Type account password and press return:')
# Normalize the human-readable rate limit (e.g. "50k") into a byte count;
# parse_bytes returns None on unparseable input.
2040 if opts.ratelimit is not None:
2041 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
2042 if numeric_limit is None:
2043 parser.error(u'invalid rate limit specified')
2044 opts.ratelimit = numeric_limit
2046 # Information extractors
# The playlist/user/search extractors are constructed around a base
# extractor instance (passed to the constructor) — presumably so they can
# delegate per-video extraction to it; confirm against the IE classes.
2047 youtube_ie = YoutubeIE()
2048 metacafe_ie = MetacafeIE(youtube_ie)
2049 youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
2050 youtube_user_ie = YoutubeUserIE(youtube_ie)
2051 youtube_search_ie = YoutubeSearchIE(youtube_ie)
2052 google_ie = GoogleIE()
2053 google_search_ie = GoogleSearchIE(google_ie)
2054 photobucket_ie = PhotobucketIE()
2055 yahoo_ie = YahooIE()
2056 yahoo_search_ie = YahooSearchIE(yahoo_ie)
2057 generic_ie = GenericIE()
# FileDownloader configuration. Any --get-* flag forces both 'quiet' and
# 'simulate' on. 'outtmpl' uses Python 2 and/or chaining to pick the first
# applicable template: explicit -o (decoded from the locale's preferred
# encoding, cf. preferredencoding() at the top of the file), then the
# --all-formats variants (which embed %(format)s), then title/literal
# variants, then the plain '%(id)s.%(ext)s' fallback.
# NOTE(review): the closing '})' of this dict (original line 2083) falls in
# a numbering gap and is not visible in this listing.
2060 fd = FileDownloader({
2061 'usenetrc': opts.usenetrc,
2062 'username': opts.username,
2063 'password': opts.password,
2064 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
2065 'forceurl': opts.geturl,
2066 'forcetitle': opts.gettitle,
2067 'forcethumbnail': opts.getthumbnail,
2068 'forcedescription': opts.getdescription,
2069 'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
2070 'format': opts.format,
2071 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
2072 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
2073 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
2074 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
2075 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
2076 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
2077 or u'%(id)s.%(ext)s'),
2078 'ignoreerrors': opts.ignoreerrors,
2079 'ratelimit': opts.ratelimit,
2080 'nooverwrites': opts.nooverwrites,
2081 'continuedl': opts.continue_dl,
2082 'noprogress': opts.noprogress,
# Registration order defines matching priority: more specific extractors
# (search, playlist, user) are registered before the plain video ones.
2084 fd.add_info_extractor(youtube_search_ie)
2085 fd.add_info_extractor(youtube_pl_ie)
2086 fd.add_info_extractor(youtube_user_ie)
2087 fd.add_info_extractor(metacafe_ie)
2088 fd.add_info_extractor(youtube_ie)
2089 fd.add_info_extractor(google_ie)
2090 fd.add_info_extractor(google_search_ie)
2091 fd.add_info_extractor(photobucket_ie)
2092 fd.add_info_extractor(yahoo_ie)
2093 fd.add_info_extractor(yahoo_search_ie)
2095 # This must come last since it's the
2096 # fallback if none of the others work
2097 fd.add_info_extractor(generic_ie)
# Optional self-update, keyed off the path this script was invoked as.
2100 if opts.update_self:
2101 update_self(fd, sys.argv[0])
# No URLs is an error unless the invocation was purely --update-self.
2104 if len(all_urls) < 1:
2105 if not opts.update_self:
2106 parser.error(u'you must provide at least one URL')
# Main dispatch. NOTE(review): the enclosing 'try:' and the final exit with
# retcode (original lines ~2108/2110-2111), as well as the DownloadError
# handler body, fall in numbering gaps and are not visible here — the bare
# 'except' lines below belong to that elided try statement.
2109 retcode = fd.download(all_urls)
2112 except DownloadError:
2114 except SameFileError:
2115 sys.exit(u'ERROR: fixed output name but more than one file to download')
2116 except KeyboardInterrupt:
2117 sys.exit(u'\nERROR: Interrupted by user')