2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # Author: Benjamin Johnson
6 # Author: Vasyl' Vavrychuk
7 # License: Public domain code
26 # parse_qs was moved from the cgi module to the urlparse module recently.
28 from urlparse import parse_qs
30 from cgi import parse_qs
33 'User-Agent': 'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.12) Gecko/20101028 Firefox/3.6.12',
34 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
35 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
36 'Accept-Language': 'en-us,en;q=0.5',
# Characters allowed to survive in a "simplified" title: ASCII letters and
# digits, as a unicode string (Python 2: str.decode -> unicode).
simple_title_chars = (string.ascii_letters + string.digits).decode('ascii')
41 def preferredencoding():
42 """Get preferred encoding.
44 Returns the best encoding scheme for the system, based on
45 locale.getpreferredencoding() and some further tweaks.
47 def yield_preferredencoding():
49 pref = locale.getpreferredencoding()
55 return yield_preferredencoding().next()
57 def htmlentity_transform(matchobj):
58 """Transforms an HTML entity to a Unicode character.
60 This function receives a match object and is intended to be used with
61 the re.sub() function.
63 entity = matchobj.group(1)
65 # Known non-numeric HTML entity
66 if entity in htmlentitydefs.name2codepoint:
67 return unichr(htmlentitydefs.name2codepoint[entity])
70 mobj = re.match(ur'(?u)#(x?\d+)', entity)
72 numstr = mobj.group(1)
73 if numstr.startswith(u'x'):
75 numstr = u'0%s' % numstr
78 return unichr(long(numstr, base))
80 # Unknown entity in name, return its literal representation
81 return (u'&%s;' % entity)
83 def sanitize_title(utitle):
84 """Sanitizes a video title so it could be used as part of a filename."""
85 utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
86 return utitle.replace(unicode(os.sep), u'%')
88 def sanitize_open(filename, open_mode):
89 """Try to open the given filename, and slightly tweak it if this fails.
91 Attempts to open the given filename. If this fails, it tries to change
92 the filename slightly, step by step, until it's either able to open it
93 or it fails and raises a final exception, like the standard open()
96 It returns the tuple (stream, definitive_file_name).
100 if sys.platform == 'win32':
102 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
103 return (sys.stdout, filename)
104 stream = open(filename, open_mode)
105 return (stream, filename)
106 except (IOError, OSError), err:
107 # In case of error, try to remove win32 forbidden chars
108 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
110 # An exception here should be caught in the caller
111 stream = open(filename, open_mode)
112 return (stream, filename)
114 class DownloadError(Exception):
115 """Download Error exception.
117 This exception may be thrown by FileDownloader objects if they are not
118 configured to continue on errors. They will contain the appropriate
123 class SameFileError(Exception):
124 """Same File exception.
126 This exception will be thrown by FileDownloader objects if they detect
127 multiple files would have to be downloaded to the same file on disk.
131 class PostProcessingError(Exception):
132 """Post Processing exception.
134 This exception may be raised by PostProcessor's .run() method to
135 indicate an error in the postprocessing task.
139 class UnavailableVideoError(Exception):
140 """Unavailable Format exception.
142 This exception will be thrown when a video is requested
143 in a format that is not available for that video.
147 class ContentTooShortError(Exception):
148 """Content Too Short exception.
150 This exception may be raised by FileDownloader objects when a file they
151 download is too small for what the server announced first, indicating
152 the connection was probably interrupted.
158 def __init__(self, downloaded, expected):
159 self.downloaded = downloaded
160 self.expected = expected
162 class FileDownloader(object):
163 """File Downloader class.
165 File downloader objects are the ones responsible of downloading the
166 actual video file and writing it to disk if the user has requested
167 it, among some other tasks. In most cases there should be one per
168 program. As, given a video URL, the downloader doesn't know how to
169 extract all the needed information, task that InfoExtractors do, it
170 has to pass the URL to one of them.
172 For this, file downloader objects have a method that allows
173 InfoExtractors to be registered in a given order. When it is passed
174 a URL, the file downloader handles it to the first InfoExtractor it
175 finds that reports being able to handle it. The InfoExtractor extracts
176 all the information about the video or videos the URL refers to, and
177 asks the FileDownloader to process the video information, possibly
178 downloading the video.
180 File downloaders accept a lot of parameters. In order not to saturate
181 the object constructor with arguments, it receives a dictionary of
182 options instead. These options are available through the params
183 attribute for the InfoExtractors to use. The FileDownloader also
184 registers itself as the downloader in charge for the InfoExtractors
185 that are added to it, so this is a "mutual registration".
189 username: Username for authentication purposes.
190 password: Password for authentication purposes.
191 usenetrc: Use netrc for authentication instead.
192 quiet: Do not print messages to stdout.
193 forceurl: Force printing final URL.
194 forcetitle: Force printing title.
195 forcethumbnail: Force printing thumbnail URL.
196 forcedescription: Force printing description.
197 simulate: Do not download the video files.
198 format: Video format code.
199 format_limit: Highest quality format to try.
200 outtmpl: Template for output names.
201 ignoreerrors: Do not stop on download errors.
202 ratelimit: Download speed limit, in bytes/sec.
203 nooverwrites: Prevent overwriting files.
204 retries: Number of times to retry for HTTP error 5xx
205 continuedl: Try to continue downloads if possible.
206 noprogress: Do not print the progress bar.
207 playliststart: Playlist item to start at.
208 playlistend: Playlist item to end at.
209 logtostderr: Log messages to stderr instead of stdout.
215 _download_retcode = None
216 _num_downloads = None
219 def __init__(self, params):
220 """Create a FileDownloader object with the given options."""
223 self._download_retcode = 0
224 self._num_downloads = 0
225 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
229 def pmkdir(filename):
230 """Create directory components in filename. Similar to Unix "mkdir -p"."""
231 components = filename.split(os.sep)
232 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
233 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
234 for dir in aggregate:
235 if not os.path.exists(dir):
239 def temp_name(filename):
240 """Returns a temporary filename for the given filename."""
241 if filename == u'-' or (os.path.exists(filename) and not os.path.isfile(filename)):
243 return filename + u'.part'
246 def format_bytes(bytes):
249 if type(bytes) is str:
254 exponent = long(math.log(bytes, 1024.0))
255 suffix = 'bkMGTPEZY'[exponent]
256 converted = float(bytes) / float(1024**exponent)
257 return '%.2f%s' % (converted, suffix)
260 def calc_percent(byte_counter, data_len):
263 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
266 def calc_eta(start, now, total, current):
270 if current == 0 or dif < 0.001: # One millisecond
272 rate = float(current) / dif
273 eta = long((float(total) - float(current)) / rate)
274 (eta_mins, eta_secs) = divmod(eta, 60)
277 return '%02d:%02d' % (eta_mins, eta_secs)
280 def calc_speed(start, now, bytes):
282 if bytes == 0 or dif < 0.001: # One millisecond
283 return '%10s' % '---b/s'
284 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
287 def best_block_size(elapsed_time, bytes):
288 new_min = max(bytes / 2.0, 1.0)
289 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
290 if elapsed_time < 0.001:
292 rate = bytes / elapsed_time
300 def parse_bytes(bytestr):
301 """Parse a string indicating a byte quantity into a long integer."""
302 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
305 number = float(matchobj.group(1))
306 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
307 return long(round(number * multiplier))
309 def add_info_extractor(self, ie):
310 """Add an InfoExtractor object to the end of the list."""
312 ie.set_downloader(self)
314 def add_post_processor(self, pp):
315 """Add a PostProcessor object to the end of the chain."""
317 pp.set_downloader(self)
319 def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
320 """Print message to stdout if not in quiet mode."""
322 if not self.params.get('quiet', False):
323 terminator = [u'\n', u''][skip_eol]
324 print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
325 self._screen_file.flush()
326 except (UnicodeEncodeError), err:
327 if not ignore_encoding_errors:
330 def to_stderr(self, message):
331 """Print message to stderr."""
332 print >>sys.stderr, message.encode(preferredencoding())
334 def fixed_template(self):
335 """Checks if the output template is fixed."""
336 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
338 def trouble(self, message=None):
339 """Determine action to take when a download problem appears.
341 Depending on if the downloader has been configured to ignore
342 download errors or not, this method may throw an exception or
343 not when errors are found, after printing the message.
345 if message is not None:
346 self.to_stderr(message)
347 if not self.params.get('ignoreerrors', False):
348 raise DownloadError(message)
349 self._download_retcode = 1
351 def slow_down(self, start_time, byte_counter):
352 """Sleep if the download speed is over the rate limit."""
353 rate_limit = self.params.get('ratelimit', None)
354 if rate_limit is None or byte_counter == 0:
357 elapsed = now - start_time
360 speed = float(byte_counter) / elapsed
361 if speed > rate_limit:
362 time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
364 def try_rename(self, old_filename, new_filename):
366 if old_filename == new_filename:
368 os.rename(old_filename, new_filename)
369 except (IOError, OSError), err:
370 self.trouble(u'ERROR: unable to rename file')
372 def report_destination(self, filename):
373 """Report destination filename."""
374 self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
376 def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
377 """Report download progress."""
378 if self.params.get('noprogress', False):
380 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
381 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
383 def report_resuming_byte(self, resume_len):
384 """Report attempt to resume at given byte."""
385 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
387 def report_retry(self, count, retries):
388 """Report retry in case of HTTP error 5xx"""
389 self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
391 def report_file_already_downloaded(self, file_name):
392 """Report file has already been fully downloaded."""
394 self.to_screen(u'[download] %s has already been downloaded' % file_name)
395 except (UnicodeEncodeError), err:
396 self.to_screen(u'[download] The file has already been downloaded')
398 def report_unable_to_resume(self):
399 """Report it was impossible to resume download."""
400 self.to_screen(u'[download] Unable to resume')
402 def report_finish(self):
403 """Report download finished."""
404 if self.params.get('noprogress', False):
405 self.to_screen(u'[download] Download completed')
409 def increment_downloads(self):
410 """Increment the ordinal that assigns a number to each file."""
411 self._num_downloads += 1
413 def process_info(self, info_dict):
414 """Process a single dictionary returned by an InfoExtractor."""
415 # Do nothing else if in simulate mode
416 if self.params.get('simulate', False):
418 if self.params.get('forcetitle', False):
419 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
420 if self.params.get('forceurl', False):
421 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
422 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
423 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
424 if self.params.get('forcedescription', False) and 'description' in info_dict:
425 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
430 template_dict = dict(info_dict)
431 template_dict['epoch'] = unicode(long(time.time()))
432 template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
433 filename = self.params['outtmpl'] % template_dict
434 except (ValueError, KeyError), err:
435 self.trouble(u'ERROR: invalid system charset or erroneous output template')
437 if self.params.get('nooverwrites', False) and os.path.exists(filename):
438 self.to_stderr(u'WARNING: file exists and will be skipped')
442 self.pmkdir(filename)
443 except (OSError, IOError), err:
444 self.trouble(u'ERROR: unable to create directories: %s' % str(err))
448 success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
449 except (OSError, IOError), err:
450 raise UnavailableVideoError
451 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
452 self.trouble(u'ERROR: unable to download video data: %s' % str(err))
454 except (ContentTooShortError, ), err:
455 self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
460 self.post_process(filename, info_dict)
461 except (PostProcessingError), err:
462 self.trouble(u'ERROR: postprocessing: %s' % str(err))
465 def download(self, url_list):
466 """Download a given list of URLs."""
467 if len(url_list) > 1 and self.fixed_template():
468 raise SameFileError(self.params['outtmpl'])
471 suitable_found = False
473 # Go to next InfoExtractor if not suitable
474 if not ie.suitable(url):
477 # Suitable InfoExtractor found
478 suitable_found = True
480 # Extract information from URL and process it
483 # Suitable InfoExtractor had been found; go to next URL
486 if not suitable_found:
487 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
489 return self._download_retcode
491 def post_process(self, filename, ie_info):
492 """Run the postprocessing chain on the given file."""
494 info['filepath'] = filename
500 def _download_with_rtmpdump(self, filename, url, player_url):
501 self.report_destination(filename)
502 tmpfilename = self.temp_name(filename)
504 # Check for rtmpdump first
506 subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
507 except (OSError, IOError):
508 self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
511 # Download using rtmpdump. rtmpdump returns exit code 2 when
512 # the connection was interrumpted and resuming appears to be
513 # possible. This is part of rtmpdump's normal usage, AFAIK.
514 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
515 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
516 while retval == 2 or retval == 1:
517 prevsize = os.path.getsize(tmpfilename)
518 self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
519 time.sleep(5.0) # This seems to be needed
520 retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
521 cursize = os.path.getsize(tmpfilename)
522 if prevsize == cursize and retval == 1:
525 self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
526 self.try_rename(tmpfilename, filename)
529 self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
532 def _do_download(self, filename, url, player_url):
533 # Check file already present
534 if self.params.get('continuedl', False) and os.path.isfile(filename):
535 self.report_file_already_downloaded(filename)
538 # Attempt to download using rtmpdump
539 if url.startswith('rtmp'):
540 return self._download_with_rtmpdump(filename, url, player_url)
542 tmpfilename = self.temp_name(filename)
545 basic_request = urllib2.Request(url, None, std_headers)
546 request = urllib2.Request(url, None, std_headers)
548 # Establish possible resume length
549 if os.path.isfile(tmpfilename):
550 resume_len = os.path.getsize(tmpfilename)
554 # Request parameters in case of being able to resume
555 if self.params.get('continuedl', False) and resume_len != 0:
556 self.report_resuming_byte(resume_len)
557 request.add_header('Range','bytes=%d-' % resume_len)
561 retries = self.params.get('retries', 0)
562 while count <= retries:
563 # Establish connection
565 data = urllib2.urlopen(request)
567 except (urllib2.HTTPError, ), err:
568 if (err.code < 500 or err.code >= 600) and err.code != 416:
569 # Unexpected HTTP error
571 elif err.code == 416:
572 # Unable to resume (requested range not satisfiable)
574 # Open the connection again without the range header
575 data = urllib2.urlopen(basic_request)
576 content_length = data.info()['Content-Length']
577 except (urllib2.HTTPError, ), err:
578 if err.code < 500 or err.code >= 600:
581 # Examine the reported length
582 if (content_length is not None and
583 (resume_len - 100 < long(content_length) < resume_len + 100)):
584 # The file had already been fully downloaded.
585 # Explanation to the above condition: in issue #175 it was revealed that
586 # YouTube sometimes adds or removes a few bytes from the end of the file,
587 # changing the file size slightly and causing problems for some users. So
588 # I decided to implement a suggested change and consider the file
589 # completely downloaded if the file size differs less than 100 bytes from
590 # the one in the hard drive.
591 self.report_file_already_downloaded(filename)
592 self.try_rename(tmpfilename, filename)
595 # The length does not match, we start the download over
596 self.report_unable_to_resume()
602 self.report_retry(count, retries)
605 self.trouble(u'ERROR: giving up after %s retries' % retries)
608 data_len = data.info().get('Content-length', None)
609 if data_len is not None:
610 data_len = long(data_len) + resume_len
611 data_len_str = self.format_bytes(data_len)
612 byte_counter = 0 + resume_len
618 data_block = data.read(block_size)
620 data_block_len = len(data_block)
621 if data_block_len == 0:
623 byte_counter += data_block_len
625 # Open file just in time
628 (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
629 self.report_destination(filename)
630 except (OSError, IOError), err:
631 self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
634 stream.write(data_block)
635 except (IOError, OSError), err:
636 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
638 block_size = self.best_block_size(after - before, data_block_len)
641 percent_str = self.calc_percent(byte_counter, data_len)
642 eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
643 speed_str = self.calc_speed(start, time.time(), byte_counter)
644 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
647 self.slow_down(start, byte_counter)
651 if data_len is not None and str(byte_counter) != data_len:
652 raise ContentTooShortError(byte_counter, long(data_len))
653 self.try_rename(tmpfilename, filename)
656 class InfoExtractor(object):
657 """Information Extractor class.
659 Information extractors are the classes that, given a URL, extract
660 information from the video (or videos) the URL refers to. This
661 information includes the real video URL, the video title and simplified
662 title, author and others. The information is stored in a dictionary
663 which is then passed to the FileDownloader. The FileDownloader
664 processes this information possibly downloading the video to the file
665 system, among other possible outcomes. The dictionaries must include
666 the following fields:
668 id: Video identifier.
669 url: Final video URL.
670 uploader: Nickname of the video uploader.
671 title: Literal title.
672 stitle: Simplified title.
673 ext: Video filename extension.
674 format: Video format.
675 player_url: SWF Player URL (may be None).
677 The following fields are optional. Their primary purpose is to allow
678 youtube-dl to serve as the backend for a video search function, such
679 as the one in youtube2mp3. They are only used when their respective
680 forced printing functions are called:
682 thumbnail: Full URL to a video thumbnail image.
683 description: One-line video description.
685 Subclasses of this one should re-define the _real_initialize() and
686 _real_extract() methods, as well as the suitable() static method.
687 Probably, they should also be instantiated and added to the main
694 def __init__(self, downloader=None):
695 """Constructor. Receives an optional downloader."""
697 self.set_downloader(downloader)
701 """Receives a URL and returns True if suitable for this IE."""
704 def initialize(self):
705 """Initializes an instance (authentication, etc)."""
707 self._real_initialize()
710 def extract(self, url):
711 """Extracts URL information and returns it in list of dicts."""
713 return self._real_extract(url)
715 def set_downloader(self, downloader):
716 """Sets the downloader for this IE."""
717 self._downloader = downloader
719 def _real_initialize(self):
720 """Real initialization process. Redefine in subclasses."""
723 def _real_extract(self, url):
724 """Real extraction process. Redefine in subclasses."""
727 class YoutubeIE(InfoExtractor):
728 """Information extractor for youtube.com."""
730 _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/(?:(?:v/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))))?([0-9A-Za-z_-]+)(?(1).+)?$'
731 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
732 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
733 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
734 _NETRC_MACHINE = 'youtube'
735 # Listed in order of quality
736 _available_formats = ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13']
737 _video_extensions = {
743 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
750 return (re.match(YoutubeIE._VALID_URL, url) is not None)
752 def report_lang(self):
753 """Report attempt to set language."""
754 self._downloader.to_screen(u'[youtube] Setting language')
756 def report_login(self):
757 """Report attempt to log in."""
758 self._downloader.to_screen(u'[youtube] Logging in')
760 def report_age_confirmation(self):
761 """Report attempt to confirm age."""
762 self._downloader.to_screen(u'[youtube] Confirming age')
764 def report_video_webpage_download(self, video_id):
765 """Report attempt to download video webpage."""
766 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
768 def report_video_info_webpage_download(self, video_id):
769 """Report attempt to download video info webpage."""
770 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
772 def report_information_extraction(self, video_id):
773 """Report attempt to extract video information."""
774 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
776 def report_unavailable_format(self, video_id, format):
777 """Report extracted video URL."""
778 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
780 def report_rtmp_download(self):
781 """Indicate the download will use the RTMP protocol."""
782 self._downloader.to_screen(u'[youtube] RTMP download detected')
784 def _real_initialize(self):
785 if self._downloader is None:
790 downloader_params = self._downloader.params
792 # Attempt to use provided username and password or .netrc data
793 if downloader_params.get('username', None) is not None:
794 username = downloader_params['username']
795 password = downloader_params['password']
796 elif downloader_params.get('usenetrc', False):
798 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
803 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
804 except (IOError, netrc.NetrcParseError), err:
805 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
809 request = urllib2.Request(self._LANG_URL, None, std_headers)
812 urllib2.urlopen(request).read()
813 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
814 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
817 # No authentication to be performed
823 'current_form': 'loginForm',
825 'action_login': 'Log In',
826 'username': username,
827 'password': password,
829 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
832 login_results = urllib2.urlopen(request).read()
833 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
834 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
836 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
837 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
843 'action_confirm': 'Confirm',
845 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
847 self.report_age_confirmation()
848 age_results = urllib2.urlopen(request).read()
849 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
850 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
853 def _real_extract(self, url):
854 # Extract video id from URL
855 mobj = re.match(self._VALID_URL, url)
857 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
859 video_id = mobj.group(2)
862 self.report_video_webpage_download(video_id)
863 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id, None, std_headers)
865 video_webpage = urllib2.urlopen(request).read()
866 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
867 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
870 # Attempt to extract SWF player URL
871 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
873 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
878 self.report_video_info_webpage_download(video_id)
879 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
880 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
881 % (video_id, el_type))
882 request = urllib2.Request(video_info_url, None, std_headers)
884 video_info_webpage = urllib2.urlopen(request).read()
885 video_info = parse_qs(video_info_webpage)
886 if 'token' in video_info:
888 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
889 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
891 if 'token' not in video_info:
892 if 'reason' in video_info:
893 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
895 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
898 # Start extracting information
899 self.report_information_extraction(video_id)
902 if 'author' not in video_info:
903 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
905 video_uploader = urllib.unquote_plus(video_info['author'][0])
908 if 'title' not in video_info:
909 self._downloader.trouble(u'ERROR: unable to extract video title')
911 video_title = urllib.unquote_plus(video_info['title'][0])
912 video_title = video_title.decode('utf-8')
913 video_title = sanitize_title(video_title)
916 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
917 simple_title = simple_title.strip(ur'_')
920 if 'thumbnail_url' not in video_info:
921 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
923 else: # don't panic if we can't find it
924 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
928 mobj = re.search(r'id="eow-date".*?>(.*?)</span>', video_webpage, re.DOTALL)
930 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
931 format_expressions = ['%d %B %Y', '%B %d %Y']
932 for expression in format_expressions:
934 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
939 video_description = 'No description available.'
940 if self._downloader.params.get('forcedescription', False):
941 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
943 video_description = mobj.group(1)
946 video_token = urllib.unquote_plus(video_info['token'][0])
948 # Decide which formats to download
949 req_format = self._downloader.params.get('format', None)
950 get_video_template = 'http://www.youtube.com/get_video?video_id=%s&t=%s&eurl=&el=&ps=&asv=&fmt=%%s' % (video_id, video_token)
952 if 'fmt_url_map' in video_info:
953 url_map = dict(tuple(pair.split('|')) for pair in video_info['fmt_url_map'][0].split(','))
954 format_limit = self._downloader.params.get('format_limit', None)
955 if format_limit is not None and format_limit in self._available_formats:
956 format_list = self._available_formats[self._available_formats.index(format_limit):]
958 format_list = self._available_formats
959 existing_formats = [x for x in format_list if x in url_map]
960 if len(existing_formats) == 0:
961 self._downloader.trouble(u'ERROR: no known formats available for video')
963 if req_format is None:
964 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
965 elif req_format == '-1':
966 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
968 if req_format in url_map:
969 video_url_list = [(req_format, url_map[req_format])] # Specific format
971 video_url_list = [(req_format, get_video_template % req_format)] # Specific format
973 elif 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
974 self.report_rtmp_download()
975 video_url_list = [(None, video_info['conn'][0])]
978 self._downloader.trouble(u'ERROR: no fmt_url_map or conn information found in video info')
981 for format_param, video_real_url in video_url_list:
982 # At this point we have a new video
983 self._downloader.increment_downloads()
986 video_extension = self._video_extensions.get(format_param, 'flv')
988 # Find the video URL in fmt_url_map or conn paramters
990 # Process video information
991 self._downloader.process_info({
992 'id': video_id.decode('utf-8'),
993 'url': video_real_url.decode('utf-8'),
994 'uploader': video_uploader.decode('utf-8'),
995 'upload_date': upload_date,
996 'title': video_title,
997 'stitle': simple_title,
998 'ext': video_extension.decode('utf-8'),
999 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1000 'thumbnail': video_thumbnail.decode('utf-8'),
1001 'description': video_description.decode('utf-8'),
1002 'player_url': player_url,
1004 except UnavailableVideoError, err:
1005 self._downloader.trouble(u'ERROR: unable to download video (format may not be available)')
1008 class MetacafeIE(InfoExtractor):
1009 """Information Extractor for metacafe.com."""
1011 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1012 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1013 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1016 def __init__(self, youtube_ie, downloader=None):
1017 InfoExtractor.__init__(self, downloader)
1018 self._youtube_ie = youtube_ie
1022 return (re.match(MetacafeIE._VALID_URL, url) is not None)
1024 def report_disclaimer(self):
1025 """Report disclaimer retrieval."""
1026 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1028 def report_age_confirmation(self):
1029 """Report attempt to confirm age."""
1030 self._downloader.to_screen(u'[metacafe] Confirming age')
1032 def report_download_webpage(self, video_id):
1033 """Report webpage download."""
1034 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1036 def report_extraction(self, video_id):
1037 """Report information extraction."""
1038 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1040 def _real_initialize(self):
1041 # Retrieve disclaimer
1042 request = urllib2.Request(self._DISCLAIMER, None, std_headers)
1044 self.report_disclaimer()
1045 disclaimer = urllib2.urlopen(request).read()
1046 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1047 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1053 'submit': "Continue - I'm over 18",
1055 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
1057 self.report_age_confirmation()
1058 disclaimer = urllib2.urlopen(request).read()
1059 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1060 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1063 def _real_extract(self, url):
1064 # Extract id and simplified title from URL
1065 mobj = re.match(self._VALID_URL, url)
1067 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1070 video_id = mobj.group(1)
1072 # Check if video comes from YouTube
1073 mobj2 = re.match(r'^yt-(.*)$', video_id)
1074 if mobj2 is not None:
1075 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1078 # At this point we have a new video
1079 self._downloader.increment_downloads()
1081 simple_title = mobj.group(2).decode('utf-8')
1083 # Retrieve video webpage to extract further information
1084 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1086 self.report_download_webpage(video_id)
1087 webpage = urllib2.urlopen(request).read()
1088 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1089 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1092 # Extract URL, uploader and title from webpage
1093 self.report_extraction(video_id)
1094 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1095 if mobj is not None:
1096 mediaURL = urllib.unquote(mobj.group(1))
1097 video_extension = mediaURL[-3:]
1099 # Extract gdaKey if available
1100 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1102 video_url = mediaURL
1104 gdaKey = mobj.group(1)
1105 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1107 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1109 self._downloader.trouble(u'ERROR: unable to extract media URL')
1111 vardict = parse_qs(mobj.group(1))
1112 if 'mediaData' not in vardict:
1113 self._downloader.trouble(u'ERROR: unable to extract media URL')
1115 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1117 self._downloader.trouble(u'ERROR: unable to extract media URL')
1119 mediaURL = mobj.group(1).replace('\\/', '/')
1120 video_extension = mediaURL[-3:]
1121 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1123 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1125 self._downloader.trouble(u'ERROR: unable to extract title')
1127 video_title = mobj.group(1).decode('utf-8')
1128 video_title = sanitize_title(video_title)
1130 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1132 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1134 video_uploader = mobj.group(1)
1137 # Process video information
1138 self._downloader.process_info({
1139 'id': video_id.decode('utf-8'),
1140 'url': video_url.decode('utf-8'),
1141 'uploader': video_uploader.decode('utf-8'),
1142 'upload_date': u'NA',
1143 'title': video_title,
1144 'stitle': simple_title,
1145 'ext': video_extension.decode('utf-8'),
1149 except UnavailableVideoError:
1150 self._downloader.trouble(u'ERROR: unable to download video')
1153 class DailymotionIE(InfoExtractor):
1154 """Information Extractor for Dailymotion"""
1156 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1158 def __init__(self, downloader=None):
1159 InfoExtractor.__init__(self, downloader)
1163 return (re.match(DailymotionIE._VALID_URL, url) is not None)
1165 def report_download_webpage(self, video_id):
1166 """Report webpage download."""
1167 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1169 def report_extraction(self, video_id):
1170 """Report information extraction."""
1171 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1173 def _real_initialize(self):
1176 def _real_extract(self, url):
1177 # Extract id and simplified title from URL
1178 mobj = re.match(self._VALID_URL, url)
1180 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1183 # At this point we have a new video
1184 self._downloader.increment_downloads()
1185 video_id = mobj.group(1)
1187 simple_title = mobj.group(2).decode('utf-8')
1188 video_extension = 'flv'
1190 # Retrieve video webpage to extract further information
1191 request = urllib2.Request(url)
1193 self.report_download_webpage(video_id)
1194 webpage = urllib2.urlopen(request).read()
1195 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1196 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1199 # Extract URL, uploader and title from webpage
1200 self.report_extraction(video_id)
1201 mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
1203 self._downloader.trouble(u'ERROR: unable to extract media URL')
1205 mediaURL = urllib.unquote(mobj.group(1))
1207 # if needed add http://www.dailymotion.com/ if relative URL
1209 video_url = mediaURL
1211 # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
1212 mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
1214 self._downloader.trouble(u'ERROR: unable to extract title')
1216 video_title = mobj.group(1).decode('utf-8')
1217 video_title = sanitize_title(video_title)
1219 mobj = re.search(r'(?im)<div class="dmco_html owner">.*?<a class="name" href="/.+?">(.+?)</a>', webpage)
1221 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1223 video_uploader = mobj.group(1)
1226 # Process video information
1227 self._downloader.process_info({
1228 'id': video_id.decode('utf-8'),
1229 'url': video_url.decode('utf-8'),
1230 'uploader': video_uploader.decode('utf-8'),
1231 'upload_date': u'NA',
1232 'title': video_title,
1233 'stitle': simple_title,
1234 'ext': video_extension.decode('utf-8'),
1238 except UnavailableVideoError:
1239 self._downloader.trouble(u'ERROR: unable to download video')
1241 class GoogleIE(InfoExtractor):
1242 """Information extractor for video.google.com."""
1244 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1246 def __init__(self, downloader=None):
1247 InfoExtractor.__init__(self, downloader)
1251 return (re.match(GoogleIE._VALID_URL, url) is not None)
1253 def report_download_webpage(self, video_id):
1254 """Report webpage download."""
1255 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1257 def report_extraction(self, video_id):
1258 """Report information extraction."""
1259 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1261 def _real_initialize(self):
1264 def _real_extract(self, url):
1265 # Extract id from URL
1266 mobj = re.match(self._VALID_URL, url)
1268 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1271 # At this point we have a new video
1272 self._downloader.increment_downloads()
1273 video_id = mobj.group(1)
1275 video_extension = 'mp4'
1277 # Retrieve video webpage to extract further information
1278 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1280 self.report_download_webpage(video_id)
1281 webpage = urllib2.urlopen(request).read()
1282 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1283 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1286 # Extract URL, uploader, and title from webpage
1287 self.report_extraction(video_id)
1288 mobj = re.search(r"download_url:'([^']+)'", webpage)
1290 video_extension = 'flv'
1291 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1293 self._downloader.trouble(u'ERROR: unable to extract media URL')
1295 mediaURL = urllib.unquote(mobj.group(1))
1296 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1297 mediaURL = mediaURL.replace('\\x26', '\x26')
1299 video_url = mediaURL
1301 mobj = re.search(r'<title>(.*)</title>', webpage)
1303 self._downloader.trouble(u'ERROR: unable to extract title')
1305 video_title = mobj.group(1).decode('utf-8')
1306 video_title = sanitize_title(video_title)
1307 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1309 # Extract video description
1310 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1312 self._downloader.trouble(u'ERROR: unable to extract video description')
1314 video_description = mobj.group(1).decode('utf-8')
1315 if not video_description:
1316 video_description = 'No description available.'
1318 # Extract video thumbnail
1319 if self._downloader.params.get('forcethumbnail', False):
1320 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1322 webpage = urllib2.urlopen(request).read()
1323 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1324 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1326 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1328 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1330 video_thumbnail = mobj.group(1)
1331 else: # we need something to pass to process_info
1332 video_thumbnail = ''
1336 # Process video information
1337 self._downloader.process_info({
1338 'id': video_id.decode('utf-8'),
1339 'url': video_url.decode('utf-8'),
1341 'upload_date': u'NA',
1342 'title': video_title,
1343 'stitle': simple_title,
1344 'ext': video_extension.decode('utf-8'),
1348 except UnavailableVideoError:
1349 self._downloader.trouble(u'ERROR: unable to download video')
1352 class PhotobucketIE(InfoExtractor):
1353 """Information extractor for photobucket.com."""
1355 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1357 def __init__(self, downloader=None):
1358 InfoExtractor.__init__(self, downloader)
1362 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1364 def report_download_webpage(self, video_id):
1365 """Report webpage download."""
1366 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1368 def report_extraction(self, video_id):
1369 """Report information extraction."""
1370 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1372 def _real_initialize(self):
1375 def _real_extract(self, url):
1376 # Extract id from URL
1377 mobj = re.match(self._VALID_URL, url)
1379 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1382 # At this point we have a new video
1383 self._downloader.increment_downloads()
1384 video_id = mobj.group(1)
1386 video_extension = 'flv'
1388 # Retrieve video webpage to extract further information
1389 request = urllib2.Request(url)
1391 self.report_download_webpage(video_id)
1392 webpage = urllib2.urlopen(request).read()
1393 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1394 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1397 # Extract URL, uploader, and title from webpage
1398 self.report_extraction(video_id)
1399 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1401 self._downloader.trouble(u'ERROR: unable to extract media URL')
1403 mediaURL = urllib.unquote(mobj.group(1))
1405 video_url = mediaURL
1407 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1409 self._downloader.trouble(u'ERROR: unable to extract title')
1411 video_title = mobj.group(1).decode('utf-8')
1412 video_title = sanitize_title(video_title)
1413 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1415 video_uploader = mobj.group(2).decode('utf-8')
1418 # Process video information
1419 self._downloader.process_info({
1420 'id': video_id.decode('utf-8'),
1421 'url': video_url.decode('utf-8'),
1422 'uploader': video_uploader,
1423 'upload_date': u'NA',
1424 'title': video_title,
1425 'stitle': simple_title,
1426 'ext': video_extension.decode('utf-8'),
1430 except UnavailableVideoError:
1431 self._downloader.trouble(u'ERROR: unable to download video')
1434 class YahooIE(InfoExtractor):
1435 """Information extractor for video.yahoo.com."""
1437 # _VALID_URL matches all Yahoo! Video URLs
1438 # _VPAGE_URL matches only the extractable '/watch/' URLs
1439 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1440 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1442 def __init__(self, downloader=None):
1443 InfoExtractor.__init__(self, downloader)
1447 return (re.match(YahooIE._VALID_URL, url) is not None)
1449 def report_download_webpage(self, video_id):
1450 """Report webpage download."""
1451 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1453 def report_extraction(self, video_id):
1454 """Report information extraction."""
1455 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1457 def _real_initialize(self):
1460 def _real_extract(self, url, new_video=True):
1461 # Extract ID from URL
1462 mobj = re.match(self._VALID_URL, url)
1464 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1467 # At this point we have a new video
1468 self._downloader.increment_downloads()
1469 video_id = mobj.group(2)
1470 video_extension = 'flv'
1472 # Rewrite valid but non-extractable URLs as
1473 # extractable English language /watch/ URLs
1474 if re.match(self._VPAGE_URL, url) is None:
1475 request = urllib2.Request(url)
1477 webpage = urllib2.urlopen(request).read()
1478 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1479 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1482 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1484 self._downloader.trouble(u'ERROR: Unable to extract id field')
1486 yahoo_id = mobj.group(1)
1488 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1490 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1492 yahoo_vid = mobj.group(1)
1494 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1495 return self._real_extract(url, new_video=False)
1497 # Retrieve video webpage to extract further information
1498 request = urllib2.Request(url)
1500 self.report_download_webpage(video_id)
1501 webpage = urllib2.urlopen(request).read()
1502 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1503 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1506 # Extract uploader and title from webpage
1507 self.report_extraction(video_id)
1508 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1510 self._downloader.trouble(u'ERROR: unable to extract video title')
1512 video_title = mobj.group(1).decode('utf-8')
1513 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1515 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1517 self._downloader.trouble(u'ERROR: unable to extract video uploader')
1519 video_uploader = mobj.group(1).decode('utf-8')
1521 # Extract video thumbnail
1522 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1524 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1526 video_thumbnail = mobj.group(1).decode('utf-8')
1528 # Extract video description
1529 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1531 self._downloader.trouble(u'ERROR: unable to extract video description')
1533 video_description = mobj.group(1).decode('utf-8')
1534 if not video_description: video_description = 'No description available.'
1536 # Extract video height and width
1537 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1539 self._downloader.trouble(u'ERROR: unable to extract video height')
1541 yv_video_height = mobj.group(1)
1543 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1545 self._downloader.trouble(u'ERROR: unable to extract video width')
1547 yv_video_width = mobj.group(1)
1549 # Retrieve video playlist to extract media URL
1550 # I'm not completely sure what all these options are, but we
1551 # seem to need most of them, otherwise the server sends a 401.
1552 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1553 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1554 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1555 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1556 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1558 self.report_download_webpage(video_id)
1559 webpage = urllib2.urlopen(request).read()
1560 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1561 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1564 # Extract media URL from playlist XML
1565 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1567 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1569 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1570 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1573 # Process video information
1574 self._downloader.process_info({
1575 'id': video_id.decode('utf-8'),
1577 'uploader': video_uploader,
1578 'upload_date': u'NA',
1579 'title': video_title,
1580 'stitle': simple_title,
1581 'ext': video_extension.decode('utf-8'),
1582 'thumbnail': video_thumbnail.decode('utf-8'),
1583 'description': video_description,
1584 'thumbnail': video_thumbnail,
1585 'description': video_description,
1588 except UnavailableVideoError:
1589 self._downloader.trouble(u'ERROR: unable to download video')
1592 class GenericIE(InfoExtractor):
1593 """Generic last-resort information extractor."""
1595 def __init__(self, downloader=None):
1596 InfoExtractor.__init__(self, downloader)
1602 def report_download_webpage(self, video_id):
1603 """Report webpage download."""
1604 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1605 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1607 def report_extraction(self, video_id):
1608 """Report information extraction."""
1609 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1611 def _real_initialize(self):
1614 def _real_extract(self, url):
1615 # At this point we have a new video
1616 self._downloader.increment_downloads()
1618 video_id = url.split('/')[-1]
1619 request = urllib2.Request(url)
1621 self.report_download_webpage(video_id)
1622 webpage = urllib2.urlopen(request).read()
1623 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1624 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1626 except ValueError, err:
1627 # since this is the last-resort InfoExtractor, if
1628 # this error is thrown, it'll be thrown here
1629 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1632 self.report_extraction(video_id)
1633 # Start with something easy: JW Player in SWFObject
1634 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1636 # Broaden the search a little bit
1637 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1639 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1642 # It's possible that one of the regexes
1643 # matched, but returned an empty group:
1644 if mobj.group(1) is None:
1645 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1648 video_url = urllib.unquote(mobj.group(1))
1649 video_id = os.path.basename(video_url)
1651 # here's a fun little line of code for you:
1652 video_extension = os.path.splitext(video_id)[1][1:]
1653 video_id = os.path.splitext(video_id)[0]
1655 # it's tempting to parse this further, but you would
1656 # have to take into account all the variations like
1657 # Video Title - Site Name
1658 # Site Name | Video Title
1659 # Video Title - Tagline | Site Name
1660 # and so on and so forth; it's just not practical
1661 mobj = re.search(r'<title>(.*)</title>', webpage)
1663 self._downloader.trouble(u'ERROR: unable to extract title')
1665 video_title = mobj.group(1).decode('utf-8')
1666 video_title = sanitize_title(video_title)
1667 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1669 # video uploader is domain name
1670 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1672 self._downloader.trouble(u'ERROR: unable to extract title')
1674 video_uploader = mobj.group(1).decode('utf-8')
1677 # Process video information
1678 self._downloader.process_info({
1679 'id': video_id.decode('utf-8'),
1680 'url': video_url.decode('utf-8'),
1681 'uploader': video_uploader,
1682 'upload_date': u'NA',
1683 'title': video_title,
1684 'stitle': simple_title,
1685 'ext': video_extension.decode('utf-8'),
1689 except UnavailableVideoError, err:
1690 self._downloader.trouble(u'ERROR: unable to download video')
1693 class YoutubeSearchIE(InfoExtractor):
1694 """Information Extractor for YouTube search queries."""
1695 _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
1696 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
1697 _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
1698 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1700 _max_youtube_results = 1000
1702 def __init__(self, youtube_ie, downloader=None):
1703 InfoExtractor.__init__(self, downloader)
1704 self._youtube_ie = youtube_ie
1708 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
1710 def report_download_page(self, query, pagenum):
1711 """Report attempt to download playlist page with given number."""
1712 query = query.decode(preferredencoding())
1713 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1715 def _real_initialize(self):
1716 self._youtube_ie.initialize()
1718 def _real_extract(self, query):
1719 mobj = re.match(self._VALID_QUERY, query)
1721 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1724 prefix, query = query.split(':')
1726 query = query.encode('utf-8')
1728 self._download_n_results(query, 1)
1730 elif prefix == 'all':
1731 self._download_n_results(query, self._max_youtube_results)
1737 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1739 elif n > self._max_youtube_results:
1740 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1741 n = self._max_youtube_results
1742 self._download_n_results(query, n)
1744 except ValueError: # parsing prefix as integer fails
1745 self._download_n_results(query, 1)
1748 def _download_n_results(self, query, n):
1749 """Downloads a specified number of results for a query"""
1752 already_seen = set()
1756 self.report_download_page(query, pagenum)
1757 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1758 request = urllib2.Request(result_url, None, std_headers)
1760 page = urllib2.urlopen(request).read()
1761 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1762 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1765 # Extract video identifiers
1766 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1767 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
1768 if video_id not in already_seen:
1769 video_ids.append(video_id)
1770 already_seen.add(video_id)
1771 if len(video_ids) == n:
1772 # Specified n videos reached
1773 for id in video_ids:
1774 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1777 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1778 for id in video_ids:
1779 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1782 pagenum = pagenum + 1
1784 class GoogleSearchIE(InfoExtractor):
1785 """Information Extractor for Google Video search queries."""
1786 _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
1787 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1788 _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
1789 _MORE_PAGES_INDICATOR = r'<span>Next</span>'
1791 _max_google_results = 1000
1793 def __init__(self, google_ie, downloader=None):
1794 InfoExtractor.__init__(self, downloader)
1795 self._google_ie = google_ie
1799 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
1801 def report_download_page(self, query, pagenum):
1802 """Report attempt to download playlist page with given number."""
1803 query = query.decode(preferredencoding())
1804 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1806 def _real_initialize(self):
1807 self._google_ie.initialize()
1809 def _real_extract(self, query):
1810 mobj = re.match(self._VALID_QUERY, query)
1812 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1815 prefix, query = query.split(':')
1817 query = query.encode('utf-8')
1819 self._download_n_results(query, 1)
1821 elif prefix == 'all':
1822 self._download_n_results(query, self._max_google_results)
1828 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1830 elif n > self._max_google_results:
1831 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1832 n = self._max_google_results
1833 self._download_n_results(query, n)
1835 except ValueError: # parsing prefix as integer fails
1836 self._download_n_results(query, 1)
1839 def _download_n_results(self, query, n):
1840 """Downloads a specified number of results for a query"""
1843 already_seen = set()
1847 self.report_download_page(query, pagenum)
1848 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1849 request = urllib2.Request(result_url, None, std_headers)
1851 page = urllib2.urlopen(request).read()
1852 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1853 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1856 # Extract video identifiers
1857 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1858 video_id = mobj.group(1)
1859 if video_id not in already_seen:
1860 video_ids.append(video_id)
1861 already_seen.add(video_id)
1862 if len(video_ids) == n:
1863 # Specified n videos reached
1864 for id in video_ids:
1865 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1868 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1869 for id in video_ids:
1870 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1873 pagenum = pagenum + 1
1875 class YahooSearchIE(InfoExtractor):
1876 """Information Extractor for Yahoo! Video search queries."""
1877 _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
1878 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1879 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1880 _MORE_PAGES_INDICATOR = r'\s*Next'
1882 _max_yahoo_results = 1000
1884 def __init__(self, yahoo_ie, downloader=None):
1885 InfoExtractor.__init__(self, downloader)
1886 self._yahoo_ie = yahoo_ie
1890 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
1892 def report_download_page(self, query, pagenum):
1893 """Report attempt to download playlist page with given number."""
1894 query = query.decode(preferredencoding())
1895 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1897 def _real_initialize(self):
1898 self._yahoo_ie.initialize()
1900 def _real_extract(self, query):
1901 mobj = re.match(self._VALID_QUERY, query)
1903 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1906 prefix, query = query.split(':')
1908 query = query.encode('utf-8')
1910 self._download_n_results(query, 1)
1912 elif prefix == 'all':
1913 self._download_n_results(query, self._max_yahoo_results)
1919 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1921 elif n > self._max_yahoo_results:
1922 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1923 n = self._max_yahoo_results
1924 self._download_n_results(query, n)
1926 except ValueError: # parsing prefix as integer fails
1927 self._download_n_results(query, 1)
1930 def _download_n_results(self, query, n):
1931 """Downloads a specified number of results for a query"""
1934 already_seen = set()
1938 self.report_download_page(query, pagenum)
1939 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1940 request = urllib2.Request(result_url, None, std_headers)
1942 page = urllib2.urlopen(request).read()
1943 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1944 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1947 # Extract video identifiers
1948 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1949 video_id = mobj.group(1)
1950 if video_id not in already_seen:
1951 video_ids.append(video_id)
1952 already_seen.add(video_id)
1953 if len(video_ids) == n:
1954 # Specified n videos reached
1955 for id in video_ids:
1956 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1959 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1960 for id in video_ids:
1961 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1964 pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    # Matches view_play_list / my_playlists URLs and the user/.../user/ form;
    # group(1) captures the playlist id.
    # NOTE(review): the dot in 'youtube.com' is unescaped, so the pattern is
    # slightly looser than intended.
    _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists)\?.*?p=|user/.*?/user/)([^&]+).*'
    # Page template: (playlist id, page number); locale pinned to US English
    # so the "Next" pagination marker below stays matchable.
    _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
    # One match per video link on a playlist page; group(1) is the video id.
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
    # A "Next" anchor means more playlist pages follow.
    _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'

    def __init__(self, youtube_ie, downloader=None):
        # Per-video extraction is delegated to the supplied YoutubeIE
        # ("mutual registration" pattern used by the composite IEs).
        InfoExtractor.__init__(self, downloader)
        self._youtube_ie = youtube_ie

        # NOTE(review): this return belongs to the class's 'suitable(url)'
        # predicate; its 'def'/'@staticmethod' header is elided just above.
        return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_initialize(self):
        # Initialization (e.g. login) is handled by the wrapped YoutubeIE.
        self._youtube_ie.initialize()

    def _real_extract(self, url):
        # NOTE(review): several control-flow lines of this method (the
        # 'if mobj is None' guard, list initialisations, 'try:' statements,
        # the pagination 'while' and early returns) are elided at this
        # point in the file; the statements below are kept verbatim.
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid url: %s' % url)

        # Download playlist pages
        playlist_id = mobj.group(1)
        self.report_download_page(playlist_id, pagenum)
        request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
        page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers, de-duplicated within this page.
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        video_ids.extend(ids_in_page)

        # No "Next" link: last playlist page reached (the loop-exit
        # statement is elided just below the 'if').
        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
        pagenum = pagenum + 1

        # Apply --playlist-start / --playlist-end (1-based on the CLI).
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        # NOTE(review): with the default playlistend == -1 this slice drops
        # the final video (list[a:-1] excludes the last element); -1 should
        # be mapped to None before slicing.
        video_ids = video_ids[playliststart:playlistend]

        # Hand each video to the YouTube extractor.
        for id in video_ids:
            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    # group(1) is the username; the unescaped dot in 'youtube.com' is looser
    # than intended (same remark as the playlist IE).
    _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)'
    # GData feed for a user's uploads; username interpolated below.
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this.

    def __init__(self, youtube_ie, downloader=None):
        # Per-video extraction is delegated to the supplied YoutubeIE.
        InfoExtractor.__init__(self, downloader)
        self._youtube_ie = youtube_ie

        # NOTE(review): belongs to the elided 'suitable(url)' predicate
        # whose header is missing just above.
        return (re.match(YoutubeUserIE._VALID_URL, url) is not None)

    def report_download_page(self, username):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading page ' % (username))

    def _real_initialize(self):
        self._youtube_ie.initialize()

    def _real_extract(self, url):
        # NOTE(review): guards, 'try:' lines, list initialisations and early
        # returns of this method are elided at this point in the file.
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid url: %s' % url)

        # Download user page
        username = mobj.group(1)
        self.report_download_page(username)
        request = urllib2.Request(self._TEMPLATE_URL % (username), None, std_headers)
        page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers from the single GData page (no
        # pagination loop here, unlike the playlist IE).
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        video_ids.extend(ids_in_page)

        # Apply --playlist-start / --playlist-end.
        # NOTE(review): same slicing caveat as YoutubePlaylistIE — the
        # default playlistend == -1 drops the last video.
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        video_ids = video_ids[playliststart:playlistend]

        for id in video_ids:
            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    # (?#locale) is a regex comment: the optional '../' segment is meant to
    # match a two-letter locale path prefix (e.g. 'en/', 'de/').
    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

        # NOTE(review): belongs to the elided 'suitable(url)' predicate
        # whose header is missing just above.
        return (re.match(DepositFilesIE._VALID_URL, url) is not None)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_initialize(self):
        # Body elided at this point in the file (no initialization needed).

    def _real_extract(self, url):
        # NOTE(review): guards, 'try:' lines, an 'else:' branch and early
        # returns of this method are elided at this point in the file.
        # At this point we have a new file
        self._downloader.increment_downloads()

        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = urllib2.Request(url, urllib.urlencode(free_download_indication), std_headers)
        self.report_download_webpage(file_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Collapse whitespace in the site's human-readable message.
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            # (original 'else:' branch elided; both paths end by returning)
            self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        # Extension taken from the final URL, without the leading dot.
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        file_title = mobj.group(1).decode('utf-8')

        # Process file information
        # NOTE(review): the closing '})' of this call and the surrounding
        # 'try:' are elided; the except below pairs with that try.
        self._downloader.process_info({
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'upload_date': u'NA',
            'title': file_title,
            'stitle': file_title,
            'ext': file_extension.decode('utf-8'),
        except UnavailableVideoError, err:
            self._downloader.trouble(u'ERROR: unable to download file')
class PostProcessor(object):
    """Base class for download post-processors.

    A PostProcessor is registered on a downloader through its
    add_post_processor() method.  After each successful download the
    downloader walks its chain of post-processors, calling run() on
    each one: the first receives an initial information dictionary,
    and every later one receives the value returned by its
    predecessor.  The chain stops when a post-processor returns None
    or the end of the chain is reached.

    run() may also raise a PostProcessingError, which the downloader
    takes into account.

    PostProcessor objects follow a "mutual registration" process
    similar to InfoExtractor objects.
    """

    def __init__(self, downloader=None):
        """Optionally bind this post-processor to a downloader."""
        self._downloader = downloader

    def set_downloader(self, downloader):
        """Sets the downloader for this PP."""
        self._downloader = downloader

    def run(self, information):
        """Run the PostProcessor on one finished download.

        The "information" argument is a dictionary like the ones
        composed by InfoExtractors, with an extra "filepath" field
        pointing at the downloaded file.  Return None to stop the
        postprocessing chain, or an information dictionary (possibly
        with some fields changed) to pass along to the next
        post-processor.
        """
        return information  # the base class performs no processing
### MAIN PROGRAM ###
if __name__ == '__main__':
    # Modules needed only when running the main program

    # Function to update the program file with the latest version from bitbucket.org
    def update_self(downloader, filename):
        # Note: downloader only used for options
        # Refuse to update when the installed script is not writable.
        if not os.access (filename, os.W_OK):
            sys.exit('ERROR: no write permissions on %s' % filename)

        downloader.to_screen('Updating to latest stable version...')
        # LATEST_VERSION holds the tag/revision name of the newest release;
        # the script then fetches itself at that revision.
        latest_url = 'http://github.com/rg3/youtube-dl/raw/master/LATEST_VERSION'
        latest_version = urllib.urlopen(latest_url).read().strip()
        prog_url = 'http://github.com/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
        newcontent = urllib.urlopen(prog_url).read()
        # NOTE(review): the script overwrites itself in place; a partial
        # download would corrupt the installed copy. The stream.close()
        # call appears elided at this point in the file — confirm the file
        # is closed before success is reported.
        stream = open(filename, 'w')
        stream.write(newcontent)

        downloader.to_screen('Updated to version %s' % latest_version)
# Parse command line
# NOTE(review): the closing parenthesis of this OptionParser(...) call is
# elided just below; keyword arguments are kept verbatim.
# conflict_handler='resolve' lets the explicit -h/-v definitions below
# replace optparse's defaults.
parser = optparse.OptionParser(
    usage='Usage: %prog [options] url...',
    version='2010.12.09',
    conflict_handler='resolve',

parser.add_option('-h', '--help',
        action='help', help='print this help text and exit')
parser.add_option('-v', '--version',
        action='version', help='print program version and exit')
parser.add_option('-U', '--update',
        action='store_true', dest='update_self', help='update this program to latest stable version')
parser.add_option('-i', '--ignore-errors',
        action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
parser.add_option('-r', '--rate-limit',
        dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
# NOTE(review): retries / playlist values arrive as strings from optparse;
# they are converted with long() during validation further down.
parser.add_option('-R', '--retries',
        dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
parser.add_option('--playlist-start',
        dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
parser.add_option('--playlist-end',
        dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)

# Authentication options (-n reads credentials from ~/.netrc).
authentication = optparse.OptionGroup(parser, 'Authentication Options')
authentication.add_option('-u', '--username',
        dest='username', metavar='USERNAME', help='account username')
authentication.add_option('-p', '--password',
        dest='password', metavar='PASSWORD', help='account password')
authentication.add_option('-n', '--netrc',
        action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
parser.add_option_group(authentication)

# Format selection; '-1' is a sentinel meaning "all formats" and is tested
# against opts.format when the output template is chosen below.
video_format = optparse.OptionGroup(parser, 'Video Format Options')
video_format.add_option('-f', '--format',
        action='store', dest='format', metavar='FORMAT', help='video format code')
video_format.add_option('-m', '--mobile-version',
        action='store_const', dest='format', help='alias for -f 17', const='17')
video_format.add_option('--all-formats',
        action='store_const', dest='format', help='download all available video formats', const='-1')
video_format.add_option('--max-quality',
        action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
video_format.add_option('-b', '--best-quality',
        action='store_true', dest='bestquality', help='download the best video quality (DEPRECATED)')
parser.add_option_group(video_format)

# The --get-* options imply both quiet and simulate (see the FileDownloader
# parameter dict below).
verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
verbosity.add_option('-q', '--quiet',
        action='store_true', dest='quiet', help='activates quiet mode', default=False)
verbosity.add_option('-s', '--simulate',
        action='store_true', dest='simulate', help='do not download video', default=False)
verbosity.add_option('-g', '--get-url',
        action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
verbosity.add_option('-e', '--get-title',
        action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
verbosity.add_option('--get-thumbnail',
        action='store_true', dest='getthumbnail', help='simulate, quiet but print thumbnail URL', default=False)
verbosity.add_option('--get-description',
        action='store_true', dest='getdescription', help='simulate, quiet but print video description', default=False)
verbosity.add_option('--no-progress',
        action='store_true', dest='noprogress', help='do not print progress bar', default=False)
parser.add_option_group(verbosity)

filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
filesystem.add_option('-t', '--title',
        action='store_true', dest='usetitle', help='use title in file name', default=False)
filesystem.add_option('-l', '--literal',
        action='store_true', dest='useliteral', help='use literal title in file name', default=False)
filesystem.add_option('-A', '--auto-number',
        action='store_true', dest='autonumber', help='number downloaded files starting from 00000', default=False)
filesystem.add_option('-o', '--output',
        dest='outtmpl', metavar='TEMPLATE', help='output filename template')
filesystem.add_option('-a', '--batch-file',
        dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
filesystem.add_option('-w', '--no-overwrites',
        action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
filesystem.add_option('-c', '--continue',
        action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
filesystem.add_option('--cookies',
        dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
parser.add_option_group(filesystem)

(opts, args) = parser.parse_args()
# Open appropriate CookieJar
# NOTE(review): the 'else:'/'try:'/load() lines of this section are elided;
# with --cookies the jar is a MozillaCookieJar loaded from disk when the
# file exists and is readable.
if opts.cookiefile is None:
    jar = cookielib.CookieJar()
    jar = cookielib.MozillaCookieJar(opts.cookiefile)
    if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
    except (IOError, OSError), err:
        sys.exit(u'ERROR: unable to open cookie file')

# General configuration
cookie_processor = urllib2.HTTPCookieProcessor(jar)
# NOTE(review): the second install_opener() replaces the opener installed on
# the previous line, so the ProxyHandler opener is discarded; a single
# build_opener(urllib2.ProxyHandler(), cookie_processor) would keep both.
urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
urllib2.install_opener(urllib2.build_opener(cookie_processor))
socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

# Batch file verification
# NOTE(review): the surrounding 'try:' and the stdin branch ('-' reads URLs
# from sys.stdin) are elided; the IOError path exits with the message below.
if opts.batchfile is not None:
    if opts.batchfile == '-':
        batchfd = open(opts.batchfile, 'r')
        batchurls = batchfd.readlines()
        batchurls = [x.strip() for x in batchurls]
        # Drop blank lines and comment lines starting with '#', '/' or ';'.
        batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
    sys.exit(u'ERROR: batch file could not be read')
all_urls = batchurls + args

# Conflicting, missing and erroneous options
if opts.bestquality:
    print >>sys.stderr, u'\nWARNING: -b/--best-quality IS DEPRECATED AS IT IS THE DEFAULT BEHAVIOR NOW\n'
if opts.usenetrc and (opts.username is not None or opts.password is not None):
    parser.error(u'using .netrc conflicts with giving username/password')
if opts.password is not None and opts.username is None:
    parser.error(u'account username missing')
if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
    parser.error(u'using output template conflicts with using title, literal title or auto number')
if opts.usetitle and opts.useliteral:
    parser.error(u'using title conflicts with using literal title')
if opts.username is not None and opts.password is None:
    # Prompt interactively rather than requiring -p on the command line.
    opts.password = getpass.getpass(u'Type account password and press return:')
if opts.ratelimit is not None:
    numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
    if numeric_limit is None:
        parser.error(u'invalid rate limit specified')
    opts.ratelimit = numeric_limit
# NOTE(review): each long() conversion below originally sits inside an
# elided 'try:' whose except clause follows it; bad input is reported via
# parser.error. The bodies of the range checks are elided as well.
if opts.retries is not None:
    opts.retries = long(opts.retries)
except (TypeError, ValueError), err:
    parser.error(u'invalid retry count specified')
opts.playliststart = long(opts.playliststart)
if opts.playliststart <= 0:
except (TypeError, ValueError), err:
    parser.error(u'invalid playlist start number specified')
opts.playlistend = long(opts.playlistend)
if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
except (TypeError, ValueError), err:
    parser.error(u'invalid playlist end number specified')

# Information extractors
# The composite IEs (playlist/user/search) receive the concrete IE they
# delegate per-item extraction to.
youtube_ie = YoutubeIE()
metacafe_ie = MetacafeIE(youtube_ie)
dailymotion_ie = DailymotionIE()
youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
youtube_user_ie = YoutubeUserIE(youtube_ie)
youtube_search_ie = YoutubeSearchIE(youtube_ie)
google_ie = GoogleIE()
google_search_ie = GoogleSearchIE(google_ie)
photobucket_ie = PhotobucketIE()
yahoo_ie = YahooIE()
yahoo_search_ie = YahooSearchIE(yahoo_ie)
deposit_files_ie = DepositFilesIE()
generic_ie = GenericIE()

# File downloader
# NOTE(review): the closing '})' of this call is elided below the
# 'logtostderr' entry.
fd = FileDownloader({
    'usenetrc': opts.usenetrc,
    'username': opts.username,
    'password': opts.password,
    # Any --get-* option forces a quiet, simulated run.
    'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
    'forceurl': opts.geturl,
    'forcetitle': opts.gettitle,
    'forcethumbnail': opts.getthumbnail,
    'forcedescription': opts.getdescription,
    'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
    'format': opts.format,
    'format_limit': opts.format_limit,
    # Output template: an explicit -o wins; otherwise pick a default based
    # on the --all-formats / --title / --literal / --auto-number combination
    # (the 'and/or' chain returns the first template whose conditions hold).
    'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
        or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
        or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
        or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
        or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
        or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
        or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
        or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
        or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
        or u'%(id)s.%(ext)s'),
    'ignoreerrors': opts.ignoreerrors,
    'ratelimit': opts.ratelimit,
    'nooverwrites': opts.nooverwrites,
    'retries': opts.retries,
    'continuedl': opts.continue_dl,
    'noprogress': opts.noprogress,
    'playliststart': opts.playliststart,
    'playlistend': opts.playlistend,
    # '-o -' streams the video to stdout, so logging must go to stderr.
    'logtostderr': opts.outtmpl == '-',
# Registration order matters: more specific extractors are tried before the
# plain YoutubeIE.
fd.add_info_extractor(youtube_search_ie)
fd.add_info_extractor(youtube_pl_ie)
fd.add_info_extractor(youtube_user_ie)
fd.add_info_extractor(metacafe_ie)
fd.add_info_extractor(dailymotion_ie)
fd.add_info_extractor(youtube_ie)
fd.add_info_extractor(google_ie)
fd.add_info_extractor(google_search_ie)
fd.add_info_extractor(photobucket_ie)
fd.add_info_extractor(yahoo_ie)
fd.add_info_extractor(yahoo_search_ie)
fd.add_info_extractor(deposit_files_ie)

# This must come last since it's the
# fallback if none of the others work
fd.add_info_extractor(generic_ie)

# Self-update when -U/--update was given.
if opts.update_self:
    update_self(fd, sys.argv[0])

# URLs are required unless the run was only a self-update.
if len(all_urls) < 1:
    if not opts.update_self:
        parser.error(u'you must provide at least one URL')

# NOTE(review): the sys.exit(retcode) that propagates this code appears
# elided after the cookie-jar dump below.
retcode = fd.download(all_urls)

# Dump cookie jar if requested
# NOTE(review): the 'try:'/jar.save() lines are elided here.
if opts.cookiefile is not None:
    except (IOError, OSError), err:
        sys.exit(u'ERROR: unable to save cookie jar')

# Top-level error handling; the opening 'try:' (and the DownloadError
# branch's body) are elided earlier in the file.
except DownloadError:
except SameFileError:
    sys.exit(u'ERROR: fixed output name but more than one file to download')
except KeyboardInterrupt:
    sys.exit(u'\nERROR: Interrupted by user')