2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # Author: Benjamin Johnson
6 # Author: Vasyl' Vavrychuk
7 # License: Public domain code
26 # parse_qs was moved from the cgi module to the urlparse module recently.
28 from urlparse import parse_qs
30 from cgi import parse_qs
# Default HTTP headers sent with every request; mimic a desktop Firefox so
# video sites serve their regular pages.
# NOTE(review): the opening `std_headers = {` and closing `}` lines of this
# dictionary appear to be missing from this copy of the file.
'User-Agent': 'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.12) Gecko/20101028 Firefox/3.6.12',
'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'en-us,en;q=0.5',

# Characters considered safe for simplified titles: ASCII letters and digits,
# as a unicode string (str.decode() is Python 2 only).
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
def preferredencoding():
	"""Get preferred encoding.

	Returns the best encoding scheme for the system, based on
	locale.getpreferredencoding() and some further tweaks.
	"""
	def yield_preferredencoding():
		# Ask the locale module once for the system's preferred encoding.
		pref = locale.getpreferredencoding()
		# NOTE(review): the generator's yield statement(s) and any fallback
		# handling appear to be missing from this copy of the file.
	# Take the first value produced by the generator (Python 2 .next()).
	return yield_preferredencoding().next()
def htmlentity_transform(matchobj):
	"""Transforms an HTML entity to a Unicode character.

	This function receives a match object and is intended to be used with
	the re.sub() function.
	"""
	entity = matchobj.group(1)

	# Known non-numeric HTML entity
	if entity in htmlentitydefs.name2codepoint:
		return unichr(htmlentitydefs.name2codepoint[entity])

	# Numeric character reference, e.g. "#34" or "#x22".
	mobj = re.match(ur'(?u)#(x?\d+)', entity)
	# NOTE(review): an `if mobj is not None:` guard appears to be missing here.
	numstr = mobj.group(1)
	if numstr.startswith(u'x'):
		# Hexadecimal reference: prefix with 0 so it parses as "0x...".
		numstr = u'0%s' % numstr
	# NOTE(review): the assignment of `base` (16 for hex, 10 otherwise)
	# appears to be missing from this copy of the file.
	return unichr(long(numstr, base))

	# Unknown entity in name, return its literal representation
	return (u'&%s;' % entity)
83 def sanitize_title(utitle):
84 """Sanitizes a video title so it could be used as part of a filename."""
85 utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
86 return utitle.replace(unicode(os.sep), u'%')
def sanitize_open(filename, open_mode):
	"""Try to open the given filename, and slightly tweak it if this fails.

	Attempts to open the given filename. If this fails, it tries to change
	the filename slightly, step by step, until it's either able to open it
	or it fails and raises a final exception, like the standard open()
	function.

	It returns the tuple (stream, definitive_file_name).
	"""
	# NOTE(review): a `try:` and the check for the special filename u'-'
	# (meaning "write to stdout") appear to be missing from this copy.
	if sys.platform == 'win32':
		# NOTE(review): `import msvcrt` appears to be missing here.
		# Put stdout in binary mode so video data is not newline-mangled.
		msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
		return (sys.stdout, filename)
	stream = open(filename, open_mode)
	return (stream, filename)
	except (IOError, OSError), err:
		# In case of error, try to remove win32 forbidden chars
		filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)

		# An exception here should be caught in the caller
		stream = open(filename, open_mode)
		return (stream, filename)
class DownloadError(Exception):
	"""Download Error exception.

	This exception may be thrown by FileDownloader objects if they are not
	configured to continue on errors. They will contain the appropriate
	error message.
	"""
class SameFileError(Exception):
	"""Same File exception.

	This exception will be thrown by FileDownloader objects if they detect
	multiple files would have to be downloaded to the same file on disk.
	"""
class PostProcessingError(Exception):
	"""Post Processing exception.

	This exception may be raised by PostProcessor's .run() method to
	indicate an error in the postprocessing task.
	"""
class UnavailableVideoError(Exception):
	"""Unavailable Format exception.

	This exception will be thrown when a video is requested
	in a format that is not available for that video.
	"""
class ContentTooShortError(Exception):
	"""Content Too Short exception.

	This exception may be raised by FileDownloader objects when a file they
	download is too small for what the server announced first, indicating
	the connection was probably interrupted.
	"""
	# downloaded/expected are byte counts.
	def __init__(self, downloaded, expected):
		self.downloaded = downloaded
		self.expected = expected
class FileDownloader(object):
	"""File Downloader class.

	File downloader objects are the ones responsible of downloading the
	actual video file and writing it to disk if the user has requested
	it, among some other tasks. In most cases there should be one per
	program. As, given a video URL, the downloader doesn't know how to
	extract all the needed information, task that InfoExtractors do, it
	has to pass the URL to one of them.

	For this, file downloader objects have a method that allows
	InfoExtractors to be registered in a given order. When it is passed
	a URL, the file downloader handles it to the first InfoExtractor it
	finds that reports being able to handle it. The InfoExtractor extracts
	all the information about the video or videos the URL refers to, and
	asks the FileDownloader to process the video information, possibly
	downloading the video.

	File downloaders accept a lot of parameters. In order not to saturate
	the object constructor with arguments, it receives a dictionary of
	options instead. These options are available through the params
	attribute for the InfoExtractors to use. The FileDownloader also
	registers itself as the downloader in charge for the InfoExtractors
	that are added to it, so this is a "mutual registration".

	Available options:

	username: Username for authentication purposes.
	password: Password for authentication purposes.
	usenetrc: Use netrc for authentication instead.
	quiet: Do not print messages to stdout.
	forceurl: Force printing final URL.
	forcetitle: Force printing title.
	forcethumbnail: Force printing thumbnail URL.
	forcedescription: Force printing description.
	simulate: Do not download the video files.
	format: Video format code.
	format_limit: Highest quality format to try.
	outtmpl: Template for output names.
	ignoreerrors: Do not stop on download errors.
	ratelimit: Download speed limit, in bytes/sec.
	nooverwrites: Prevent overwriting files.
	retries: Number of times to retry for HTTP error 5xx
	continuedl: Try to continue downloads if possible.
	noprogress: Do not print the progress bar.
	playliststart: Playlist item to start at.
	playlistend: Playlist item to end at.
	logtostderr: Log messages to stderr instead of stdout.
	"""

	# Overwritten per instance in __init__; class-level values document
	# the attributes' existence.
	_download_retcode = None
	_num_downloads = None
	def __init__(self, params):
		"""Create a FileDownloader object with the given options."""
		# NOTE(review): lines initializing the InfoExtractor/PostProcessor
		# lists and storing `params` appear to be missing from this copy.
		self._download_retcode = 0
		self._num_downloads = 0
		# Route screen output to stderr when 'logtostderr' is set.
		self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
	def pmkdir(filename):
		"""Create directory components in filename. Similar to Unix "mkdir -p"."""
		components = filename.split(os.sep)
		# Successive path prefixes, shortest first (excludes the leaf name).
		aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
		aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
		for dir in aggregate:
			if not os.path.exists(dir):
				# NOTE(review): the os.mkdir(dir) call appears to be
				# missing from this copy of the file.
	def temp_name(filename):
		"""Returns a temporary filename for the given filename."""
		# stdout ('-') and non-regular files are downloaded in place.
		if filename == u'-' or (os.path.exists(filename) and not os.path.isfile(filename)):
			# NOTE(review): a `return filename` appears to be missing here.
		return filename + u'.part'
	def format_bytes(bytes):
		"""Format a byte count as a short human-readable string, e.g. '1.23M'."""
		# NOTE(review): handling of None/zero values appears to be missing
		# from this copy of the file.
		if type(bytes) is str:
			# NOTE(review): conversion of the string to a number is
			# missing here.
		# Pick the 1024-based magnitude and its suffix letter.
		exponent = long(math.log(bytes, 1024.0))
		suffix = 'bkMGTPEZY'[exponent]
		converted = float(bytes) / float(1024**exponent)
		return '%.2f%s' % (converted, suffix)
	def calc_percent(byte_counter, data_len):
		# Returns a fixed-width percentage string, e.g. ' 42.0%'.
		# NOTE(review): the guard for `data_len is None` appears to be
		# missing from this copy of the file.
		return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
	def calc_eta(start, now, total, current):
		"""Estimate remaining download time as 'MM:SS' from progress so far."""
		# NOTE(review): the computation of `dif` (elapsed seconds) and the
		# unknown-total guard appear to be missing from this copy.
		if current == 0 or dif < 0.001: # One millisecond
			# NOTE(review): the '--:--' fallback return is missing here.
		rate = float(current) / dif
		eta = long((float(total) - float(current)) / rate)
		(eta_mins, eta_secs) = divmod(eta, 60)
		# NOTE(review): the cap for ETAs of 100 minutes or more appears to
		# be missing here.
		return '%02d:%02d' % (eta_mins, eta_secs)
	def calc_speed(start, now, bytes):
		"""Format the average download speed since `start` as a 10-char string."""
		# NOTE(review): `dif = now - start` appears to be missing here.
		if bytes == 0 or dif < 0.001: # One millisecond
			return '%10s' % '---b/s'
		return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
	def best_block_size(elapsed_time, bytes):
		"""Choose the next read size, aiming at roughly one read per second."""
		new_min = max(bytes / 2.0, 1.0)
		new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
		if elapsed_time < 0.001:
			# NOTE(review): the fast-path return is missing here.
		rate = bytes / elapsed_time
		# NOTE(review): the clamping of `rate` between new_min/new_max and
		# the final return appear to be missing from this copy.
	def parse_bytes(bytestr):
		"""Parse a string indicating a byte quantity into a long integer."""
		matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
		# NOTE(review): the `matchobj is None` guard appears to be missing
		# from this copy of the file.
		number = float(matchobj.group(1))
		# Empty suffix maps to 'b' (index 0), i.e. multiplier 1024**0 == 1.
		multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
		return long(round(number * multiplier))
	def add_info_extractor(self, ie):
		"""Add an InfoExtractor object to the end of the list."""
		# NOTE(review): the append to the internal extractor list appears
		# to be missing from this copy; this is the "mutual registration".
		ie.set_downloader(self)
	def add_post_processor(self, pp):
		"""Add a PostProcessor object to the end of the chain."""
		# NOTE(review): the append to the internal postprocessor chain
		# appears to be missing from this copy.
		pp.set_downloader(self)
	def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
		"""Print message to stdout if not in quiet mode."""
		if not self.params.get('quiet', False):
			# skip_eol suppresses the newline so progress lines can be
			# rewritten in place with '\r'.
			terminator = [u'\n', u''][skip_eol]
			print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
			self._screen_file.flush()
		except (UnicodeEncodeError), err:
			# NOTE(review): the matching `try:` appears to be missing from
			# this copy of the file.
			if not ignore_encoding_errors:
				# NOTE(review): the re-raise is missing here.
	def to_stderr(self, message):
		"""Print message to stderr."""
		# Encode explicitly: stderr may reject unicode in Python 2.
		print >>sys.stderr, message.encode(preferredencoding())
334 def fixed_template(self):
335 """Checks if the output template is fixed."""
336 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
338 def trouble(self, message=None):
339 """Determine action to take when a download problem appears.
341 Depending on if the downloader has been configured to ignore
342 download errors or not, this method may throw an exception or
343 not when errors are found, after printing the message.
345 if message is not None:
346 self.to_stderr(message)
347 if not self.params.get('ignoreerrors', False):
348 raise DownloadError(message)
349 self._download_retcode = 1
	def slow_down(self, start_time, byte_counter):
		"""Sleep if the download speed is over the rate limit."""
		rate_limit = self.params.get('ratelimit', None)
		if rate_limit is None or byte_counter == 0:
			# NOTE(review): an early `return` appears to be missing here.
		# NOTE(review): `now = time.time()` appears to be missing here.
		elapsed = now - start_time
		# NOTE(review): a guard for very small `elapsed` is missing here.
		speed = float(byte_counter) / elapsed
		if speed > rate_limit:
			# Sleep long enough for the average to fall back to the limit.
			time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
	def try_rename(self, old_filename, new_filename):
		"""Rename the temporary file to its final name, reporting failures."""
		if old_filename == new_filename:
			# NOTE(review): an early `return` and the `try:` matching the
			# except below appear to be missing from this copy.
		os.rename(old_filename, new_filename)
		except (IOError, OSError), err:
			self.trouble(u'ERROR: unable to rename file')
372 def report_destination(self, filename):
373 """Report destination filename."""
374 self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
	def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
		"""Report download progress."""
		if self.params.get('noprogress', False):
			# NOTE(review): an early `return` appears to be missing here.
		# '\r' + skip_eol rewrites the same screen line on each update.
		self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
				(percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
383 def report_resuming_byte(self, resume_len):
384 """Report attempt to resume at given byte."""
385 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
387 def report_retry(self, count, retries):
388 """Report retry in case of HTTP error 5xx"""
389 self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
	def report_file_already_downloaded(self, file_name):
		"""Report file has already been fully downloaded."""
		# NOTE(review): the `try:` matching the except below appears to be
		# missing from this copy of the file.
		self.to_screen(u'[download] %s has already been downloaded' % file_name)
		except (UnicodeEncodeError), err:
			# Fall back to a generic message if the name cannot be encoded.
			self.to_screen(u'[download] The file has already been downloaded')
398 def report_unable_to_resume(self):
399 """Report it was impossible to resume download."""
400 self.to_screen(u'[download] Unable to resume')
	def report_finish(self):
		"""Report download finished."""
		if self.params.get('noprogress', False):
			# No progress bar was shown; print an explicit completion line.
			self.to_screen(u'[download] Download completed')
		# NOTE(review): the else-branch terminating the progress line
		# appears to be missing from this copy of the file.
409 def increment_downloads(self):
410 """Increment the ordinal that assigns a number to each file."""
411 self._num_downloads += 1
	def process_info(self, info_dict):
		"""Process a single dictionary returned by an InfoExtractor."""
		# NOTE(review): several interleaved lines (try: statements and
		# early returns) appear to be missing from this copy of the file.
		# Do nothing else if in simulate mode
		if self.params.get('simulate', False):
			# Forced printings go to real stdout, bypassing to_screen.
			if self.params.get('forcetitle', False):
				print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forceurl', False):
				print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
				print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forcedescription', False) and 'description' in info_dict:
				print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')

		# Build the output filename: %(epoch)s and %(autonumber)s are
		# synthesized template fields.
		template_dict = dict(info_dict)
		template_dict['epoch'] = unicode(long(time.time()))
		template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
		filename = self.params['outtmpl'] % template_dict
		except (ValueError, KeyError), err:
			self.trouble(u'ERROR: invalid system charset or erroneous output template')
		if self.params.get('nooverwrites', False) and os.path.exists(filename):
			self.to_stderr(u'WARNING: file exists and will be skipped')

		self.pmkdir(filename)
		except (OSError, IOError), err:
			self.trouble(u'ERROR: unable to create directories: %s' % str(err))

		success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
		except (OSError, IOError), err:
			# Filesystem-level failure is treated as the format being
			# unavailable.
			raise UnavailableVideoError
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self.trouble(u'ERROR: unable to download video data: %s' % str(err))
		except (ContentTooShortError, ), err:
			self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))

		self.post_process(filename, info_dict)
		except (PostProcessingError), err:
			self.trouble(u'ERROR: postprocessing: %s' % str(err))
	def download(self, url_list):
		"""Download a given list of URLs."""
		# NOTE(review): the loops over url_list and the registered
		# InfoExtractors, and the extraction call itself, appear to be
		# missing from this copy of the file.
		# A template with no %(...)s fields can only ever name one file.
		if len(url_list) > 1 and self.fixed_template():
			raise SameFileError(self.params['outtmpl'])

		suitable_found = False
		# Go to next InfoExtractor if not suitable
		if not ie.suitable(url):

		# Suitable InfoExtractor found
		suitable_found = True

		# Extract information from URL and process it

		# Suitable InfoExtractor had been found; go to next URL

		if not suitable_found:
			self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)

		return self._download_retcode
	def post_process(self, filename, ie_info):
		"""Run the postprocessing chain on the given file."""
		# NOTE(review): the copy of `ie_info` into `info` and the loop over
		# the postprocessor chain appear to be missing from this copy.
		info['filepath'] = filename
	def _download_with_rtmpdump(self, filename, url, player_url):
		"""Download an RTMP stream by shelling out to the rtmpdump tool."""
		# NOTE(review): some interleaved lines (try:/return statements and
		# the success branch) appear to be missing from this copy.
		self.report_destination(filename)
		tmpfilename = self.temp_name(filename)

		# Check for rtmpdump first
		subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
		except (OSError, IOError):
			self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')

		# Download using rtmpdump. rtmpdump returns exit code 2 when
		# the connection was interrumpted and resuming appears to be
		# possible. This is part of rtmpdump's normal usage, AFAIK.
		basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
		retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
		while retval == 2 or retval == 1:
			prevsize = os.path.getsize(tmpfilename)
			self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
			time.sleep(5.0) # This seems to be needed
			retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
			cursize = os.path.getsize(tmpfilename)
			if prevsize == cursize and retval == 1:
				# No progress since the last attempt: stop retrying.
		self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
		self.try_rename(tmpfilename, filename)
		# Any other exit code at this point is a hard failure.
		self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
	def _do_download(self, filename, url, player_url):
		"""Download `url` to `filename` over HTTP, resuming and retrying
		as configured; delegates rtmp:// URLs to rtmpdump."""
		# NOTE(review): many interleaved lines (try: statements, returns,
		# counter initializations and else-branches) appear to be missing
		# from this copy of the file.
		# Check file already present
		if self.params.get('continuedl', False) and os.path.isfile(filename):
			self.report_file_already_downloaded(filename)

		# Attempt to download using rtmpdump
		if url.startswith('rtmp'):
			return self._download_with_rtmpdump(filename, url, player_url)

		tmpfilename = self.temp_name(filename)

		# basic_request stays without the Range header for the 416 fallback.
		basic_request = urllib2.Request(url, None, std_headers)
		request = urllib2.Request(url, None, std_headers)

		# Establish possible resume length
		if os.path.isfile(tmpfilename):
			resume_len = os.path.getsize(tmpfilename)

		# Request parameters in case of being able to resume
		if self.params.get('continuedl', False) and resume_len != 0:
			self.report_resuming_byte(resume_len)
			request.add_header('Range','bytes=%d-' % resume_len)

		retries = self.params.get('retries', 0)
		while count <= retries:
			# Establish connection
			data = urllib2.urlopen(request)
			except (urllib2.HTTPError, ), err:
				if (err.code < 500 or err.code >= 600) and err.code != 416:
					# Unexpected HTTP error
				elif err.code == 416:
					# Unable to resume (requested range not satisfiable)
					# Open the connection again without the range header
					data = urllib2.urlopen(basic_request)
					content_length = data.info()['Content-Length']
					except (urllib2.HTTPError, ), err:
						if err.code < 500 or err.code >= 600:
					# Examine the reported length
					if (content_length is not None and
						(resume_len - 100 < long(content_length) < resume_len + 100)):
						# The file had already been fully downloaded.
						# Explanation to the above condition: in issue #175 it was revealed that
						# YouTube sometimes adds or removes a few bytes from the end of the file,
						# changing the file size slightly and causing problems for some users. So
						# I decided to implement a suggested change and consider the file
						# completely downloaded if the file size differs less than 100 bytes from
						# the one in the hard drive.
						self.report_file_already_downloaded(filename)
						self.try_rename(tmpfilename, filename)
					# The length does not match, we start the download over
					self.report_unable_to_resume()
			self.report_retry(count, retries)

		self.trouble(u'ERROR: giving up after %s retries' % retries)

		data_len = data.info().get('Content-length', None)
		data_len_str = self.format_bytes(data_len)
		# Main read loop (the loop header is missing from this copy).
		data_block = data.read(block_size)
		data_block_len = len(data_block)
		if data_block_len == 0:
		byte_counter += data_block_len

		# Open file just in time
		(stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
		self.report_destination(filename)
		except (OSError, IOError), err:
			self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
		stream.write(data_block)
		except (IOError, OSError), err:
			self.trouble(u'\nERROR: unable to write data: %s' % str(err))
		# Adapt the read size to the observed throughput.
		block_size = self.best_block_size(after - before, data_block_len)

		# Progress message
		percent_str = self.calc_percent(byte_counter, data_len)
		eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
		speed_str = self.calc_speed(start, time.time(), byte_counter)
		self.report_progress(percent_str, data_len_str, speed_str, eta_str)

		# Apply rate limit
		self.slow_down(start, byte_counter)

		# NOTE(review): comparing str(byte_counter) against the raw
		# Content-length header string is fragile (e.g. a header with
		# leading zeros would spuriously mismatch); a numeric comparison
		# would be safer — flagged, not changed.
		if data_len is not None and str(byte_counter) != data_len:
			raise ContentTooShortError(byte_counter, long(data_len))
		self.try_rename(tmpfilename, filename)
class InfoExtractor(object):
	"""Information Extractor class.

	Information extractors are the classes that, given a URL, extract
	information from the video (or videos) the URL refers to. This
	information includes the real video URL, the video title and simplified
	title, author and others. The information is stored in a dictionary
	which is then passed to the FileDownloader. The FileDownloader
	processes this information possibly downloading the video to the file
	system, among other possible outcomes. The dictionaries must include
	the following fields:

	id: Video identifier.
	url: Final video URL.
	uploader: Nickname of the video uploader.
	title: Literal title.
	stitle: Simplified title.
	ext: Video filename extension.
	format: Video format.
	player_url: SWF Player URL (may be None).

	The following fields are optional. Their primary purpose is to allow
	youtube-dl to serve as the backend for a video search function, such
	as the one in youtube2mp3. They are only used when their respective
	forced printing functions are called:

	thumbnail: Full URL to a video thumbnail image.
	description: One-line video description.

	Subclasses of this one should re-define the _real_initialize() and
	_real_extract() methods, as well as the suitable() static method.
	Probably, they should also be instantiated and added to the main
	downloader.
	"""

	# NOTE(review): several lines are missing from this copy (attribute
	# definitions, @staticmethod decorators and parts of method bodies).

	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		self.set_downloader(downloader)

	# NOTE(review): the `def suitable(url):` line belonging to this
	# docstring is missing from this copy of the file.
	"""Receives a URL and returns True if suitable for this IE."""

	def initialize(self):
		"""Initializes an instance (authentication, etc)."""
		# NOTE(review): the once-only guard around this call appears to be
		# missing from this copy.
		self._real_initialize()

	def extract(self, url):
		"""Extracts URL information and returns it in list of dicts."""
		# NOTE(review): the call to self.initialize() appears to be missing
		# from this copy.
		return self._real_extract(url)

	def set_downloader(self, downloader):
		"""Sets the downloader for this IE."""
		self._downloader = downloader

	def _real_initialize(self):
		"""Real initialization process. Redefine in subclasses."""

	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses."""
class YoutubeIE(InfoExtractor):
	"""Information extractor for youtube.com."""

	# Matches youtu.be short links and the watch-page URL variants; the
	# video id is captured in group 2.
	_VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/(?:(?:v/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))))?([0-9A-Za-z_-]+)(?(1).+)?$'
	# Visiting this URL forces the site language to English so the
	# scraping regexes below match.
	_LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
	_LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
	_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
	_NETRC_MACHINE = 'youtube'
	# Listed in order of quality
	_available_formats = ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13']
	# Maps format code -> filename extension.
	_video_extensions = {
		# NOTE(review): most dictionary entries are missing from this copy.
		'38': 'video', # You actually don't know if this will be MOV, AVI or whatever
	# NOTE(review): the dictionary's closing brace and the
	# `def suitable(url):` line for the return statement below are missing
	# from this copy of the file.
		return (re.match(YoutubeIE._VALID_URL, url) is not None)
750 def report_lang(self):
751 """Report attempt to set language."""
752 self._downloader.to_screen(u'[youtube] Setting language')
754 def report_login(self):
755 """Report attempt to log in."""
756 self._downloader.to_screen(u'[youtube] Logging in')
758 def report_age_confirmation(self):
759 """Report attempt to confirm age."""
760 self._downloader.to_screen(u'[youtube] Confirming age')
762 def report_video_webpage_download(self, video_id):
763 """Report attempt to download video webpage."""
764 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
766 def report_video_info_webpage_download(self, video_id):
767 """Report attempt to download video info webpage."""
768 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
770 def report_information_extraction(self, video_id):
771 """Report attempt to extract video information."""
772 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
774 def report_unavailable_format(self, video_id, format):
775 """Report extracted video URL."""
776 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
778 def report_rtmp_download(self):
779 """Indicate the download will use the RTMP protocol."""
780 self._downloader.to_screen(u'[youtube] RTMP download detected')
	def _real_initialize(self):
		"""Set language, log in if credentials are available, confirm age."""
		# NOTE(review): many interleaved lines (try:/return statements and
		# variable initializations) appear to be missing from this copy.
		if self._downloader is None:
			# Nothing to configure without a downloader.

		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			username = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			# Look up stored credentials in ~/.netrc.
			info = netrc.netrc().authenticators(self._NETRC_MACHINE)
			raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))

		# Force English pages; a failure here is non-fatal.
		request = urllib2.Request(self._LANG_URL, None, std_headers)
		urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))

		# No authentication to be performed

		# Log in by POSTing the login form fields.
		# NOTE(review): the `login_form = {` opening line is missing here.
			'current_form': 'loginForm',
			'action_login': 'Log In',
			'username': username,
			'password': password,
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
		login_results = urllib2.urlopen(request).read()
		# If the login form is still present, authentication failed.
		if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
			self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))

		# Confirm age by POSTing the verification form.
		# NOTE(review): the `age_form = {` opening line is missing here.
			'action_confirm': 'Confirm',
		request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
		self.report_age_confirmation()
		age_results = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
	def _real_extract(self, url):
		"""Extract video information from a YouTube watch URL and hand each
		selected format to the downloader."""
		# NOTE(review): many interleaved lines (try:/return statements,
		# else-branches and some assignments) appear to be missing from
		# this copy of the file.
		# Extract video id from URL
		mobj = re.match(self._VALID_URL, url)
		self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
		video_id = mobj.group(2)

		# Download the watch page (used for SWF URL, upload date, description).
		self.report_video_webpage_download(video_id)
		request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id, None, std_headers)
		video_webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

		# Attempt to extract SWF player URL
		mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
		# Undo the JavaScript escaping (\/ -> /).
		player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

		# Try several 'el' values until get_video_info returns a token.
		self.report_video_info_webpage_download(video_id)
		for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
			video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
					% (video_id, el_type))
			request = urllib2.Request(video_info_url, None, std_headers)
			video_info_webpage = urllib2.urlopen(request).read()
			video_info = parse_qs(video_info_webpage)
			if 'token' in video_info:
				# Found a usable response; stop trying other 'el' values.
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
		if 'token' not in video_info:
			if 'reason' in video_info:
				self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
				self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')

		# Start extracting information
		self.report_information_extraction(video_id)

		# Uploader nickname.
		if 'author' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
		video_uploader = urllib.unquote_plus(video_info['author'][0])

		# Title (entity-decoded and filename-safe).
		if 'title' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract video title')
		video_title = urllib.unquote_plus(video_info['title'][0])
		video_title = video_title.decode('utf-8')
		video_title = sanitize_title(video_title)

		# Simplified title: runs of non-safe chars collapse to '_'.
		simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
		simple_title = simple_title.strip(ur'_')

		# Thumbnail (optional).
		if 'thumbnail_url' not in video_info:
			self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
		else: # don't panic if we can't find it
			video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

		# Upload date, normalized to YYYYMMDD.
		mobj = re.search(r'id="eow-date".*?>(.*?)</span>', video_webpage, re.DOTALL)
		upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
		format_expressions = ['%d %B %Y', '%B %d %Y']
		for expression in format_expressions:
			upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

		# Description (only scraped when forced printing requests it).
		video_description = 'No description available.'
		if self._downloader.params.get('forcedescription', False):
			mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
			video_description = mobj.group(1)

		# Session token needed for get_video URLs.
		video_token = urllib.unquote_plus(video_info['token'][0])

		# Decide which formats to download
		requested_format = self._downloader.params.get('format', None)
		get_video_template = 'http://www.youtube.com/get_video?video_id=%s&t=%s&eurl=&el=&ps=&asv=&fmt=%%s' % (video_id, video_token)

		if 'fmt_url_map' in video_info:
			# fmt_url_map is a comma-separated list of "format|url" pairs.
			url_map = dict(tuple(pair.split('|')) for pair in video_info['fmt_url_map'][0].split(','))
			format_limit = self._downloader.params.get('format_limit', None)
			if format_limit is not None and format_limit in self._available_formats:
				format_list = self._available_formats[self._available_formats.index(format_limit):]
				format_list = self._available_formats
			existing_formats = [x for x in format_list if x in url_map]
			if len(existing_formats) == 0:
				self._downloader.trouble(u'ERROR: no known formats available for video')
			if requested_format is None:
				video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
			elif requested_format == '-1':
				video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
				video_url_list = [(requested_format, get_video_template % requested_format)] # Specific format
		elif 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
			self.report_rtmp_download()
			video_url_list = [(None, video_info['conn'][0])]
			self._downloader.trouble(u'ERROR: no fmt_url_map or conn information found in video info')

		for format_param, video_real_url in video_url_list:
			# At this point we have a new video
			self._downloader.increment_downloads()

			# Extension for this format (flv when unknown).
			video_extension = self._video_extensions.get(format_param, 'flv')

			# Find the video URL in fmt_url_map or conn paramters

			# Process video information
			self._downloader.process_info({
				'id': video_id.decode('utf-8'),
				'url': video_real_url.decode('utf-8'),
				'uploader': video_uploader.decode('utf-8'),
				'upload_date': upload_date,
				'title': video_title,
				'stitle': simple_title,
				'ext': video_extension.decode('utf-8'),
				'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
				'thumbnail': video_thumbnail.decode('utf-8'),
				'description': video_description.decode('utf-8'),
				'player_url': player_url,
			except UnavailableVideoError, err:
				self._downloader.trouble(u'ERROR: unable to download video (format may not be available)')
class MetacafeIE(InfoExtractor):
	"""Information Extractor for metacafe.com."""

	# Group 1: video id, group 2: simplified title taken from the URL.
	_VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
	_DISCLAIMER = 'http://www.metacafe.com/family_filter/'
	_FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'

	def __init__(self, youtube_ie, downloader=None):
		# Keep a YoutubeIE around: some metacafe videos are hosted on
		# YouTube and are delegated to it (see _real_extract).
		InfoExtractor.__init__(self, downloader)
		self._youtube_ie = youtube_ie

	# NOTE(review): the `def suitable(url):` line (and any decorator)
	# belonging to the return statement below is missing from this copy.
		return (re.match(MetacafeIE._VALID_URL, url) is not None)
1019 def report_disclaimer(self):
1020 """Report disclaimer retrieval."""
1021 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1023 def report_age_confirmation(self):
1024 """Report attempt to confirm age."""
1025 self._downloader.to_screen(u'[metacafe] Confirming age')
1027 def report_download_webpage(self, video_id):
1028 """Report webpage download."""
1029 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1031 def report_extraction(self, video_id):
1032 """Report information extraction."""
1033 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1035 def _real_initialize(self):
1036 # Retrieve disclaimer
1037 request = urllib2.Request(self._DISCLAIMER, None, std_headers)
1039 self.report_disclaimer()
1040 disclaimer = urllib2.urlopen(request).read()
1041 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1042 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1048 'submit': "Continue - I'm over 18",
1050 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
1052 self.report_age_confirmation()
1053 disclaimer = urllib2.urlopen(request).read()
1054 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1055 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1058 def _real_extract(self, url):
1059 # Extract id and simplified title from URL
1060 mobj = re.match(self._VALID_URL, url)
1062 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1065 video_id = mobj.group(1)
1067 # Check if video comes from YouTube
1068 mobj2 = re.match(r'^yt-(.*)$', video_id)
1069 if mobj2 is not None:
1070 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1073 # At this point we have a new video
1074 self._downloader.increment_downloads()
1076 simple_title = mobj.group(2).decode('utf-8')
1078 # Retrieve video webpage to extract further information
1079 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1081 self.report_download_webpage(video_id)
1082 webpage = urllib2.urlopen(request).read()
1083 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1084 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1087 # Extract URL, uploader and title from webpage
1088 self.report_extraction(video_id)
1089 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1090 if mobj is not None:
1091 mediaURL = urllib.unquote(mobj.group(1))
1092 video_extension = mediaURL[-3:]
1094 # Extract gdaKey if available
1095 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1097 video_url = mediaURL
1099 gdaKey = mobj.group(1)
1100 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1102 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1104 self._downloader.trouble(u'ERROR: unable to extract media URL')
1106 vardict = parse_qs(mobj.group(1))
1107 if 'mediaData' not in vardict:
1108 self._downloader.trouble(u'ERROR: unable to extract media URL')
1110 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1112 self._downloader.trouble(u'ERROR: unable to extract media URL')
1114 mediaURL = mobj.group(1).replace('\\/', '/')
1115 video_extension = mediaURL[-3:]
1116 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1118 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1120 self._downloader.trouble(u'ERROR: unable to extract title')
1122 video_title = mobj.group(1).decode('utf-8')
1123 video_title = sanitize_title(video_title)
1125 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1127 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1129 video_uploader = mobj.group(1)
1132 # Process video information
1133 self._downloader.process_info({
1134 'id': video_id.decode('utf-8'),
1135 'url': video_url.decode('utf-8'),
1136 'uploader': video_uploader.decode('utf-8'),
1137 'upload_date': u'NA',
1138 'title': video_title,
1139 'stitle': simple_title,
1140 'ext': video_extension.decode('utf-8'),
1144 except UnavailableVideoError:
1145 self._downloader.trouble(u'ERROR: unable to download video')
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion.

    Extracts videos from dailymotion.<tld>/video/<id>_<title> URLs by
    reading the 'video' flashvar from the page.

    NOTE(review): this excerpt is missing some original source lines
    ('try:' openers, 'if ... is None:' guards, early 'return's); orphaned
    lines below are indented at their apparent original nesting level.
    """

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    # Body of the 'suitable(url)' predicate (its 'def' line is not visible
    # in this excerpt).
        return (re.match(DailymotionIE._VALID_URL, url) is not None)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_initialize(self):
        # No initialization needed (body not visible in this excerpt).

    def _real_extract(self, url):
        """Extract and enqueue the video behind a Dailymotion URL."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # At this point we have a new video
        self._downloader.increment_downloads()
        video_id = mobj.group(1)

        # The URL slug doubles as the simplified title.
        simple_title = mobj.group(2).decode('utf-8')
        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')

        mediaURL = urllib.unquote(mobj.group(1))

        # if needed add http://www.dailymotion.com/ if relative URL

        video_url = mediaURL

        # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
        mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')

        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)

        mobj = re.search(r'(?im)<div class="dmco_html owner">.*?<a class="name" href="/.+?">(.+?)</a>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')

        video_uploader = mobj.group(1)

        # Process video information
        self._downloader.process_info({
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date': u'NA',
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension.decode('utf-8'),
        except UnavailableVideoError:
            self._downloader.trouble(u'ERROR: unable to download video')
class GoogleIE(InfoExtractor):
    """Information extractor for video.google.com.

    Prefers the mp4 'download_url' embedded in the watch page and falls
    back to the escaped flash 'videoUrl' when it is absent.

    NOTE(review): this excerpt is missing some original source lines
    ('try:' openers, 'if ... is None:' guards, early 'return's); orphaned
    lines below are indented at their apparent original nesting level.
    """

    _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    # Body of the 'suitable(url)' predicate (its 'def' line is not visible
    # in this excerpt).
        return (re.match(GoogleIE._VALID_URL, url) is not None)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

    def _real_initialize(self):
        # No initialization needed (body not visible in this excerpt).

    def _real_extract(self, url):
        """Extract and enqueue the video behind a Google Video URL."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        # At this point we have a new video
        self._downloader.increment_downloads()
        video_id = mobj.group(1)

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r"download_url:'([^']+)'", webpage)
        # Fallback branch (taken when no mp4 download_url is present):
        # use the \xNN-escaped flash URL instead.
            video_extension = 'flv'
            mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
                self._downloader.trouble(u'ERROR: unable to extract media URL')

            mediaURL = urllib.unquote(mobj.group(1))
            # Decode the literal backslash escapes ('\x3d' -> '=', '\x26' -> '&').
            mediaURL = mediaURL.replace('\\x3d', '\x3d')
            mediaURL = mediaURL.replace('\\x26', '\x26')

        video_url = mediaURL

        mobj = re.search(r'<title>(.*)</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')

        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)
        # Replace every run of non-filename-safe characters with '_'.
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

        # Extract video description
        mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video description')

        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video thumbnail
        if self._downloader.params.get('forcethumbnail', False):
            # The thumbnail only appears on the search results page, so a
            # second request is needed when the user asked for it.
            request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
                webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

            mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
                self._downloader.trouble(u'ERROR: unable to extract video thumbnail')

            video_thumbnail = mobj.group(1)
        else: # we need something to pass to process_info
            video_thumbnail = ''

        # Process video information
        # NOTE(review): the 'uploader' entry of this dict is among the
        # lines missing from this excerpt.
        self._downloader.process_info({
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'upload_date': u'NA',
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension.decode('utf-8'),
        except UnavailableVideoError:
            self._downloader.trouble(u'ERROR: unable to download video')
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com.

    Extracts .flv videos referenced via the 'current=' query parameter,
    reading the media URL from the page's video_src link tag.

    NOTE(review): this excerpt is missing some original source lines
    ('try:' openers, 'if ... is None:' guards, early 'return's); orphaned
    lines below are indented at their apparent original nesting level.
    """

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    # Body of the 'suitable(url)' predicate (its 'def' line is not visible
    # in this excerpt).
        return (re.match(PhotobucketIE._VALID_URL, url) is not None)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_initialize(self):
        # No initialization needed (body not visible in this excerpt).

    def _real_extract(self, url):
        """Extract and enqueue the video behind a Photobucket URL."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        # At this point we have a new video
        self._downloader.increment_downloads()
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')

        mediaURL = urllib.unquote(mobj.group(1))

        video_url = mediaURL

        # Title and uploader come from the same <title> pattern.
        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')

        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)
        # Replace every run of non-filename-safe characters with '_'.
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

        video_uploader = mobj.group(2).decode('utf-8')

        # Process video information
        self._downloader.process_info({
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date': u'NA',
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension.decode('utf-8'),
        except UnavailableVideoError:
            self._downloader.trouble(u'ERROR: unable to download video')
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com.

    Non-/watch/ URLs are first rewritten to the canonical English-language
    /watch/<page>/<vid> form by scraping the page for the "id" and "vid"
    fields, then re-extracted recursively.

    NOTE(review): this excerpt is missing some original source lines
    ('try:' openers, 'if ... is None:' guards, early 'return's); orphaned
    lines below are indented at their apparent original nesting level.
    """

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    # Body of the 'suitable(url)' predicate (its 'def' line is not visible
    # in this excerpt).
        return (re.match(YahooIE._VALID_URL, url) is not None)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_initialize(self):
        # No initialization needed (body not visible in this excerpt).

    def _real_extract(self, url, new_video=True):
        """Extract a Yahoo! video; recurses once after URL canonicalization."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        # At this point we have a new video
        self._downloader.increment_downloads()
        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = urllib2.Request(url)
                webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
                self._downloader.trouble(u'ERROR: Unable to extract id field')

            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
                self._downloader.trouble(u'ERROR: Unable to extract vid field')

            yahoo_vid = mobj.group(1)

            # Recurse once with the canonical URL; new_video=False marks
            # the second pass.
            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video title')

        video_title = mobj.group(1).decode('utf-8')
        # Replace every run of non-filename-safe characters with '_'.
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video uploader')

        # NOTE(review): group(1) captures the '(people|profile)' alternation,
        # i.e. the literal string 'people' or 'profile'; the uploader name
        # is in group(2). Looks like a bug — confirm before fixing.
        video_uploader = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')

        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video description')

        video_description = mobj.group(1).decode('utf-8')
        if not video_description: video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video height')

        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video width')

        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
        yv_bitrate = '700' # according to Wikipedia this is hard-coded
        request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
            self._downloader.trouble(u'ERROR: Unable to extract media URL')

        video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        # Resolve HTML entities (&amp; etc.) embedded in the playlist URL.
        video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)

        # Process video information
        # NOTE(review): 'thumbnail' and 'description' appear TWICE below;
        # the later (un-decoded) entries silently win. Likely a copy/paste
        # slip — confirm and deduplicate. (The 'url' entry is among the
        # lines missing from this excerpt.)
        self._downloader.process_info({
            'id': video_id.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date': u'NA',
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension.decode('utf-8'),
            'thumbnail': video_thumbnail.decode('utf-8'),
            'description': video_description,
            'thumbnail': video_thumbnail,
            'description': video_description,
        except UnavailableVideoError:
            self._downloader.trouble(u'ERROR: unable to download video')
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor.

    Scrapes an arbitrary page for a JW-Player-style 'file='/'source='
    media URL. Best-effort only, hence the WARNING it prints.

    NOTE(review): this excerpt is missing some original source lines
    ('try:' openers, 'if ... is None:' guards, early 'return's, the
    'suitable' predicate); orphaned lines below are indented at their
    apparent original nesting level.
    """

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        # Warn first: reaching this extractor means no specific IE matched.
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def _real_initialize(self):
        # No initialization needed (body not visible in this excerpt).

    def _real_extract(self, url):
        """Best-effort extraction of a media file linked from any page."""
        # At this point we have a new video
        self._downloader.increment_downloads()

        # Provisional id: last path component of the URL (refined below).
        video_id = url.split('/')[-1]
        request = urllib2.Request(url)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        except ValueError, err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
                self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_url = urllib.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')

        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)
        # Replace every run of non-filename-safe characters with '_'.
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
            # NOTE(review): misleading message — this guard is for the
            # uploader (domain) extraction, not the title.
            self._downloader.trouble(u'ERROR: unable to extract title')

        video_uploader = mobj.group(1).decode('utf-8')

        # Process video information
        self._downloader.process_info({
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date': u'NA',
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension.decode('utf-8'),
        except UnavailableVideoError, err:
            self._downloader.trouble(u'ERROR: unable to download video')
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries.

    Handles 'ytsearch[N|all]:<query>' pseudo-URLs: scrapes result pages
    and hands each found video id to the wrapped YouTube extractor.

    NOTE(review): this excerpt is missing some original source lines
    (prefix parsing, loop headers, 'return's, guards); orphaned lines
    below are indented at their apparent original nesting level.
    """
    _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
    _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
    # Hard cap on results per query.
    _max_youtube_results = 1000

    def __init__(self, youtube_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        self._youtube_ie = youtube_ie

    # Body of the 'suitable(url)' predicate (its 'def' line is not visible
    # in this excerpt).
        return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_initialize(self):
        self._youtube_ie.initialize()

    def _real_extract(self, query):
        """Parse the ytsearch prefix and dispatch to _download_n_results."""
        mobj = re.match(self._VALID_QUERY, query)
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        prefix, query = query.split(':')
        query = query.encode('utf-8')
        # No numeric prefix: download a single result.
            self._download_n_results(query, 1)

        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)

        # Numeric prefix: validate and clamp the requested count.
                self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))

            elif n > self._max_youtube_results:
                self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                n = self._max_youtube_results
            self._download_n_results(query, n)

        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        already_seen = set()

        # Page loop (the 'while'/counter initializations are not visible
        # in this excerpt).
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
            request = urllib2.Request(result_url, None, std_headers)
                page = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                # Slice the matched href, take the value after "v=", and
                # drop the trailing quote character.
                video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)

            # No "Next" link: we are on the last result page.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)

            pagenum = pagenum + 1
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries.

    Handles 'gvsearch[N|all]:<query>' pseudo-URLs: scrapes result pages
    and hands each found docid to the wrapped Google Video extractor.

    NOTE(review): this excerpt is missing some original source lines
    (prefix parsing, loop headers, 'return's, guards); orphaned lines
    below are indented at their apparent original nesting level.
    """
    _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
    _MORE_PAGES_INDICATOR = r'<span>Next</span>'
    # Hard cap on results per query.
    _max_google_results = 1000

    def __init__(self, google_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        self._google_ie = google_ie

    # Body of the 'suitable(url)' predicate (its 'def' line is not visible
    # in this excerpt).
        return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_initialize(self):
        self._google_ie.initialize()

    def _real_extract(self, query):
        """Parse the gvsearch prefix and dispatch to _download_n_results."""
        mobj = re.match(self._VALID_QUERY, query)
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        prefix, query = query.split(':')
        query = query.encode('utf-8')
        # No numeric prefix: download a single result.
            self._download_n_results(query, 1)

        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)

        # Numeric prefix: validate and clamp the requested count.
                self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))

            elif n > self._max_google_results:
                self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                n = self._max_google_results
            self._download_n_results(query, n)

        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        already_seen = set()

        # Page loop (the 'while'/counter initializations are not visible
        # in this excerpt).
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
            request = urllib2.Request(result_url, None, std_headers)
                page = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)

            # No "Next" link: we are on the last result page.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)

            pagenum = pagenum + 1
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries.

    Handles 'yvsearch[N|all]:<query>' pseudo-URLs: scrapes result pages
    and hands each found watch path to the wrapped Yahoo! extractor.

    NOTE(review): this excerpt is missing some original source lines
    (prefix parsing, loop headers, 'return's, guards); orphaned lines
    below are indented at their apparent original nesting level.
    """
    _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    # Hard cap on results per query.
    _max_yahoo_results = 1000

    def __init__(self, yahoo_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        self._yahoo_ie = yahoo_ie

    # Body of the 'suitable(url)' predicate (its 'def' line is not visible
    # in this excerpt).
        return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_initialize(self):
        self._yahoo_ie.initialize()

    def _real_extract(self, query):
        """Parse the yvsearch prefix and dispatch to _download_n_results."""
        mobj = re.match(self._VALID_QUERY, query)
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        prefix, query = query.split(':')
        query = query.encode('utf-8')
        # No numeric prefix: download a single result.
            self._download_n_results(query, 1)

        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)

        # Numeric prefix: validate and clamp the requested count.
                self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))

            elif n > self._max_yahoo_results:
                self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                n = self._max_yahoo_results
            self._download_n_results(query, n)

        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        already_seen = set()

        # Page loop (the 'while'/counter initializations are not visible
        # in this excerpt).
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
            request = urllib2.Request(result_url, None, std_headers)
                page = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)

            # No "Next" link: we are on the last result page.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)

            pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists.

    Walks every page of a view_play_list/my_playlists/user URL, collects
    the video ids, applies the playliststart/playlistend window, and hands
    each id to the wrapped YouTube extractor.

    NOTE(review): this excerpt is missing some original source lines
    (loop headers, list initializations, guards, 'break'); orphaned lines
    below are indented at their apparent original nesting level.
    """

    _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists)\?.*?p=|user/.*?/user/)([^&]+).*'
    _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
    _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'

    def __init__(self, youtube_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        self._youtube_ie = youtube_ie

    # Body of the 'suitable(url)' predicate (its 'def' line is not visible
    # in this excerpt).
        return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_initialize(self):
        self._youtube_ie.initialize()

    def _real_extract(self, url):
        """Collect all video ids of a playlist and extract each one."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)

        # Download playlist pages
        playlist_id = mobj.group(1)

        # Page loop (the 'while'/counter/list initializations are not
        # visible in this excerpt).
            self.report_download_page(playlist_id, pagenum)
            request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
                page = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                # ids_in_page de-duplicates within a page while keeping order.
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))
            video_ids.extend(ids_in_page)

            # No "Next" link: last playlist page reached.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:

            pagenum = pagenum + 1

        # playliststart is 1-based in params, 0-based for slicing.
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        # NOTE(review): with the default playlistend = -1 this slice is
        # video_ids[start:-1], which silently drops the final playlist
        # entry. Looks like an off-by-one — confirm intended semantics.
        video_ids = video_ids[playliststart:playlistend]

        for id in video_ids:
            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
class YoutubeUserIE(InfoExtractor):
	"""Information Extractor for YouTube users."""

	# NOTE(review): unescaped "." in "youtube.com" (regex imprecision).
	_VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)'
	# gdata feed listing a user's uploads; %s is the username.
	_TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
	_VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this.

	def __init__(self, youtube_ie, downloader=None):
		# Mutual registration with the YoutubeIE that does per-video extraction.
		InfoExtractor.__init__(self, downloader)
		self._youtube_ie = youtube_ie

	# NOTE(review): the "def suitable(url):" header appears elided from this
	# chunk; only the body line below is visible. Left byte-identical.
		return (re.match(YoutubeUserIE._VALID_URL, url) is not None)

	def report_download_page(self, username):
		"""Report attempt to download user page."""
		self._downloader.to_screen(u'[youtube] user %s: Downloading page ' % (username))

	def _real_initialize(self):
		self._youtube_ie.initialize()

	def _real_extract(self, url):
		# Extract the username from the URL
		mobj = re.match(self._VALID_URL, url)
		# NOTE(review): "if mobj is None:" guard and "return" elided; the
		# trouble() call below is its error branch.
		self._downloader.trouble(u'ERROR: invalid url: %s' % url)

		# Download user page
		username = mobj.group(1)
		# NOTE(review): video_ids initialization and the surrounding "try:"
		# are elided around here.
		self.report_download_page(username)
		request = urllib2.Request(self._TEMPLATE_URL % (username), None, std_headers)
		page = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

		# Extract video identifiers
		for mobj in re.finditer(self._VIDEO_INDICATOR, page):
			# O(n^2) membership test; acceptable for one feed page.
			if mobj.group(1) not in ids_in_page:
				ids_in_page.append(mobj.group(1))
		video_ids.extend(ids_in_page)

		# NOTE(review): unlike the playlist IE there is no pagination here, so
		# only the first gdata page (default 25 entries) is ever seen.
		playliststart = self._downloader.params.get('playliststart', 1) - 1
		playlistend = self._downloader.params.get('playlistend', -1)
		# NOTE(review): same off-by-one as the playlist IE — playlistend == -1
		# drops the last video instead of meaning "to the end".
		video_ids = video_ids[playliststart:playlistend]

		for id in video_ids:
			self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
class DepositFilesIE(InfoExtractor):
	"""Information extractor for depositfiles.com"""

	# "(?#locale)" is an inline regex comment; the "../" before it matches the
	# two-character locale path segment. NOTE(review): dots in
	# "depositfiles.com" are unescaped.
	_VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	# NOTE(review): the "def suitable(url):" header appears elided from this
	# chunk; only the body line below is visible. Left byte-identical.
		return (re.match(DepositFilesIE._VALID_URL, url) is not None)
2096 def report_download_webpage(self, file_id):
2097 """Report webpage download."""
2098 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
	def report_extraction(self, file_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

	def _real_initialize(self):
		# NOTE(review): the method body (presumably a bare "return") is elided
		# from this chunk.

	def _real_extract(self, url):
		# At this point we have a new file
		self._downloader.increment_downloads()

		file_id = url.split('/')[-1]
		# Rebuild url in english locale
		url = 'http://depositfiles.com/en/files/' + file_id

		# Retrieve file webpage with 'Free download' button pressed
		# POSTing gateway_result=1 simulates clicking the free-download button.
		free_download_indication = { 'gateway_result' : '1' }
		request = urllib2.Request(url, urllib.urlencode(free_download_indication), std_headers)
		# NOTE(review): the "try:" opening this block is elided.
		self.report_download_webpage(file_id)
		webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))

		# Search for the real file URL
		mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
		if (mobj is None) or (mobj.group(1) is None):
			# Try to figure out reason of the error.
			mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
			if (mobj is not None) and (mobj.group(1) is not None):
				# Collapse whitespace in the site's own restriction message.
				restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
				self._downloader.trouble(u'ERROR: %s' % restriction_message)
			# NOTE(review): the "else:" branch header and the "return" lines
			# appear to be elided around here.
				self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)

		file_url = mobj.group(1)
		# Extension without the leading dot; may be '' for extensionless URLs.
		file_extension = os.path.splitext(file_url)[1][1:]

		# Search for file title
		mobj = re.search(r'<b title="(.*?)">', webpage)
		# NOTE(review): "if mobj is None:" guard and "return" elided; the
		# trouble() call below is its error branch.
		self._downloader.trouble(u'ERROR: unable to extract title')
		file_title = mobj.group(1).decode('utf-8')

		# Process file information
		self._downloader.process_info({
			'id': file_id.decode('utf-8'),
			'url': file_url.decode('utf-8'),
			'upload_date': u'NA',
			'title': file_title,
			'stitle': file_title,
			'ext': file_extension.decode('utf-8'),
		# NOTE(review): the "})" closing this dict/call, plus the surrounding
		# "try:", are elided from this chunk.
		except UnavailableVideoError, err:
			self._downloader.trouble(u'ERROR: unable to download file')
class PostProcessor(object):
	"""Post Processor class.

	PostProcessor objects can be added to downloaders with their
	add_post_processor() method. When the downloader has finished a
	successful download, it will take its internal chain of PostProcessors
	and start calling the run() method on each one of them, first with
	an initial argument and then with the returned value of the previous
	one.

	The chain will be stopped if one of them ever returns None or the end
	of the chain is reached.

	PostProcessor objects follow a "mutual registration" process similar
	to InfoExtractor objects.
	"""

	def __init__(self, downloader=None):
		# _downloader: the FileDownloader this PP reports to; may be set later.
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Sets the downloader for this PP."""
		self._downloader = downloader

	def run(self, information):
		"""Run the PostProcessor.

		The "information" argument is a dictionary like the ones
		composed by InfoExtractors. The only difference is that this
		one has an extra field called "filepath" pointing to the
		downloaded file.

		When this method returns None, the postprocessing chain is
		stopped. However, this method may return an information
		dictionary that will be passed to the next postprocessing
		object in the chain. It can be the one it received after
		changing some fields.

		In addition, this method may raise a PostProcessingError
		exception that will be taken into account by the downloader
		it was called from.
		"""
		return information # by default, do nothing
### MAIN PROGRAM ###
if __name__ == '__main__':
	# NOTE(review): the "try:" that pairs with the DownloadError /
	# SameFileError / KeyboardInterrupt handlers at the bottom of the file
	# appears to be elided from this chunk.
	# Modules needed only when running the main program
	# (the import lines themselves appear elided from this chunk)

	# Function to update the program file with the latest version from bitbucket.org
	def update_self(downloader, filename):
		# Note: downloader only used for options
		if not os.access (filename, os.W_OK):
			sys.exit('ERROR: no write permissions on %s' % filename)

		downloader.to_screen('Updating to latest stable version...')
		# NOTE(review): despite the comment above, the URLs point at github.com.
		latest_url = 'http://github.com/rg3/youtube-dl/raw/master/LATEST_VERSION'
		latest_version = urllib.urlopen(latest_url).read().strip()
		prog_url = 'http://github.com/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
		newcontent = urllib.urlopen(prog_url).read()
		# NOTE(review): no error handling around the urlopen() calls, and the
		# matching stream.close() appears elided below; a failed fetch after
		# open(filename, 'w') would leave the script file truncated.
		stream = open(filename, 'w')
		stream.write(newcontent)
		downloader.to_screen('Updated to version %s' % latest_version)
	# Parse command line
	# conflict_handler='resolve' lets the -h/-v options below replace
	# optparse's builtin help/version handlers.
	parser = optparse.OptionParser(
		usage='Usage: %prog [options] url...',
		version='2010.11.19',
		conflict_handler='resolve',
	# NOTE(review): the ")" closing the OptionParser(...) call is elided here.

	parser.add_option('-h', '--help',
			action='help', help='print this help text and exit')
	parser.add_option('-v', '--version',
			action='version', help='print program version and exit')
	parser.add_option('-U', '--update',
			action='store_true', dest='update_self', help='update this program to latest stable version')
	parser.add_option('-i', '--ignore-errors',
			action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
	parser.add_option('-r', '--rate-limit',
			dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
	# NOTE(review): the default is the int 10 but user-supplied values arrive
	# as strings; both are normalized with long() further down.
	parser.add_option('-R', '--retries',
			dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
	parser.add_option('--playlist-start',
			dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
	parser.add_option('--playlist-end',
			dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)

	authentication = optparse.OptionGroup(parser, 'Authentication Options')
	authentication.add_option('-u', '--username',
			dest='username', metavar='USERNAME', help='account username')
	authentication.add_option('-p', '--password',
			dest='password', metavar='PASSWORD', help='account password')
	authentication.add_option('-n', '--netrc',
			action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
	parser.add_option_group(authentication)

	video_format = optparse.OptionGroup(parser, 'Video Format Options')
	video_format.add_option('-f', '--format',
			action='store', dest='format', metavar='FORMAT', help='video format code')
	video_format.add_option('-m', '--mobile-version',
			action='store_const', dest='format', help='alias for -f 17', const='17')
	# const='-1' is the sentinel meaning "all formats"; checked when building
	# the output template below.
	video_format.add_option('--all-formats',
			action='store_const', dest='format', help='download all available video formats', const='-1')
	video_format.add_option('--max-quality',
			action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
	video_format.add_option('-b', '--best-quality',
			action='store_true', dest='bestquality', help='download the best video quality (DEPRECATED)')
	parser.add_option_group(video_format)

	verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
	verbosity.add_option('-q', '--quiet',
			action='store_true', dest='quiet', help='activates quiet mode', default=False)
	verbosity.add_option('-s', '--simulate',
			action='store_true', dest='simulate', help='do not download video', default=False)
	verbosity.add_option('-g', '--get-url',
			action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
	verbosity.add_option('-e', '--get-title',
			action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
	verbosity.add_option('--get-thumbnail',
			action='store_true', dest='getthumbnail', help='simulate, quiet but print thumbnail URL', default=False)
	verbosity.add_option('--get-description',
			action='store_true', dest='getdescription', help='simulate, quiet but print video description', default=False)
	verbosity.add_option('--no-progress',
			action='store_true', dest='noprogress', help='do not print progress bar', default=False)
	parser.add_option_group(verbosity)

	filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
	filesystem.add_option('-t', '--title',
			action='store_true', dest='usetitle', help='use title in file name', default=False)
	filesystem.add_option('-l', '--literal',
			action='store_true', dest='useliteral', help='use literal title in file name', default=False)
	filesystem.add_option('-A', '--auto-number',
			action='store_true', dest='autonumber', help='number downloaded files starting from 00000', default=False)
	filesystem.add_option('-o', '--output',
			dest='outtmpl', metavar='TEMPLATE', help='output filename template')
	filesystem.add_option('-a', '--batch-file',
			dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
	filesystem.add_option('-w', '--no-overwrites',
			action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
	filesystem.add_option('-c', '--continue',
			action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
	filesystem.add_option('--cookies',
			dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
	parser.add_option_group(filesystem)

	(opts, args) = parser.parse_args()

	# Open appropriate CookieJar
	if opts.cookiefile is None:
		# No --cookies given: in-memory jar only, nothing persisted.
		jar = cookielib.CookieJar()
	# NOTE(review): the "else:" / "try:" lines and the jar.load() call for an
	# existing cookie file appear to be elided around here.
		jar = cookielib.MozillaCookieJar(opts.cookiefile)
		if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
		except (IOError, OSError), err:
			sys.exit(u'ERROR: unable to open cookie file')

	# General configuration
	cookie_processor = urllib2.HTTPCookieProcessor(jar)
	# NOTE(review): the second install_opener() call replaces the first, so the
	# explicit ProxyHandler opener is discarded immediately. build_opener()
	# adds a ProxyHandler by default, which is why proxies still work; the
	# first call is effectively dead code.
	urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
	urllib2.install_opener(urllib2.build_opener(cookie_processor))
	socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

	# Batch file verification
	# NOTE(review): the "batchurls = []" initialization appears elided above.
	if opts.batchfile is not None:
		if opts.batchfile == '-':
		# NOTE(review): "batchfd = sys.stdin", the "else:" and the "try:"
		# scaffolding appear to be elided around here.
			batchfd = open(opts.batchfile, 'r')
			batchurls = batchfd.readlines()
			batchurls = [x.strip() for x in batchurls]
			# Skip blank lines and lines starting with '#', '/' or ';' (comments).
			batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
		sys.exit(u'ERROR: batch file could not be read')
	all_urls = batchurls + args

	# Conflicting, missing and erroneous options
	if opts.bestquality:
		print >>sys.stderr, u'\nWARNING: -b/--best-quality IS DEPRECATED AS IT IS THE DEFAULT BEHAVIOR NOW\n'
	if opts.usenetrc and (opts.username is not None or opts.password is not None):
		parser.error(u'using .netrc conflicts with giving username/password')
	if opts.password is not None and opts.username is None:
		parser.error(u'account username missing')
	if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
		parser.error(u'using output template conflicts with using title, literal title or auto number')
	if opts.usetitle and opts.useliteral:
		parser.error(u'using title conflicts with using literal title')
	if opts.username is not None and opts.password is None:
		# Prompt interactively instead of requiring the password on the command line.
		opts.password = getpass.getpass(u'Type account password and press return:')
	if opts.ratelimit is not None:
		numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
		if numeric_limit is None:
			parser.error(u'invalid rate limit specified')
		opts.ratelimit = numeric_limit
	if opts.retries is not None:
		# NOTE(review): the "try:" opening this block is elided.
		opts.retries = long(opts.retries)
		except (TypeError, ValueError), err:
			parser.error(u'invalid retry count specified')
	# NOTE(review): another "try:" is elided here.
	opts.playliststart = long(opts.playliststart)
	if opts.playliststart <= 0:
	# NOTE(review): the parser.error() body of this branch is elided.
	except (TypeError, ValueError), err:
		parser.error(u'invalid playlist start number specified')
	# NOTE(review): another "try:" is elided here.
	opts.playlistend = long(opts.playlistend)
	# -1 is the "until the end" sentinel; any other value must be positive and
	# not before the start index.
	if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
	except (TypeError, ValueError), err:
		parser.error(u'invalid playlist end number specified')

	# Information extractors
	# Construction order only wires the search/playlist/user IEs to the
	# concrete IEs they drive; matching priority is set at registration below.
	youtube_ie = YoutubeIE()
	metacafe_ie = MetacafeIE(youtube_ie)
	dailymotion_ie = DailymotionIE()
	youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
	youtube_user_ie = YoutubeUserIE(youtube_ie)
	youtube_search_ie = YoutubeSearchIE(youtube_ie)
	google_ie = GoogleIE()
	google_search_ie = GoogleSearchIE(google_ie)
	photobucket_ie = PhotobucketIE()
	yahoo_ie = YahooIE()
	yahoo_search_ie = YahooSearchIE(yahoo_ie)
	deposit_files_ie = DepositFilesIE()
	generic_ie = GenericIE()

	# File downloader
	fd = FileDownloader({
		'usenetrc': opts.usenetrc,
		'username': opts.username,
		'password': opts.password,
		# Any of the "print X and stop" switches implies quiet mode.
		'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
		'forceurl': opts.geturl,
		'forcetitle': opts.gettitle,
		'forcethumbnail': opts.getthumbnail,
		'forcedescription': opts.getdescription,
		# Ditto: the "print X" switches also imply simulation (no download).
		'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
		'format': opts.format,
		'format_limit': opts.format_limit,
		# First truthy expression wins: an explicit -o template, then a
		# template derived from --all-formats/title/literal/auto-number
		# combinations, finally the plain id-based default.
		'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
			or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
			or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
			or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
			or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
			or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
			or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
			or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
			or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
			or u'%(id)s.%(ext)s'),
		'ignoreerrors': opts.ignoreerrors,
		'ratelimit': opts.ratelimit,
		'nooverwrites': opts.nooverwrites,
		'retries': opts.retries,
		'continuedl': opts.continue_dl,
		'noprogress': opts.noprogress,
		'playliststart': opts.playliststart,
		'playlistend': opts.playlistend,
		# Streaming the download to stdout means log output must use stderr.
		'logtostderr': opts.outtmpl == '-',
	# NOTE(review): the "})" closing the FileDownloader(...) call is elided.

	# Registration order defines matching priority: search/playlist/user
	# extractors are tried before the plain site extractors.
	fd.add_info_extractor(youtube_search_ie)
	fd.add_info_extractor(youtube_pl_ie)
	fd.add_info_extractor(youtube_user_ie)
	fd.add_info_extractor(metacafe_ie)
	fd.add_info_extractor(dailymotion_ie)
	fd.add_info_extractor(youtube_ie)
	fd.add_info_extractor(google_ie)
	fd.add_info_extractor(google_search_ie)
	fd.add_info_extractor(photobucket_ie)
	fd.add_info_extractor(yahoo_ie)
	fd.add_info_extractor(yahoo_search_ie)
	fd.add_info_extractor(deposit_files_ie)

	# This must come last since it's the
	# fallback if none of the others work
	fd.add_info_extractor(generic_ie)

	# Update version
	if opts.update_self:
		update_self(fd, sys.argv[0])

	# Maybe do nothing
	if len(all_urls) < 1:
		if not opts.update_self:
			parser.error(u'you must provide at least one URL')
	# NOTE(review): the "else: sys.exit()" for an update-only invocation
	# appears to be elided here.

	retcode = fd.download(all_urls)

	# Dump cookie jar if requested
	if opts.cookiefile is not None:
	# NOTE(review): the "try:" and the jar.save() call are elided here.
		except (IOError, OSError), err:
			sys.exit(u'ERROR: unable to save cookie jar')

	# NOTE(review): "sys.exit(retcode)" and the matching top-level "try:" are
	# elided; the handlers below close that try block.
	except DownloadError:
	# NOTE(review): this handler's body (presumably "sys.exit(retcode)" or
	# similar) is elided.
	except SameFileError:
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except KeyboardInterrupt:
		sys.exit(u'\nERROR: Interrupted by user')