2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # Author: Benjamin Johnson
6 # License: Public domain code
25 # parse_qs was moved from the cgi module to the urlparse module recently.
27 from urlparse import parse_qs
29 from cgi import parse_qs
# NOTE(review): the `std_headers = {` opening line and the closing brace are
# missing from this copy; these entries are the default HTTP headers sent
# with every request built from std_headers elsewhere in the file.
'User-Agent': 'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.12) Gecko/20101028 Firefox/3.6.12',
'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'en-us,en;q=0.5',
# Unicode set of characters kept verbatim when building "simplified" titles
# (ASCII letters and digits; everything else becomes '_', see YoutubeIE).
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    def yield_preferredencoding():
        # NOTE(review): several lines are missing from this copy — presumably
        # a try/except fallback around the lookup and a loop yielding `pref`.
        pref = locale.getpreferredencoding()
    # Pull the first (and cached) value from the inner generator.
    return yield_preferredencoding().next()
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a Unicode character.

    This function receives a match object and is intended to be used with
    the re.sub() function.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in htmlentitydefs.name2codepoint:
        return unichr(htmlentitydefs.name2codepoint[entity])

    # Numeric character reference (decimal, or hex when prefixed with 'x').
    mobj = re.match(ur'(?u)#(x?\d+)', entity)
    # NOTE(review): lines are missing from this copy — presumably the
    # `mobj is not None` guard and the assignments choosing `base` (16 for
    # the 'x' form, 10 otherwise); `base` is otherwise undefined below.
    numstr = mobj.group(1)
    if numstr.startswith(u'x'):
        # Turn 'x1F' into '0x1F' so long(numstr, 16) accepts it.
        numstr = u'0%s' % numstr
    return unichr(long(numstr, base))

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
82 def sanitize_title(utitle):
83 """Sanitizes a video title so it could be used as part of a filename."""
84 utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
85 return utitle.replace(unicode(os.sep), u'%')
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    # NOTE(review): lines are missing from this copy — presumably an
    # enclosing `try:` and a check for the special '-' (stdout) filename
    # guarding the win32 branch below.
    if sys.platform == 'win32':
        # Put stdout into binary mode so video bytes are not mangled.
        msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
    return (sys.stdout, filename)
    stream = open(filename, open_mode)
    return (stream, filename)
    except (IOError, OSError), err:
        # In case of error, try to remove win32 forbidden chars
        filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)

        # An exception here should be caught in the caller
        stream = open(filename, open_mode)
        return (stream, filename)
class DownloadError(Exception):
    """Download Error exception.

    Thrown by FileDownloader objects that are not configured to continue
    on errors; carries the appropriate error message.
    """
    pass
class SameFileError(Exception):
    """Same File exception.

    Thrown by FileDownloader objects when they detect that multiple files
    would have to be downloaded to the same file on disk.
    """
    pass
class PostProcessingError(Exception):
    """Post Processing exception.

    Raised by a PostProcessor's .run() method to indicate an error in the
    postprocessing task.
    """
    pass
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    Thrown when a video is requested in a format that is not available for
    that video.
    """
    pass
class ContentTooShortError(Exception):
    """Content Too Short exception.

    Raised by FileDownloader objects when a downloaded file is smaller
    than the size announced by the server, indicating the connection was
    probably interrupted.
    """

    def __init__(self, downloaded, expected):
        # Both values are byte counts describing the truncated transfer.
        self.expected = expected
        self.downloaded = downloaded
class FileDownloader(object):
    """File Downloader class.

    File downloader objects are the ones responsible of downloading the
    actual video file and writing it to disk if the user has requested
    it, among some other tasks. In most cases there should be one per
    program. As, given a video URL, the downloader doesn't know how to
    extract all the needed information, task that InfoExtractors do, it
    has to pass the URL to one of them.

    For this, file downloader objects have a method that allows
    InfoExtractors to be registered in a given order. When it is passed
    a URL, the file downloader handles it to the first InfoExtractor it
    finds that reports being able to handle it. The InfoExtractor extracts
    all the information about the video or videos the URL refers to, and
    asks the FileDownloader to process the video information, possibly
    downloading the video.

    File downloaders accept a lot of parameters. In order not to saturate
    the object constructor with arguments, it receives a dictionary of
    options instead. These options are available through the params
    attribute for the InfoExtractors to use. The FileDownloader also
    registers itself as the downloader in charge for the InfoExtractors
    that are added to it, so this is a "mutual registration".

    Available options:

    username:         Username for authentication purposes.
    password:         Password for authentication purposes.
    usenetrc:         Use netrc for authentication instead.
    quiet:            Do not print messages to stdout.
    forceurl:         Force printing final URL.
    forcetitle:       Force printing title.
    forcethumbnail:   Force printing thumbnail URL.
    forcedescription: Force printing description.
    simulate:         Do not download the video files.
    format:           Video format code.
    format_limit:     Highest quality format to try.
    outtmpl:          Template for output names.
    ignoreerrors:     Do not stop on download errors.
    ratelimit:        Download speed limit, in bytes/sec.
    nooverwrites:     Prevent overwriting files.
    retries:          Number of times to retry for HTTP error 5xx
    continuedl:       Try to continue downloads if possible.
    noprogress:       Do not print the progress bar.
    playliststart:    Playlist item to start at.
    playlistend:      Playlist item to end at.
    logtostderr:      Log messages to stderr instead of stdout.
    """

    _download_retcode = None   # process exit code built up by download()/trouble()
    _num_downloads = None      # ordinal counter feeding the %(autonumber)s template
    def __init__(self, params):
        """Create a FileDownloader object with the given options."""
        # NOTE(review): lines are missing from this copy — presumably the
        # ones storing `params` on self and initializing the InfoExtractor
        # and PostProcessor lists.
        self._download_retcode = 0
        self._num_downloads = 0
        # Index with the boolean: False -> stdout, True -> stderr.
        self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
    def pmkdir(filename):
        """Create directory components in filename. Similar to Unix "mkdir -p"."""
        components = filename.split(os.sep)
        # Build every ancestor path, shortest first (excludes the leaf name).
        aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
        aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
        for dir in aggregate:
            if not os.path.exists(dir):
                # NOTE(review): the line creating the directory is missing
                # from this copy.
238 def temp_name(filename):
239 """Returns a temporary filename for the given filename."""
240 return filename + '.part'
    def format_bytes(bytes):
        """Format a byte count as a short human-readable string, e.g. '1.21M'."""
        if type(bytes) is str:
            # NOTE(review): lines are missing from this copy — presumably the
            # string-to-number conversion and the None/zero special cases.
        # Pick the largest power of 1024 that fits, and its suffix letter.
        exponent = long(math.log(bytes, 1024.0))
        suffix = 'bkMGTPEZY'[exponent]
        converted = float(bytes) / float(1024**exponent)
        return '%.2f%s' % (converted, suffix)
    def calc_percent(byte_counter, data_len):
        """Return download progress as a fixed-width percentage string."""
        # NOTE(review): a guard for data_len being None appears to be missing
        # from this copy; dividing below would fail for unknown lengths.
        return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
    def calc_eta(start, now, total, current):
        """Estimate remaining download time, formatted as 'MM:SS'."""
        # NOTE(review): lines are missing from this copy — presumably the
        # computation of `dif` (now - start) and guards for unknown totals.
        if current == 0 or dif < 0.001: # One millisecond
        rate = float(current) / dif
        eta = long((float(total) - float(current)) / rate)
        (eta_mins, eta_secs) = divmod(eta, 60)
        return '%02d:%02d' % (eta_mins, eta_secs)
    def calc_speed(start, now, bytes):
        """Return the average download speed as a fixed-width string."""
        # NOTE(review): the line computing `dif` (now - start) is missing
        # from this copy.
        if bytes == 0 or dif < 0.001: # One millisecond
            return '%10s' % '---b/s'
        return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
    def best_block_size(elapsed_time, bytes):
        """Choose the next read size from how fast the last block arrived."""
        new_min = max(bytes / 2.0, 1.0)
        new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
        if elapsed_time < 0.001:
            # NOTE(review): the early return for near-instant blocks is
            # missing from this copy.
        rate = bytes / elapsed_time
        # NOTE(review): the clamping of `rate` between new_min/new_max and
        # the final return are missing from this copy.
    def parse_bytes(bytestr):
        """Parse a string indicating a byte quantity into a long integer."""
        matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
        # NOTE(review): the `matchobj is None` guard appears to be missing
        # from this copy.
        number = float(matchobj.group(1))
        # An absent suffix gives group(2) == '', and ''.index('') is 0,
        # so the multiplier becomes 1024**0 == 1.
        multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
        return long(round(number * multiplier))
    def add_info_extractor(self, ie):
        """Add an InfoExtractor object to the end of the list."""
        # NOTE(review): the line appending `ie` to the internal list is
        # missing from this copy.
        # Mutual registration: the IE learns who its downloader is.
        ie.set_downloader(self)
    def add_post_processor(self, pp):
        """Add a PostProcessor object to the end of the chain."""
        # NOTE(review): the line appending `pp` to the internal chain is
        # missing from this copy.
        pp.set_downloader(self)
    def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
        """Print message to stdout if not in quiet mode."""
        # NOTE(review): the enclosing `try:` line is missing from this copy.
        if not self.params.get('quiet', False):
            # Suppress the trailing newline when skip_eol is requested.
            terminator = [u'\n', u''][skip_eol]
            print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
            self._screen_file.flush()
        except (UnicodeEncodeError), err:
            if not ignore_encoding_errors:
                # NOTE(review): the re-raise for encoding failures is missing
                # from this copy.
327 def to_stderr(self, message):
328 """Print message to stderr."""
329 print >>sys.stderr, message.encode(preferredencoding())
331 def fixed_template(self):
332 """Checks if the output template is fixed."""
333 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
    def trouble(self, message=None):
        """Determine action to take when a download problem appears.

        Depending on if the downloader has been configured to ignore
        download errors or not, this method may throw an exception or
        not when errors are found, after printing the message.
        """
        if message is not None:
            self.to_stderr(message)
        if not self.params.get('ignoreerrors', False):
            raise DownloadError(message)
        # Errors are being ignored: remember the failure for the exit code.
        self._download_retcode = 1
    def slow_down(self, start_time, byte_counter):
        """Sleep if the download speed is over the rate limit."""
        rate_limit = self.params.get('ratelimit', None)
        if rate_limit is None or byte_counter == 0:
            # NOTE(review): lines are missing from this copy — presumably an
            # early return here plus `now = time.time()` and an elapsed-time
            # guard before the division below.
        elapsed = now - start_time
        speed = float(byte_counter) / elapsed
        if speed > rate_limit:
            # Sleep just long enough to bring the average back under the limit.
            time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
    def try_rename(self, old_filename, new_filename):
        """Rename the temporary file to its final name, reporting failures."""
        # NOTE(review): the `try:` line is missing from this copy.
        os.rename(old_filename, new_filename)
        except (IOError, OSError), err:
            self.trouble(u'ERROR: unable to rename file')
367 def report_destination(self, filename):
368 """Report destination filename."""
369 self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
    def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
        """Report download progress."""
        if self.params.get('noprogress', False):
            # NOTE(review): the early `return` is missing from this copy.
        # '\r' + skip_eol redraws the same terminal line each update.
        self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
                (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
378 def report_resuming_byte(self, resume_len):
379 """Report attempt to resume at given byte."""
380 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
382 def report_retry(self, count, retries):
383 """Report retry in case of HTTP error 5xx"""
384 self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
    def report_file_already_downloaded(self, file_name):
        """Report file has already been fully downloaded."""
        # NOTE(review): the `try:` line is missing from this copy.
        self.to_screen(u'[download] %s has already been downloaded' % file_name)
        except (UnicodeEncodeError), err:
            # Fall back to a message without the (unencodable) filename.
            self.to_screen(u'[download] The file has already been downloaded')
393 def report_unable_to_resume(self):
394 """Report it was impossible to resume download."""
395 self.to_screen(u'[download] Unable to resume')
    def report_finish(self):
        """Report download finished."""
        if self.params.get('noprogress', False):
            # With the progress bar disabled, print a plain completion line.
            self.to_screen(u'[download] Download completed')
        # NOTE(review): an `else:` branch (presumably terminating the
        # progress line) appears to be missing from this copy.
404 def increment_downloads(self):
405 """Increment the ordinal that assigns a number to each file."""
406 self._num_downloads += 1
    def process_info(self, info_dict):
        """Process a single dictionary returned by an InfoExtractor."""
        # Do nothing else if in simulate mode
        if self.params.get('simulate', False):
            # Forced printings run even in simulate mode.
            if self.params.get('forcetitle', False):
                print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
            if self.params.get('forceurl', False):
                print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
            if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
                print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
            if self.params.get('forcedescription', False) and 'description' in info_dict:
                print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
            # NOTE(review): the `return` ending simulate mode is missing
            # from this copy.

        # Build the output filename from the user template.
        # NOTE(review): a `try:` line is missing before this block.
        template_dict = dict(info_dict)
        template_dict['epoch'] = unicode(long(time.time()))
        template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
        filename = self.params['outtmpl'] % template_dict
        except (ValueError, KeyError), err:
            self.trouble(u'ERROR: invalid system charset or erroneous output template')
        if self.params.get('nooverwrites', False) and os.path.exists(filename):
            self.to_stderr(u'WARNING: file exists and will be skipped')
            # NOTE(review): a `return` presumably follows here.

        # NOTE(review): a `try:` line is missing here.
        self.pmkdir(filename)
        except (OSError, IOError), err:
            self.trouble(u'ERROR: unable to create directories: %s' % str(err))

        # NOTE(review): a `try:` line is missing here.
        success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
        except (OSError, IOError), err:
            raise UnavailableVideoError
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self.trouble(u'ERROR: unable to download video data: %s' % str(err))
            # NOTE(review): a `return` appears to be missing here.
        except (ContentTooShortError, ), err:
            self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))

        # NOTE(review): a success check and a `try:` line are missing here.
        self.post_process(filename, info_dict)
        except (PostProcessingError), err:
            self.trouble(u'ERROR: postprocessing: %s' % str(err))
    def download(self, url_list):
        """Download a given list of URLs."""
        if len(url_list) > 1 and self.fixed_template():
            # A fixed output name cannot receive more than one video.
            raise SameFileError(self.params['outtmpl'])

        # NOTE(review): the loops over url_list and over the registered
        # InfoExtractors are missing from this copy; only fragments of
        # their bodies remain below.
        suitable_found = False
        # Go to next InfoExtractor if not suitable
        if not ie.suitable(url):
            # NOTE(review): the `continue` is missing from this copy.
        # Suitable InfoExtractor found
        suitable_found = True

        # Extract information from URL and process it

        # Suitable InfoExtractor had been found; go to next URL
        if not suitable_found:
            self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)

        return self._download_retcode
    def post_process(self, filename, ie_info):
        """Run the postprocessing chain on the given file."""
        # NOTE(review): lines are missing from this copy — presumably the
        # copy of ie_info into `info` and the loop over the registered
        # postprocessors.
        info['filepath'] = filename
    def _download_with_rtmpdump(self, filename, url, player_url):
        """Delegate an rtmp:// download to the external rtmpdump tool."""
        self.report_destination(filename)
        tmpfilename = self.temp_name(filename)

        # Check for rtmpdump first
        # NOTE(review): the `try:` line is missing from this copy.
        subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
        except (OSError, IOError):
            self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')

        # Download using rtmpdump. rtmpdump returns exit code 2 when
        # the connection was interrupted and resuming appears to be
        # possible. This is part of rtmpdump's normal usage, AFAIK.
        basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
        retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
        while retval == 2 or retval == 1:
            prevsize = os.path.getsize(tmpfilename)
            self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
            time.sleep(5.0) # This seems to be needed
            retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
            cursize = os.path.getsize(tmpfilename)
            if prevsize == cursize and retval == 1:
                # NOTE(review): the loop exit for a stalled resume is missing
                # from this copy.
        # NOTE(review): the success check on retval is missing from this copy.
        self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
        self.try_rename(tmpfilename, filename)
        # NOTE(review): the `return True` and the failure `else:` header are
        # missing from this copy.
        self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
    def _do_download(self, filename, url, player_url):
        """Download `url` to `filename` over HTTP, with resume and retries."""
        # Check file already present
        if self.params.get('continuedl', False) and os.path.isfile(filename):
            self.report_file_already_downloaded(filename)
            # NOTE(review): an early `return True` is missing from this copy.

        # Attempt to download using rtmpdump
        if url.startswith('rtmp'):
            return self._download_with_rtmpdump(filename, url, player_url)

        tmpfilename = self.temp_name(filename)

        # basic_request stays range-free so the full length can be re-probed.
        basic_request = urllib2.Request(url, None, std_headers)
        request = urllib2.Request(url, None, std_headers)

        # Establish possible resume length
        if os.path.isfile(tmpfilename):
            resume_len = os.path.getsize(tmpfilename)
        # NOTE(review): the else-branch setting resume_len to 0 is missing
        # from this copy.

        # Request parameters in case of being able to resume
        if self.params.get('continuedl', False) and resume_len != 0:
            self.report_resuming_byte(resume_len)
            request.add_header('Range','bytes=%d-' % resume_len)

        # NOTE(review): initialization of `count` is missing from this copy.
        retries = self.params.get('retries', 0)
        while count <= retries:
            # Establish connection
            # NOTE(review): the `try:` line is missing from this copy.
            data = urllib2.urlopen(request)
            # NOTE(review): the success `break` is missing from this copy.
            except (urllib2.HTTPError, ), err:
                if (err.code < 500 or err.code >= 600) and err.code != 416:
                    # Unexpected HTTP error
                    # NOTE(review): the `raise` is missing from this copy.
                elif err.code == 416:
                    # Unable to resume (requested range not satisfiable)
                    # Open the connection again without the range header
                    # NOTE(review): a `try:` line is missing here.
                    data = urllib2.urlopen(basic_request)
                    content_length = data.info()['Content-Length']
                    except (urllib2.HTTPError, ), err:
                        if err.code < 500 or err.code >= 600:
                            # NOTE(review): the `raise` is missing from this copy.
                    # Examine the reported length
                    if (content_length is not None and
                        (resume_len - 100 < long(content_length) < resume_len + 100)):
                        # The file had already been fully downloaded.
                        # Explanation to the above condition: in issue #175 it was revealed that
                        # YouTube sometimes adds or removes a few bytes from the end of the file,
                        # changing the file size slightly and causing problems for some users. So
                        # I decided to implement a suggested change and consider the file
                        # completely downloaded if the file size differs less than 100 bytes from
                        # the one in the hard drive.
                        self.report_file_already_downloaded(filename)
                        self.try_rename(tmpfilename, filename)
                        # NOTE(review): a `return True` is missing from this copy.
                    # The length does not match, we start the download over
                    self.report_unable_to_resume()
            # NOTE(review): the retry bookkeeping (incrementing `count`) is
            # missing from this copy.
            self.report_retry(count, retries)

        # NOTE(review): the `count > retries` check is missing from this copy.
        self.trouble(u'ERROR: giving up after %s retries' % retries)

        # Content-length arrives as a string; kept as such and compared as a
        # string at the end of the function.
        data_len = data.info().get('Content-length', None)
        data_len_str = self.format_bytes(data_len)
        # NOTE(review): counter initialization (byte_counter, block_size,
        # start time) and the read-loop header are missing from this copy.
        data_block = data.read(block_size)
        data_block_len = len(data_block)
        if data_block_len == 0:
            # NOTE(review): the `break` ending the read loop is missing here.
        byte_counter += data_block_len

        # Open file just in time
        # NOTE(review): the stream-not-yet-open guard and `try:` are missing
        # from this copy.
        (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
        self.report_destination(filename)
        except (OSError, IOError), err:
            self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
        # NOTE(review): a `try:` line is missing here.
        stream.write(data_block)
        except (IOError, OSError), err:
            self.trouble(u'\nERROR: unable to write data: %s' % str(err))
        # Adapt the next read size to the measured throughput.
        block_size = self.best_block_size(after - before, data_block_len)

        # Progress report for this block.
        percent_str = self.calc_percent(byte_counter, data_len)
        eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
        speed_str = self.calc_speed(start, time.time(), byte_counter)
        self.report_progress(percent_str, data_len_str, speed_str, eta_str)

        # Honour the configured rate limit, if any.
        self.slow_down(start, byte_counter)

        if data_len is not None and str(byte_counter) != data_len:
            raise ContentTooShortError(byte_counter, long(data_len))
        self.try_rename(tmpfilename, filename)
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information from the video (or videos) the URL refers to. This
    information includes the real video URL, the video title and simplified
    title, author and others. The information is stored in a dictionary
    which is then passed to the FileDownloader. The FileDownloader
    processes this information possibly downloading the video to the file
    system, among other possible outcomes. The dictionaries must include
    the following fields:

    id:         Video identifier.
    url:        Final video URL.
    uploader:   Nickname of the video uploader.
    title:      Literal title.
    stitle:     Simplified title.
    ext:        Video filename extension.
    format:     Video format.
    player_url: SWF Player URL (may be None).

    The following fields are optional. Their primary purpose is to allow
    youtube-dl to serve as the backend for a video search function, such
    as the one in youtube2mp3. They are only used when their respective
    forced printing functions are called:

    thumbnail:   Full URL to a video thumbnail image.
    description: One-line video description.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods, as well as the suitable() static method.
    Probably, they should also be instantiated and added to the main
    downloader.
    """
    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        # NOTE(review): an instance-state line (presumably an
        # initialization-done flag) is missing from this copy.
        self.set_downloader(downloader)
    # NOTE(review): the `def suitable(url):` header (with its @staticmethod
    # decorator) is missing from this copy; only its docstring remains.
    """Receives a URL and returns True if suitable for this IE."""

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # NOTE(review): a guard preventing repeated initialization appears
        # to be missing from this copy.
        self._real_initialize()
    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        # NOTE(review): the call to self.initialize() appears to be missing
        # from this copy.
        return self._real_extract(url)
708 def set_downloader(self, downloader):
709 """Sets the downloader for this IE."""
710 self._downloader = downloader
712 def _real_initialize(self):
713 """Real initialization process. Redefine in subclasses."""
716 def _real_extract(self, url):
717 """Real extraction process. Redefine in subclasses."""
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Group 1 matches the scheme/host prefix, group 2 the video id; the
    # (?(1).+)? conditional tolerates trailing text only when a prefix matched.
    _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/(?:(?:v/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))))?([0-9A-Za-z_-]+)(?(1).+)?$'
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13']
    _video_extensions = {
        # NOTE(review): most format->extension entries and the closing brace
        # are missing from this copy.
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever

    # NOTE(review): the `def suitable(url):` header (with @staticmethod) is
    # missing from this copy; only its return statement remains.
    return (re.match(YoutubeIE._VALID_URL, url) is not None)
745 def report_lang(self):
746 """Report attempt to set language."""
747 self._downloader.to_screen(u'[youtube] Setting language')
749 def report_login(self):
750 """Report attempt to log in."""
751 self._downloader.to_screen(u'[youtube] Logging in')
753 def report_age_confirmation(self):
754 """Report attempt to confirm age."""
755 self._downloader.to_screen(u'[youtube] Confirming age')
757 def report_video_webpage_download(self, video_id):
758 """Report attempt to download video webpage."""
759 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
761 def report_video_info_webpage_download(self, video_id):
762 """Report attempt to download video info webpage."""
763 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
765 def report_information_extraction(self, video_id):
766 """Report attempt to extract video information."""
767 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
769 def report_unavailable_format(self, video_id, format):
770 """Report extracted video URL."""
771 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
773 def report_rtmp_download(self):
774 """Indicate the download will use the RTMP protocol."""
775 self._downloader.to_screen(u'[youtube] RTMP download detected')
    def _real_initialize(self):
        """Set site language, log in (params or .netrc) and confirm age."""
        if self._downloader is None:
            # NOTE(review): the early `return` is missing from this copy.

        # NOTE(review): default assignments for username/password are
        # missing from this copy.
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            # NOTE(review): a `try:` line is missing here.
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            # NOTE(review): the branch unpacking `info` into username and
            # password is missing from this copy.
            raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
                # NOTE(review): a `return` appears to be missing here.

        # Best-effort language switch so scraped pages are in English.
        request = urllib2.Request(self._LANG_URL, None, std_headers)
        # NOTE(review): a `try:` line (and presumably self.report_lang())
        # are missing here.
        urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
            # NOTE(review): a `return` appears to be missing here.

        # No authentication to be performed
        # NOTE(review): the guard returning when no username is configured
        # is missing from this copy.

        # NOTE(review): the `login_form = {` opening is missing from this copy.
        'current_form': 'loginForm',
        'action_login': 'Log In',
        'username': username,
        'password': password,
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
        # NOTE(review): a `try:` line (and presumably self.report_login())
        # are missing here.
        login_results = urllib2.urlopen(request).read()
        # A login form in the response means authentication failed.
        if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
            self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
            # NOTE(review): a `return` appears to be missing here.
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
            # NOTE(review): a `return` appears to be missing here.

        # NOTE(review): the `age_form = {` opening is missing from this copy.
        'action_confirm': 'Confirm',
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
        # NOTE(review): a `try:` line is missing here.
        self.report_age_confirmation()
        age_results = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
            # NOTE(review): a `return` appears to be missing here.
    def _real_extract(self, url):
        """Extract video metadata and hand each chosen format to the downloader."""
        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `mobj is None` guard is missing from this copy.
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(2)

        # Download the watch page (used for SWF URL, upload date, description).
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id, None, std_headers)
        # NOTE(review): a `try:` line is missing here.
        video_webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
            # NOTE(review): a `return` appears to be missing here.

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        # NOTE(review): the `mobj is not None` check and the None fallback
        # are missing from this copy.
        # Un-escape the JS-escaped slashes in the matched URL.
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Try several `el` values until one response contains a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = urllib2.Request(video_info_url, None, std_headers)
            # NOTE(review): a `try:` line is missing here.
            video_info_webpage = urllib2.urlopen(request).read()
            video_info = parse_qs(video_info_webpage)
            if 'token' in video_info:
                # NOTE(review): the `break` is missing from this copy.
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
                # NOTE(review): a `return` appears to be missing here.
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
            # NOTE(review): the `else:` header and a `return` are missing.
            self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            # NOTE(review): a `return` appears to be missing here.
        video_uploader = urllib.unquote_plus(video_info['author'][0])

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            # NOTE(review): a `return` appears to be missing here.
        video_title = urllib.unquote_plus(video_info['title'][0])
        video_title = video_title.decode('utf-8')
        video_title = sanitize_title(video_title)

        # Simplified title: collapse disallowed runs to '_' and trim.
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
        simple_title = simple_title.strip(ur'_')

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            # NOTE(review): the fallback assignment is missing from this copy.
        else: # don't panic if we can't find it
            video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

        # upload date: scraped from the watch page and normalized to YYYYMMDD.
        # NOTE(review): the default assignment and the `mobj is not None`
        # guard are missing from this copy.
        mobj = re.search(r'id="eow-date".*?>(.*?)</span>', video_webpage, re.DOTALL)
        upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
        format_expressions = ['%d %B %Y', '%B %d %Y']
        for expression in format_expressions:
            # NOTE(review): the `try:`/`except` around strptime is missing
            # from this copy.
            upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

        # description (only fetched when it will actually be printed)
        video_description = 'No description available.'
        if self._downloader.params.get('forcedescription', False):
            mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
            # NOTE(review): the `mobj is not None` guard is missing here.
            video_description = mobj.group(1)

        # token
        video_token = urllib.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        requested_format = self._downloader.params.get('format', None)
        get_video_template = 'http://www.youtube.com/get_video?video_id=%s&t=%s&eurl=&el=&ps=&asv=&fmt=%%s' % (video_id, video_token)

        if 'fmt_url_map' in video_info:
            # fmt_url_map is 'fmt|url,fmt|url,...'
            url_map = dict(tuple(pair.split('|')) for pair in video_info['fmt_url_map'][0].split(','))
            format_limit = self._downloader.params.get('format_limit', None)
            if format_limit is not None and format_limit in self._available_formats:
                format_list = self._available_formats[self._available_formats.index(format_limit):]
            # NOTE(review): the `else:` header is missing from this copy.
            format_list = self._available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                # NOTE(review): a `return` appears to be missing here.
            if requested_format is None:
                video_url_list = [(existing_formats[0], get_video_template % existing_formats[0])] # Best quality
            elif requested_format == '-1':
                video_url_list = [(f, get_video_template % f) for f in existing_formats] # All formats
            # NOTE(review): the `else:` header is missing from this copy.
            video_url_list = [(requested_format, get_video_template % requested_format)] # Specific format

        elif 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]

        # NOTE(review): the `else:` header and a `return` are missing here.
        self._downloader.trouble(u'ERROR: no fmt_url_map or conn information found in video info')

        for format_param, video_real_url in video_url_list:
            # At this point we have a new video
            self._downloader.increment_downloads()

            # Extension (default to flv for unknown formats)
            video_extension = self._video_extensions.get(format_param, 'flv')

            # Find the video URL in fmt_url_map or conn parameters
            # NOTE(review): a `try:` line is missing here.
            # Process video information
            self._downloader.process_info({
                'id': video_id.decode('utf-8'),
                'url': video_real_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date': upload_date,
                'title': video_title,
                'stitle': simple_title,
                'ext': video_extension.decode('utf-8'),
                'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                'thumbnail': video_thumbnail.decode('utf-8'),
                'description': video_description.decode('utf-8'),
                'player_url': player_url,
            # NOTE(review): the closing `})` is missing from this copy.
            except UnavailableVideoError, err:
                self._downloader.trouble(u'ERROR: unable to download video (format may not be available)')
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    # Group 1 is the video id, group 2 the URL-embedded (simplified) title.
    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1006 def __init__(self, youtube_ie, downloader=None):
1007 InfoExtractor.__init__(self, downloader)
1008 self._youtube_ie = youtube_ie
    # NOTE(review): the `def suitable(url):` header (with @staticmethod) is
    # missing from this copy; only its return statement remains.
    return (re.match(MetacafeIE._VALID_URL, url) is not None)
1014 def report_disclaimer(self):
1015 """Report disclaimer retrieval."""
1016 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1018 def report_age_confirmation(self):
1019 """Report attempt to confirm age."""
1020 self._downloader.to_screen(u'[metacafe] Confirming age')
1022 def report_download_webpage(self, video_id):
1023 """Report webpage download."""
1024 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1026 def report_extraction(self, video_id):
1027 """Report information extraction."""
1028 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1030 def _real_initialize(self):
1031 # Retrieve disclaimer
1032 request = urllib2.Request(self._DISCLAIMER, None, std_headers)
1034 self.report_disclaimer()
1035 disclaimer = urllib2.urlopen(request).read()
1036 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1037 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1043 'submit': "Continue - I'm over 18",
1045 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
1047 self.report_age_confirmation()
1048 disclaimer = urllib2.urlopen(request).read()
1049 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1050 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1053 def _real_extract(self, url):
1054 # Extract id and simplified title from URL
1055 mobj = re.match(self._VALID_URL, url)
1057 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1060 video_id = mobj.group(1)
1062 # Check if video comes from YouTube
1063 mobj2 = re.match(r'^yt-(.*)$', video_id)
1064 if mobj2 is not None:
1065 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1068 # At this point we have a new video
1069 self._downloader.increment_downloads()
1071 simple_title = mobj.group(2).decode('utf-8')
1073 # Retrieve video webpage to extract further information
1074 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1076 self.report_download_webpage(video_id)
1077 webpage = urllib2.urlopen(request).read()
1078 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1079 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1082 # Extract URL, uploader and title from webpage
1083 self.report_extraction(video_id)
1084 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1085 if mobj is not None:
1086 mediaURL = urllib.unquote(mobj.group(1))
1087 video_extension = mediaURL[-3:]
1089 # Extract gdaKey if available
1090 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1092 video_url = mediaURL
1094 gdaKey = mobj.group(1)
1095 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1097 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1099 self._downloader.trouble(u'ERROR: unable to extract media URL')
1101 vardict = parse_qs(mobj.group(1))
1102 if 'mediaData' not in vardict:
1103 self._downloader.trouble(u'ERROR: unable to extract media URL')
1105 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1107 self._downloader.trouble(u'ERROR: unable to extract media URL')
1109 mediaURL = mobj.group(1).replace('\\/', '/')
1110 video_extension = mediaURL[-3:]
1111 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1113 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1115 self._downloader.trouble(u'ERROR: unable to extract title')
1117 video_title = mobj.group(1).decode('utf-8')
1118 video_title = sanitize_title(video_title)
1120 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1122 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1124 video_uploader = mobj.group(1)
1127 # Process video information
1128 self._downloader.process_info({
1129 'id': video_id.decode('utf-8'),
1130 'url': video_url.decode('utf-8'),
1131 'uploader': video_uploader.decode('utf-8'),
1132 'upload_date': u'NA',
1133 'title': video_title,
1134 'stitle': simple_title,
1135 'ext': video_extension.decode('utf-8'),
1139 except UnavailableVideoError:
1140 self._downloader.trouble(u'ERROR: unable to download video')
1143 class DailymotionIE(InfoExtractor):
1144 """Information Extractor for Dailymotion"""
1146 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1148 def __init__(self, downloader=None):
1149 InfoExtractor.__init__(self, downloader)
1153 return (re.match(DailymotionIE._VALID_URL, url) is not None)
1155 def report_download_webpage(self, video_id):
1156 """Report webpage download."""
1157 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1159 def report_extraction(self, video_id):
1160 """Report information extraction."""
1161 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1163 def _real_initialize(self):
1166 def _real_extract(self, url):
1167 # Extract id and simplified title from URL
1168 mobj = re.match(self._VALID_URL, url)
1170 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1173 # At this point we have a new video
1174 self._downloader.increment_downloads()
1175 video_id = mobj.group(1)
1177 simple_title = mobj.group(2).decode('utf-8')
1178 video_extension = 'flv'
1180 # Retrieve video webpage to extract further information
1181 request = urllib2.Request(url)
1183 self.report_download_webpage(video_id)
1184 webpage = urllib2.urlopen(request).read()
1185 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1186 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1189 # Extract URL, uploader and title from webpage
1190 self.report_extraction(video_id)
1191 mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
1193 self._downloader.trouble(u'ERROR: unable to extract media URL')
1195 mediaURL = urllib.unquote(mobj.group(1))
1197 # if needed add http://www.dailymotion.com/ if relative URL
1199 video_url = mediaURL
1201 # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
1202 mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
1204 self._downloader.trouble(u'ERROR: unable to extract title')
1206 video_title = mobj.group(1).decode('utf-8')
1207 video_title = sanitize_title(video_title)
1209 mobj = re.search(r'(?im)<div class="dmco_html owner">.*?<a class="name" href="/.+?">(.+?)</a>', webpage)
1211 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1213 video_uploader = mobj.group(1)
1216 # Process video information
1217 self._downloader.process_info({
1218 'id': video_id.decode('utf-8'),
1219 'url': video_url.decode('utf-8'),
1220 'uploader': video_uploader.decode('utf-8'),
1221 'upload_date': u'NA',
1222 'title': video_title,
1223 'stitle': simple_title,
1224 'ext': video_extension.decode('utf-8'),
1228 except UnavailableVideoError:
1229 self._downloader.trouble(u'ERROR: unable to download video')
1231 class GoogleIE(InfoExtractor):
1232 """Information extractor for video.google.com."""
1234 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1236 def __init__(self, downloader=None):
1237 InfoExtractor.__init__(self, downloader)
1241 return (re.match(GoogleIE._VALID_URL, url) is not None)
1243 def report_download_webpage(self, video_id):
1244 """Report webpage download."""
1245 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1247 def report_extraction(self, video_id):
1248 """Report information extraction."""
1249 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1251 def _real_initialize(self):
1254 def _real_extract(self, url):
1255 # Extract id from URL
1256 mobj = re.match(self._VALID_URL, url)
1258 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1261 # At this point we have a new video
1262 self._downloader.increment_downloads()
1263 video_id = mobj.group(1)
1265 video_extension = 'mp4'
1267 # Retrieve video webpage to extract further information
1268 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1270 self.report_download_webpage(video_id)
1271 webpage = urllib2.urlopen(request).read()
1272 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1273 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1276 # Extract URL, uploader, and title from webpage
1277 self.report_extraction(video_id)
1278 mobj = re.search(r"download_url:'([^']+)'", webpage)
1280 video_extension = 'flv'
1281 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1283 self._downloader.trouble(u'ERROR: unable to extract media URL')
1285 mediaURL = urllib.unquote(mobj.group(1))
1286 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1287 mediaURL = mediaURL.replace('\\x26', '\x26')
1289 video_url = mediaURL
1291 mobj = re.search(r'<title>(.*)</title>', webpage)
1293 self._downloader.trouble(u'ERROR: unable to extract title')
1295 video_title = mobj.group(1).decode('utf-8')
1296 video_title = sanitize_title(video_title)
1297 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1299 # Extract video description
1300 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1302 self._downloader.trouble(u'ERROR: unable to extract video description')
1304 video_description = mobj.group(1).decode('utf-8')
1305 if not video_description:
1306 video_description = 'No description available.'
1308 # Extract video thumbnail
1309 if self._downloader.params.get('forcethumbnail', False):
1310 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1312 webpage = urllib2.urlopen(request).read()
1313 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1314 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1316 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1318 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1320 video_thumbnail = mobj.group(1)
1321 else: # we need something to pass to process_info
1322 video_thumbnail = ''
1326 # Process video information
1327 self._downloader.process_info({
1328 'id': video_id.decode('utf-8'),
1329 'url': video_url.decode('utf-8'),
1331 'upload_date': u'NA',
1332 'title': video_title,
1333 'stitle': simple_title,
1334 'ext': video_extension.decode('utf-8'),
1338 except UnavailableVideoError:
1339 self._downloader.trouble(u'ERROR: unable to download video')
1342 class PhotobucketIE(InfoExtractor):
1343 """Information extractor for photobucket.com."""
1345 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1347 def __init__(self, downloader=None):
1348 InfoExtractor.__init__(self, downloader)
1352 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1354 def report_download_webpage(self, video_id):
1355 """Report webpage download."""
1356 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1358 def report_extraction(self, video_id):
1359 """Report information extraction."""
1360 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1362 def _real_initialize(self):
1365 def _real_extract(self, url):
1366 # Extract id from URL
1367 mobj = re.match(self._VALID_URL, url)
1369 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1372 # At this point we have a new video
1373 self._downloader.increment_downloads()
1374 video_id = mobj.group(1)
1376 video_extension = 'flv'
1378 # Retrieve video webpage to extract further information
1379 request = urllib2.Request(url)
1381 self.report_download_webpage(video_id)
1382 webpage = urllib2.urlopen(request).read()
1383 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1384 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1387 # Extract URL, uploader, and title from webpage
1388 self.report_extraction(video_id)
1389 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1391 self._downloader.trouble(u'ERROR: unable to extract media URL')
1393 mediaURL = urllib.unquote(mobj.group(1))
1395 video_url = mediaURL
1397 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1399 self._downloader.trouble(u'ERROR: unable to extract title')
1401 video_title = mobj.group(1).decode('utf-8')
1402 video_title = sanitize_title(video_title)
1403 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1405 video_uploader = mobj.group(2).decode('utf-8')
1408 # Process video information
1409 self._downloader.process_info({
1410 'id': video_id.decode('utf-8'),
1411 'url': video_url.decode('utf-8'),
1412 'uploader': video_uploader,
1413 'upload_date': u'NA',
1414 'title': video_title,
1415 'stitle': simple_title,
1416 'ext': video_extension.decode('utf-8'),
1420 except UnavailableVideoError:
1421 self._downloader.trouble(u'ERROR: unable to download video')
1424 class YahooIE(InfoExtractor):
1425 """Information extractor for video.yahoo.com."""
1427 # _VALID_URL matches all Yahoo! Video URLs
1428 # _VPAGE_URL matches only the extractable '/watch/' URLs
1429 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1430 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1432 def __init__(self, downloader=None):
1433 InfoExtractor.__init__(self, downloader)
1437 return (re.match(YahooIE._VALID_URL, url) is not None)
1439 def report_download_webpage(self, video_id):
1440 """Report webpage download."""
1441 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1443 def report_extraction(self, video_id):
1444 """Report information extraction."""
1445 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1447 def _real_initialize(self):
1450 def _real_extract(self, url, new_video=True):
1451 # Extract ID from URL
1452 mobj = re.match(self._VALID_URL, url)
1454 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1457 # At this point we have a new video
1458 self._downloader.increment_downloads()
1459 video_id = mobj.group(2)
1460 video_extension = 'flv'
1462 # Rewrite valid but non-extractable URLs as
1463 # extractable English language /watch/ URLs
1464 if re.match(self._VPAGE_URL, url) is None:
1465 request = urllib2.Request(url)
1467 webpage = urllib2.urlopen(request).read()
1468 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1469 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1472 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1474 self._downloader.trouble(u'ERROR: Unable to extract id field')
1476 yahoo_id = mobj.group(1)
1478 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1480 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1482 yahoo_vid = mobj.group(1)
1484 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1485 return self._real_extract(url, new_video=False)
1487 # Retrieve video webpage to extract further information
1488 request = urllib2.Request(url)
1490 self.report_download_webpage(video_id)
1491 webpage = urllib2.urlopen(request).read()
1492 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1493 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1496 # Extract uploader and title from webpage
1497 self.report_extraction(video_id)
1498 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1500 self._downloader.trouble(u'ERROR: unable to extract video title')
1502 video_title = mobj.group(1).decode('utf-8')
1503 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1505 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1507 self._downloader.trouble(u'ERROR: unable to extract video uploader')
1509 video_uploader = mobj.group(1).decode('utf-8')
1511 # Extract video thumbnail
1512 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1514 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1516 video_thumbnail = mobj.group(1).decode('utf-8')
1518 # Extract video description
1519 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1521 self._downloader.trouble(u'ERROR: unable to extract video description')
1523 video_description = mobj.group(1).decode('utf-8')
1524 if not video_description: video_description = 'No description available.'
1526 # Extract video height and width
1527 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1529 self._downloader.trouble(u'ERROR: unable to extract video height')
1531 yv_video_height = mobj.group(1)
1533 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1535 self._downloader.trouble(u'ERROR: unable to extract video width')
1537 yv_video_width = mobj.group(1)
1539 # Retrieve video playlist to extract media URL
1540 # I'm not completely sure what all these options are, but we
1541 # seem to need most of them, otherwise the server sends a 401.
1542 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1543 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1544 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1545 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1546 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1548 self.report_download_webpage(video_id)
1549 webpage = urllib2.urlopen(request).read()
1550 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1551 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1554 # Extract media URL from playlist XML
1555 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1557 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1559 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1560 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1563 # Process video information
1564 self._downloader.process_info({
1565 'id': video_id.decode('utf-8'),
1567 'uploader': video_uploader,
1568 'upload_date': u'NA',
1569 'title': video_title,
1570 'stitle': simple_title,
1571 'ext': video_extension.decode('utf-8'),
1572 'thumbnail': video_thumbnail.decode('utf-8'),
1573 'description': video_description,
1574 'thumbnail': video_thumbnail,
1575 'description': video_description,
1578 except UnavailableVideoError:
1579 self._downloader.trouble(u'ERROR: unable to download video')
1582 class GenericIE(InfoExtractor):
1583 """Generic last-resort information extractor."""
1585 def __init__(self, downloader=None):
1586 InfoExtractor.__init__(self, downloader)
1592 def report_download_webpage(self, video_id):
1593 """Report webpage download."""
1594 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1595 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1597 def report_extraction(self, video_id):
1598 """Report information extraction."""
1599 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1601 def _real_initialize(self):
1604 def _real_extract(self, url):
1605 # At this point we have a new video
1606 self._downloader.increment_downloads()
1608 video_id = url.split('/')[-1]
1609 request = urllib2.Request(url)
1611 self.report_download_webpage(video_id)
1612 webpage = urllib2.urlopen(request).read()
1613 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1614 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1616 except ValueError, err:
1617 # since this is the last-resort InfoExtractor, if
1618 # this error is thrown, it'll be thrown here
1619 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1622 self.report_extraction(video_id)
1623 # Start with something easy: JW Player in SWFObject
1624 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1626 # Broaden the search a little bit
1627 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1629 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1632 # It's possible that one of the regexes
1633 # matched, but returned an empty group:
1634 if mobj.group(1) is None:
1635 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1638 video_url = urllib.unquote(mobj.group(1))
1639 video_id = os.path.basename(video_url)
1641 # here's a fun little line of code for you:
1642 video_extension = os.path.splitext(video_id)[1][1:]
1643 video_id = os.path.splitext(video_id)[0]
1645 # it's tempting to parse this further, but you would
1646 # have to take into account all the variations like
1647 # Video Title - Site Name
1648 # Site Name | Video Title
1649 # Video Title - Tagline | Site Name
1650 # and so on and so forth; it's just not practical
1651 mobj = re.search(r'<title>(.*)</title>', webpage)
1653 self._downloader.trouble(u'ERROR: unable to extract title')
1655 video_title = mobj.group(1).decode('utf-8')
1656 video_title = sanitize_title(video_title)
1657 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1659 # video uploader is domain name
1660 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1662 self._downloader.trouble(u'ERROR: unable to extract title')
1664 video_uploader = mobj.group(1).decode('utf-8')
1667 # Process video information
1668 self._downloader.process_info({
1669 'id': video_id.decode('utf-8'),
1670 'url': video_url.decode('utf-8'),
1671 'uploader': video_uploader,
1672 'upload_date': u'NA',
1673 'title': video_title,
1674 'stitle': simple_title,
1675 'ext': video_extension.decode('utf-8'),
1679 except UnavailableVideoError, err:
1680 self._downloader.trouble(u'ERROR: unable to download video')
1683 class YoutubeSearchIE(InfoExtractor):
1684 """Information Extractor for YouTube search queries."""
1685 _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
1686 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
1687 _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
1688 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1690 _max_youtube_results = 1000
1692 def __init__(self, youtube_ie, downloader=None):
1693 InfoExtractor.__init__(self, downloader)
1694 self._youtube_ie = youtube_ie
1698 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
1700 def report_download_page(self, query, pagenum):
1701 """Report attempt to download playlist page with given number."""
1702 query = query.decode(preferredencoding())
1703 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1705 def _real_initialize(self):
1706 self._youtube_ie.initialize()
1708 def _real_extract(self, query):
1709 mobj = re.match(self._VALID_QUERY, query)
1711 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1714 prefix, query = query.split(':')
1716 query = query.encode('utf-8')
1718 self._download_n_results(query, 1)
1720 elif prefix == 'all':
1721 self._download_n_results(query, self._max_youtube_results)
1727 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1729 elif n > self._max_youtube_results:
1730 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1731 n = self._max_youtube_results
1732 self._download_n_results(query, n)
1734 except ValueError: # parsing prefix as integer fails
1735 self._download_n_results(query, 1)
1738 def _download_n_results(self, query, n):
1739 """Downloads a specified number of results for a query"""
1742 already_seen = set()
1746 self.report_download_page(query, pagenum)
1747 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1748 request = urllib2.Request(result_url, None, std_headers)
1750 page = urllib2.urlopen(request).read()
1751 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1752 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1755 # Extract video identifiers
1756 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1757 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
1758 if video_id not in already_seen:
1759 video_ids.append(video_id)
1760 already_seen.add(video_id)
1761 if len(video_ids) == n:
1762 # Specified n videos reached
1763 for id in video_ids:
1764 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1767 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1768 for id in video_ids:
1769 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1772 pagenum = pagenum + 1
1774 class GoogleSearchIE(InfoExtractor):
1775 """Information Extractor for Google Video search queries."""
1776 _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
1777 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1778 _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
1779 _MORE_PAGES_INDICATOR = r'<span>Next</span>'
1781 _max_google_results = 1000
1783 def __init__(self, google_ie, downloader=None):
1784 InfoExtractor.__init__(self, downloader)
1785 self._google_ie = google_ie
1789 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
1791 def report_download_page(self, query, pagenum):
1792 """Report attempt to download playlist page with given number."""
1793 query = query.decode(preferredencoding())
1794 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1796 def _real_initialize(self):
1797 self._google_ie.initialize()
1799 def _real_extract(self, query):
1800 mobj = re.match(self._VALID_QUERY, query)
1802 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1805 prefix, query = query.split(':')
1807 query = query.encode('utf-8')
1809 self._download_n_results(query, 1)
1811 elif prefix == 'all':
1812 self._download_n_results(query, self._max_google_results)
1818 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1820 elif n > self._max_google_results:
1821 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1822 n = self._max_google_results
1823 self._download_n_results(query, n)
1825 except ValueError: # parsing prefix as integer fails
1826 self._download_n_results(query, 1)
1829 def _download_n_results(self, query, n):
1830 """Downloads a specified number of results for a query"""
1833 already_seen = set()
1837 self.report_download_page(query, pagenum)
1838 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1839 request = urllib2.Request(result_url, None, std_headers)
1841 page = urllib2.urlopen(request).read()
1842 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1843 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1846 # Extract video identifiers
1847 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1848 video_id = mobj.group(1)
1849 if video_id not in already_seen:
1850 video_ids.append(video_id)
1851 already_seen.add(video_id)
1852 if len(video_ids) == n:
1853 # Specified n videos reached
1854 for id in video_ids:
1855 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1858 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1859 for id in video_ids:
1860 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1863 pagenum = pagenum + 1
1865 class YahooSearchIE(InfoExtractor):
1866 """Information Extractor for Yahoo! Video search queries."""
1867 _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
1868 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1869 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1870 _MORE_PAGES_INDICATOR = r'\s*Next'
1872 _max_yahoo_results = 1000
1874 def __init__(self, yahoo_ie, downloader=None):
1875 InfoExtractor.__init__(self, downloader)
1876 self._yahoo_ie = yahoo_ie
1880 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
1882 def report_download_page(self, query, pagenum):
1883 """Report attempt to download playlist page with given number."""
1884 query = query.decode(preferredencoding())
1885 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1887 def _real_initialize(self):
1888 self._yahoo_ie.initialize()
1890 def _real_extract(self, query):
1891 mobj = re.match(self._VALID_QUERY, query)
1893 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1896 prefix, query = query.split(':')
1898 query = query.encode('utf-8')
1900 self._download_n_results(query, 1)
1902 elif prefix == 'all':
1903 self._download_n_results(query, self._max_yahoo_results)
1909 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1911 elif n > self._max_yahoo_results:
1912 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1913 n = self._max_yahoo_results
1914 self._download_n_results(query, n)
1916 except ValueError: # parsing prefix as integer fails
1917 self._download_n_results(query, 1)
1920 def _download_n_results(self, query, n):
1921 """Downloads a specified number of results for a query"""
1924 already_seen = set()
1928 self.report_download_page(query, pagenum)
1929 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1930 request = urllib2.Request(result_url, None, std_headers)
1932 page = urllib2.urlopen(request).read()
1933 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1934 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1937 # Extract video identifiers
1938 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1939 video_id = mobj.group(1)
1940 if video_id not in already_seen:
1941 video_ids.append(video_id)
1942 already_seen.add(video_id)
1943 if len(video_ids) == n:
1944 # Specified n videos reached
1945 for id in video_ids:
1946 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1949 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1950 for id in video_ids:
1951 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1954 pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists.

    Walks the paginated view_play_list pages, collects video ids and hands
    each one to the wrapped YoutubeIE instance for actual extraction.
    """

    # group(1) captures the playlist identifier.
    _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists)\?.*?p=|user/.*?/user/)([^&]+).*'
    # Filled with (playlist_id, page_number); locale pinned via gl/hl params.
    _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
    # Extracts a video id from a /watch?v=...& link on a playlist page.
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
    # A "Next" anchor means more playlist pages follow.
    _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'

    def __init__(self, youtube_ie, downloader=None):
        # youtube_ie: YoutubeIE instance used to extract each individual video.
        InfoExtractor.__init__(self, downloader)
        self._youtube_ie = youtube_ie

    # NOTE(review): the enclosing "def suitable(url):" header is not present
    # in this excerpt; the line below is its body.
    return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_initialize(self):
        # Delegate setup (e.g. login) to the wrapped YoutubeIE.
        self._youtube_ie.initialize()

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the "if mobj is None:" guard around this error report
        # appears to be missing from this excerpt.
        self._downloader.trouble(u'ERROR: invalid url: %s' % url)

        # Download playlist pages
        playlist_id = mobj.group(1)
        # NOTE(review): the surrounding pagination loop (video_ids/pagenum
        # setup, "while True:", "try:") is not visible in this excerpt.
        self.report_download_page(playlist_id, pagenum)
        request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
        page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers, de-duplicated within each page.
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        video_ids.extend(ids_in_page)

        # Stop paging once no "Next" link remains.
        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
        pagenum = pagenum + 1

        # Honor --playlist-start/--playlist-end (1-based start converted to a
        # 0-based slice index).
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        # NOTE(review): with the default playlistend of -1 this slice drops
        # the final playlist entry — looks like an off-by-one; confirm.
        video_ids = video_ids[playliststart:playlistend]

        for id in video_ids:
            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users.

    Fetches a user's gdata upload feed, collects video ids and hands each
    one to the wrapped YoutubeIE instance for actual extraction.
    """

    # group(1) captures the username.
    _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)'
    # gdata feed listing a user's uploaded videos.
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this.

    def __init__(self, youtube_ie, downloader=None):
        # youtube_ie: YoutubeIE instance used to extract each individual video.
        InfoExtractor.__init__(self, downloader)
        self._youtube_ie = youtube_ie

    # NOTE(review): the enclosing "def suitable(url):" header is not present
    # in this excerpt; the line below is its body.
    return (re.match(YoutubeUserIE._VALID_URL, url) is not None)

    def report_download_page(self, username):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading page ' % (username))

    def _real_initialize(self):
        # Delegate setup (e.g. login) to the wrapped YoutubeIE.
        self._youtube_ie.initialize()

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the "if mobj is None:" guard around this error report
        # appears to be missing from this excerpt.
        self._downloader.trouble(u'ERROR: invalid url: %s' % url)

        # Download user page
        username = mobj.group(1)
        # NOTE(review): the "try:" matching the except below is not visible
        # in this excerpt.
        self.report_download_page(username)
        request = urllib2.Request(self._TEMPLATE_URL % (username), None, std_headers)
        page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers, de-duplicated within the page.
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        video_ids.extend(ids_in_page)

        # Honor --playlist-start/--playlist-end (1-based start converted to a
        # 0-based slice index).
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        # NOTE(review): with the default playlistend of -1 this slice drops
        # the user's last video — looks like an off-by-one; confirm.
        video_ids = video_ids[playliststart:playlistend]

        for id in video_ids:
            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    # group(1) captures the file id; "(?#locale)" is an inline regex comment
    # documenting that the optional two-char path segment is a locale prefix.
    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    # NOTE(review): the enclosing "def suitable(url):" header is not present
    # in this excerpt; the line below is its body.
    return (re.match(DepositFilesIE._VALID_URL, url) is not None)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_initialize(self):
        # NOTE(review): the body of this method is not visible in this
        # excerpt (presumably a bare "return" — nothing to initialize).

    def _real_extract(self, url):
        # At this point we have a new file
        self._downloader.increment_downloads()

        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = urllib2.Request(url, urllib.urlencode(free_download_indication), std_headers)
        # NOTE(review): the "try:" matching the except below is not visible
        # in this excerpt.
        self.report_download_webpage(file_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Collapse whitespace in the site's restriction message.
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        # Extension without the leading dot.
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        # NOTE(review): the "if mobj is None:" guard around this error report
        # appears to be missing from this excerpt.
        self._downloader.trouble(u'ERROR: unable to extract title')
        file_title = mobj.group(1).decode('utf-8')

        # Process file information
        # NOTE(review): the closing "})" of this call and the "try:" matching
        # the except below are not visible in this excerpt.
        self._downloader.process_info({
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'upload_date': u'NA',
            'title': file_title,
            'stitle': file_title,
            'ext': file_extension.decode('utf-8'),
        except UnavailableVideoError, err:
            self._downloader.trouble(u'ERROR: unable to download file')
class PostProcessor(object):
    """Base class for post-processing hooks.

    Instances are registered on a downloader through its
    add_post_processor() method.  After a successful download the
    downloader invokes run() on each registered PostProcessor in turn,
    feeding every one the dictionary returned by its predecessor; the
    chain stops when a PostProcessor returns None or the end is reached.
    Like InfoExtractor objects, PostProcessors follow a "mutual
    registration" scheme, keeping a reference back to their downloader.
    """

    def __init__(self, downloader=None):
        self.set_downloader(downloader)

    def set_downloader(self, downloader):
        """Attach *downloader* as the owner of this post processor."""
        self._downloader = downloader

    def run(self, information):
        """Process one finished download.

        *information* is an InfoExtractor-style dictionary carrying one
        extra key, "filepath", which points at the downloaded file.

        Returning None halts the post-processing chain; returning a
        dictionary (possibly with modified fields) passes it on to the
        next PostProcessor.  Implementations may raise
        PostProcessingError, which the downloader takes into account.
        """
        # The base implementation is a pass-through no-op.
        return information
2204 ### MAIN PROGRAM ###
2205 if __name__ == '__main__':
2207 # Modules needed only when running the main program
# Function to update the program file with the latest version from github.com
# (the old comment said bitbucket.org, but both URLs below point at GitHub).
def update_self(downloader, filename):
    """Overwrite *filename* with the latest stable youtube-dl from GitHub.

    downloader is only used for screen output.  Exits the process if
    *filename* is not writable.
    """
    if not os.access(filename, os.W_OK):
        sys.exit('ERROR: no write permissions on %s' % filename)

    downloader.to_screen('Updating to latest stable version...')

    # LATEST_VERSION holds the tag name of the newest release; the program
    # itself is then fetched from that tag.
    latest_url = 'http://github.com/rg3/youtube-dl/raw/master/LATEST_VERSION'
    latest_version = urllib.urlopen(latest_url).read().strip()
    prog_url = 'http://github.com/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
    newcontent = urllib.urlopen(prog_url).read()

    # Use a with-block so the file is flushed and closed even on error;
    # the previous code left the handle open.
    with open(filename, 'w') as stream:
        stream.write(newcontent)

    downloader.to_screen('Updated to version %s' % latest_version)
# Parse command line
# NOTE(review): the closing parenthesis of this OptionParser(...) call is
# not visible in this excerpt.
parser = optparse.OptionParser(
    usage='Usage: %prog [options] url...',
    version='2010.11.19',
    conflict_handler='resolve',

# General options; conflict_handler='resolve' lets -h/-v be redefined.
parser.add_option('-h', '--help',
        action='help', help='print this help text and exit')
parser.add_option('-v', '--version',
        action='version', help='print program version and exit')
parser.add_option('-U', '--update',
        action='store_true', dest='update_self', help='update this program to latest stable version')
parser.add_option('-i', '--ignore-errors',
        action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
parser.add_option('-r', '--rate-limit',
        dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
parser.add_option('-R', '--retries',
        dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
parser.add_option('--playlist-start',
        dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
parser.add_option('--playlist-end',
        dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)

# Authentication options (site login / .netrc).
authentication = optparse.OptionGroup(parser, 'Authentication Options')
authentication.add_option('-u', '--username',
        dest='username', metavar='USERNAME', help='account username')
authentication.add_option('-p', '--password',
        dest='password', metavar='PASSWORD', help='account password')
authentication.add_option('-n', '--netrc',
        action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
parser.add_option_group(authentication)

# Video format selection options (-m and --all-formats are aliases that
# store a const into the same 'format' destination).
video_format = optparse.OptionGroup(parser, 'Video Format Options')
video_format.add_option('-f', '--format',
        action='store', dest='format', metavar='FORMAT', help='video format code')
video_format.add_option('-m', '--mobile-version',
        action='store_const', dest='format', help='alias for -f 17', const='17')
video_format.add_option('--all-formats',
        action='store_const', dest='format', help='download all available video formats', const='-1')
video_format.add_option('--max-quality',
        action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
video_format.add_option('-b', '--best-quality',
        action='store_true', dest='bestquality', help='download the best video quality (DEPRECATED)')
parser.add_option_group(video_format)

# Verbosity / simulation options (the --get-* flags imply quiet simulation;
# see the FileDownloader parameter dict below).
verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
verbosity.add_option('-q', '--quiet',
        action='store_true', dest='quiet', help='activates quiet mode', default=False)
verbosity.add_option('-s', '--simulate',
        action='store_true', dest='simulate', help='do not download video', default=False)
verbosity.add_option('-g', '--get-url',
        action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
verbosity.add_option('-e', '--get-title',
        action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
verbosity.add_option('--get-thumbnail',
        action='store_true', dest='getthumbnail', help='simulate, quiet but print thumbnail URL', default=False)
verbosity.add_option('--get-description',
        action='store_true', dest='getdescription', help='simulate, quiet but print video description', default=False)
verbosity.add_option('--no-progress',
        action='store_true', dest='noprogress', help='do not print progress bar', default=False)
parser.add_option_group(verbosity)

# Filesystem / output naming options.
filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
filesystem.add_option('-t', '--title',
        action='store_true', dest='usetitle', help='use title in file name', default=False)
filesystem.add_option('-l', '--literal',
        action='store_true', dest='useliteral', help='use literal title in file name', default=False)
filesystem.add_option('-A', '--auto-number',
        action='store_true', dest='autonumber', help='number downloaded files starting from 00000', default=False)
filesystem.add_option('-o', '--output',
        dest='outtmpl', metavar='TEMPLATE', help='output filename template')
filesystem.add_option('-a', '--batch-file',
        dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
filesystem.add_option('-w', '--no-overwrites',
        action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
filesystem.add_option('-c', '--continue',
        action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
filesystem.add_option('--cookies',
        dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
parser.add_option_group(filesystem)

(opts, args) = parser.parse_args()
# Open appropriate CookieJar: anonymous in-memory jar unless --cookies gave
# a file, in which case a Mozilla-format jar tied to that file is used.
if opts.cookiefile is None:
    jar = cookielib.CookieJar()
    # NOTE(review): the "else:"/"try:" introducing this branch and the
    # jar.load() call are not visible in this excerpt.
    jar = cookielib.MozillaCookieJar(opts.cookiefile)
    if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
    except (IOError, OSError), err:
        sys.exit(u'ERROR: unable to open cookie file')

# General configuration
cookie_processor = urllib2.HTTPCookieProcessor(jar)
# NOTE(review): the second install_opener() replaces the first, so the
# explicit ProxyHandler opener is discarded (build_opener installs a default
# ProxyHandler anyway) — confirm this is intentional.
urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
urllib2.install_opener(urllib2.build_opener(cookie_processor))
socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

# Batch file verification
if opts.batchfile is not None:
    if opts.batchfile == '-':
        # NOTE(review): the stdin branch body and the else:/try: wrapping the
        # open() below are not visible in this excerpt.
    batchfd = open(opts.batchfile, 'r')
    batchurls = batchfd.readlines()
    batchurls = [x.strip() for x in batchurls]
    # Drop blank lines and comment lines starting with '#', '/' or ';'.
    batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
    # NOTE(review): this exit belongs to an exception handler whose
    # "except" line is not visible in this excerpt.
    sys.exit(u'ERROR: batch file could not be read')
all_urls = batchurls + args
# Conflicting, missing and erroneous options
if opts.bestquality:
    print >>sys.stderr, u'\nWARNING: -b/--best-quality IS DEPRECATED AS IT IS THE DEFAULT BEHAVIOR NOW\n'
if opts.usenetrc and (opts.username is not None or opts.password is not None):
    parser.error(u'using .netrc conflicts with giving username/password')
if opts.password is not None and opts.username is None:
    parser.error(u'account username missing')
if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
    parser.error(u'using output template conflicts with using title, literal title or auto number')
if opts.usetitle and opts.useliteral:
    parser.error(u'using title conflicts with using literal title')
if opts.username is not None and opts.password is None:
    # Prompt interactively rather than requiring the password on the
    # command line.
    opts.password = getpass.getpass(u'Type account password and press return:')
if opts.ratelimit is not None:
    # parse_bytes understands suffixed values like 50k or 44.6m; None means
    # the string did not parse.
    numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
    if numeric_limit is None:
        parser.error(u'invalid rate limit specified')
    opts.ratelimit = numeric_limit
if opts.retries is not None:
    # NOTE(review): the "try:" matching the except below is not visible in
    # this excerpt.
    opts.retries = long(opts.retries)
    except (TypeError, ValueError), err:
        parser.error(u'invalid retry count specified')
# NOTE(review): the "try:" blocks and range-error branches for the playlist
# start/end checks below are only partially visible in this excerpt.
opts.playliststart = long(opts.playliststart)
if opts.playliststart <= 0:
    except (TypeError, ValueError), err:
        parser.error(u'invalid playlist start number specified')
opts.playlistend = long(opts.playlistend)
if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
    except (TypeError, ValueError), err:
        parser.error(u'invalid playlist end number specified')
# Information extractors: one instance per supported site/service.  The
# search, playlist and user extractors delegate individual videos to the
# site extractor passed to their constructors.
youtube_ie = YoutubeIE()
metacafe_ie = MetacafeIE(youtube_ie)
dailymotion_ie = DailymotionIE()
youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
youtube_user_ie = YoutubeUserIE(youtube_ie)
youtube_search_ie = YoutubeSearchIE(youtube_ie)
google_ie = GoogleIE()
google_search_ie = GoogleSearchIE(google_ie)
photobucket_ie = PhotobucketIE()
yahoo_ie = YahooIE()
yahoo_search_ie = YahooSearchIE(yahoo_ie)
deposit_files_ie = DepositFilesIE()
generic_ie = GenericIE()
# Central downloader object: owns the download parameters and the ordered
# chain of info extractors.
# NOTE(review): the closing "})" of this constructor call is not visible in
# this excerpt.
fd = FileDownloader({
    'usenetrc': opts.usenetrc,
    'username': opts.username,
    'password': opts.password,
    # Any of the --get-* flags implies quiet mode and simulation.
    'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
    'forceurl': opts.geturl,
    'forcetitle': opts.gettitle,
    'forcethumbnail': opts.getthumbnail,
    'forcedescription': opts.getdescription,
    'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
    'format': opts.format,
    'format_limit': opts.format_limit,
    # Output template: an explicit -o wins; otherwise the first matching
    # combination of --all-formats/title/literal/autonumber flags picks a
    # default pattern, falling back to plain '%(id)s.%(ext)s'.
    'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
        or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
        or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
        or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
        or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
        or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
        or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
        or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
        or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
        or u'%(id)s.%(ext)s'),
    'ignoreerrors': opts.ignoreerrors,
    'ratelimit': opts.ratelimit,
    'nooverwrites': opts.nooverwrites,
    'retries': opts.retries,
    'continuedl': opts.continue_dl,
    'noprogress': opts.noprogress,
    'playliststart': opts.playliststart,
    'playlistend': opts.playlistend,
    # Writing the video to stdout ('-o -') means logging must go to stderr.
    'logtostderr': opts.outtmpl == '-',
fd.add_info_extractor(youtube_search_ie)
fd.add_info_extractor(youtube_pl_ie)
fd.add_info_extractor(youtube_user_ie)
fd.add_info_extractor(metacafe_ie)
fd.add_info_extractor(dailymotion_ie)
fd.add_info_extractor(youtube_ie)
fd.add_info_extractor(google_ie)
fd.add_info_extractor(google_search_ie)
fd.add_info_extractor(photobucket_ie)
fd.add_info_extractor(yahoo_ie)
fd.add_info_extractor(yahoo_search_ie)
fd.add_info_extractor(deposit_files_ie)

# This must come last since it's the
# fallback if none of the others work
fd.add_info_extractor(generic_ie)

# Self-update when -U/--update was given.
if opts.update_self:
    update_self(fd, sys.argv[0])

# URLs are mandatory unless we were only asked to self-update.
if len(all_urls) < 1:
    if not opts.update_self:
        parser.error(u'you must provide at least one URL')
retcode = fd.download(all_urls)

# Dump cookie jar if requested
if opts.cookiefile is not None:
    # NOTE(review): the "try:" and jar.save() call matching the except below
    # are not visible in this excerpt.
    except (IOError, OSError), err:
        sys.exit(u'ERROR: unable to save cookie jar')

# NOTE(review): the enclosing "try:" and the final sys.exit(retcode), which
# these handlers belong to, are not visible in this excerpt.
except DownloadError:
except SameFileError:
    sys.exit(u'ERROR: fixed output name but more than one file to download')
except KeyboardInterrupt:
    sys.exit(u'\nERROR: Interrupted by user')