2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # Author: Benjamin Johnson
6 # Author: Vasyl' Vavrychuk
7 # License: Public domain code
# parse_qs was moved from the cgi module to the urlparse module recently.
# Prefer the new location; fall back to the deprecated cgi version on old
# interpreters instead of unconditionally shadowing the urlparse import.
try:
    from urlparse import parse_qs
except ImportError:
    from cgi import parse_qs
33 'User-Agent': 'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.12) Gecko/20101028 Firefox/3.6.12',
34 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
35 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
36 'Accept-Language': 'en-us,en;q=0.5',
# Characters considered "safe" in simplified titles: ASCII letters and digits,
# decoded to unicode so they can be interpolated into unicode regexps.
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    # NOTE(review): lines are missing from this copy -- the generator below
    # should wrap getpreferredencoding() in try/except and 'yield pref';
    # restore them from upstream before relying on this function.
    def yield_preferredencoding():
        pref = locale.getpreferredencoding()
    return yield_preferredencoding().next()
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a Unicode character.

    This function receives a match object and is intended to be used with
    the re.sub() function.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in htmlentitydefs.name2codepoint:
        return unichr(htmlentitydefs.name2codepoint[entity])

    # Unicode character reference (decimal or 0x-prefixed hexadecimal).
    mobj = re.match(ur'(?u)#(x?\d+)', entity)
    # NOTE(review): the 'if mobj is not None:' guard and the lines that set
    # 'base' (16 for hex, 10 otherwise) are missing from this copy.
    numstr = mobj.group(1)
    if numstr.startswith(u'x'):
        numstr = u'0%s' % numstr
    return unichr(long(numstr, base))

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
83 def sanitize_title(utitle):
84 """Sanitizes a video title so it could be used as part of a filename."""
85 utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
86 return utitle.replace(unicode(os.sep), u'%')
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function would do.

    It returns the tuple (stream, definitive_file_name).
    """
    # NOTE(review): the opening 'try:' and the special case for
    # filename == u'-' (stdout output) are missing from this copy.
    if sys.platform == 'win32':
        msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout, filename)
    stream = open(filename, open_mode)
    return (stream, filename)
    except (IOError, OSError), err:
        # In case of error, try to remove win32 forbidden chars
        filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)

        # An exception here should be caught in the caller
        stream = open(filename, open_mode)
        return (stream, filename)
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        # Both values are byte counts.
        self.downloaded = downloaded
        self.expected = expected
class FileDownloader(object):
    """File Downloader class.

    File downloader objects are the ones responsible of downloading the
    actual video file and writing it to disk if the user has requested
    it, among some other tasks. In most cases there should be one per
    program. As, given a video URL, the downloader doesn't know how to
    extract all the needed information, task that InfoExtractors do, it
    has to pass the URL to one of them.

    For this, file downloader objects have a method that allows
    InfoExtractors to be registered in a given order. When it is passed
    a URL, the file downloader handles it to the first InfoExtractor it
    finds that reports being able to handle it. The InfoExtractor extracts
    all the information about the video or videos the URL refers to, and
    asks the FileDownloader to process the video information, possibly
    downloading the video.

    File downloaders accept a lot of parameters. In order not to saturate
    the object constructor with arguments, it receives a dictionary of
    options instead. These options are available through the params
    attribute for the InfoExtractors to use. The FileDownloader also
    registers itself as the downloader in charge for the InfoExtractors
    that are added to it, so this is a "mutual registration".

    Available options:

    username: Username for authentication purposes.
    password: Password for authentication purposes.
    usenetrc: Use netrc for authentication instead.
    quiet: Do not print messages to stdout.
    forceurl: Force printing final URL.
    forcetitle: Force printing title.
    forcethumbnail: Force printing thumbnail URL.
    forcedescription: Force printing description.
    simulate: Do not download the video files.
    format: Video format code.
    format_limit: Highest quality format to try.
    outtmpl: Template for output names.
    ignoreerrors: Do not stop on download errors.
    ratelimit: Download speed limit, in bytes/sec.
    nooverwrites: Prevent overwriting files.
    retries: Number of times to retry for HTTP error 5xx
    continuedl: Try to continue downloads if possible.
    noprogress: Do not print the progress bar.
    playliststart: Playlist item to start at.
    playlistend: Playlist item to end at.
    logtostderr: Log messages to stderr instead of stdout.
    """

    # Exit code returned by download(); set to 1 after any tolerated error.
    _download_retcode = None
    # Ordinal assigned to each downloaded file (feeds %(autonumber)s).
    _num_downloads = None
    def __init__(self, params):
        """Create a FileDownloader object with the given options."""
        # NOTE(review): the lines initializing self._ies, self._pps and
        # self.params are missing from this copy -- restore them upstream.
        self._download_retcode = 0
        self._num_downloads = 0
        # Progress/status messages go to stderr when 'logtostderr' is set.
        self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
    def pmkdir(filename):
        """Create directory components in filename. Similar to Unix "mkdir -p"."""
        components = filename.split(os.sep)
        # Every ancestor path of the file, shortest first.
        aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
        aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
        for dir in aggregate:
            if not os.path.exists(dir):
                # NOTE(review): the 'os.mkdir(dir)' body is missing from this copy.
239 def temp_name(filename):
240 """Returns a temporary filename for the given filename."""
241 return filename + '.part'
    def format_bytes(bytes):
        # Format a byte count as a human-readable string, e.g. '1.00M'.
        if type(bytes) is str:
            # NOTE(review): the string-to-number conversion body and the
            # zero-bytes special case are missing from this copy.
        exponent = long(math.log(bytes, 1024.0))
        suffix = 'bkMGTPEZY'[exponent]
        converted = float(bytes) / float(1024**exponent)
        return '%.2f%s' % (converted, suffix)
    def calc_percent(byte_counter, data_len):
        # Render progress as a fixed-width percentage string.
        # NOTE(review): the 'data_len is None' fallback is missing from this copy.
        return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
    def calc_eta(start, now, total, current):
        # Estimate remaining time, formatted as MM:SS.
        # NOTE(review): 'dif = now - start', the '--:--' fallbacks and the
        # overflow guard are missing from this copy.
        if current == 0 or dif < 0.001: # One millisecond
        rate = float(current) / dif
        eta = long((float(total) - float(current)) / rate)
        (eta_mins, eta_secs) = divmod(eta, 60)
        return '%02d:%02d' % (eta_mins, eta_secs)
    def calc_speed(start, now, bytes):
        # Average download speed, right-aligned in 10 columns.
        # NOTE(review): 'dif = now - start' is missing from this copy.
        if bytes == 0 or dif < 0.001: # One millisecond
            return '%10s' % '---b/s'
        return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
    def best_block_size(elapsed_time, bytes):
        # Pick the next read size based on how fast the last block arrived,
        # clamped to [bytes/2, min(bytes*2, 4MB)].
        new_min = max(bytes / 2.0, 1.0)
        new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
        if elapsed_time < 0.001:
            # NOTE(review): 'return long(new_max)' is missing from this copy.
        rate = bytes / elapsed_time
        # NOTE(review): the final rate/new_min/new_max comparisons and
        # returns are missing from this copy.
    def parse_bytes(bytestr):
        """Parse a string indicating a byte quantity into a long integer."""
        matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
        # NOTE(review): the 'matchobj is None' guard is missing from this copy.
        number = float(matchobj.group(1))
        # Empty suffix group maps to 'b' via lower(), i.e. multiplier 1024**0.
        multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
        return long(round(number * multiplier))
    def add_info_extractor(self, ie):
        """Add an InfoExtractor object to the end of the list."""
        # NOTE(review): 'self._ies.append(ie)' is missing from this copy.
        ie.set_downloader(self)
    def add_post_processor(self, pp):
        """Add a PostProcessor object to the end of the chain."""
        # NOTE(review): 'self._pps.append(pp)' is missing from this copy.
        pp.set_downloader(self)
    def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
        """Print message to stdout if not in quiet mode."""
        # NOTE(review): the enclosing 'try:' is missing from this copy.
        if not self.params.get('quiet', False):
            terminator = [u'\n', u''][skip_eol]
            print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
            self._screen_file.flush()
        except (UnicodeEncodeError), err:
            if not ignore_encoding_errors:
                # NOTE(review): the re-'raise' is missing from this copy.
328 def to_stderr(self, message):
329 """Print message to stderr."""
330 print >>sys.stderr, message.encode(preferredencoding())
332 def fixed_template(self):
333 """Checks if the output template is fixed."""
334 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
    def trouble(self, message=None):
        """Determine action to take when a download problem appears.

        Depending on if the downloader has been configured to ignore
        download errors or not, this method may throw an exception or
        not when errors are found, after printing the message.
        """
        if message is not None:
            self.to_stderr(message)
        if not self.params.get('ignoreerrors', False):
            raise DownloadError(message)
        # Errors were ignored; remember a failure happened for the exit code.
        self._download_retcode = 1
    def slow_down(self, start_time, byte_counter):
        """Sleep if the download speed is over the rate limit."""
        rate_limit = self.params.get('ratelimit', None)
        if rate_limit is None or byte_counter == 0:
            # NOTE(review): the early 'return' is missing from this copy.
        # NOTE(review): 'now = time.time()' and the elapsed <= 0 guard are
        # missing from this copy.
        elapsed = now - start_time
        speed = float(byte_counter) / elapsed
        if speed > rate_limit:
            # Sleep long enough that the average speed drops to the limit.
            time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
    def try_rename(self, old_filename, new_filename):
        """Rename old_filename to new_filename, reporting (not raising) failures."""
        # NOTE(review): the enclosing 'try:' is missing from this copy.
        os.rename(old_filename, new_filename)
        except (IOError, OSError), err:
            self.trouble(u'ERROR: unable to rename file')
368 def report_destination(self, filename):
369 """Report destination filename."""
370 self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
    def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
        """Report download progress."""
        if self.params.get('noprogress', False):
            # NOTE(review): the early 'return' is missing from this copy.
        # '\r' rewrites the progress line in place; skip_eol keeps it open.
        self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
                (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
379 def report_resuming_byte(self, resume_len):
380 """Report attempt to resume at given byte."""
381 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
383 def report_retry(self, count, retries):
384 """Report retry in case of HTTP error 5xx"""
385 self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
    def report_file_already_downloaded(self, file_name):
        """Report file has already been fully downloaded."""
        # NOTE(review): the enclosing 'try:' is missing from this copy.
        self.to_screen(u'[download] %s has already been downloaded' % file_name)
        except (UnicodeEncodeError), err:
            # Fall back to a message without the (unencodable) filename.
            self.to_screen(u'[download] The file has already been downloaded')
394 def report_unable_to_resume(self):
395 """Report it was impossible to resume download."""
396 self.to_screen(u'[download] Unable to resume')
    def report_finish(self):
        """Report download finished."""
        if self.params.get('noprogress', False):
            self.to_screen(u'[download] Download completed')
        # NOTE(review): the 'else:' branch that terminates the progress line
        # is missing from this copy.
405 def increment_downloads(self):
406 """Increment the ordinal that assigns a number to each file."""
407 self._num_downloads += 1
    def process_info(self, info_dict):
        """Process a single dictionary returned by an InfoExtractor."""
        # Do nothing else if in simulate mode
        if self.params.get('simulate', False):
            # Forced printings; the 'return' that ends simulate mode is part
            # of the lines missing from this copy.
            if self.params.get('forcetitle', False):
                print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
            if self.params.get('forceurl', False):
                print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
            if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
                print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
            if self.params.get('forcedescription', False) and 'description' in info_dict:
                print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')

        # Expand the output template.
        # NOTE(review): the 'try:' for template expansion is missing here.
        template_dict = dict(info_dict)
        template_dict['epoch'] = unicode(long(time.time()))
        template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
        filename = self.params['outtmpl'] % template_dict
        except (ValueError, KeyError), err:
            self.trouble(u'ERROR: invalid system charset or erroneous output template')
            # NOTE(review): a 'return' is missing after the error above.
        if self.params.get('nooverwrites', False) and os.path.exists(filename):
            self.to_stderr(u'WARNING: file exists and will be skipped')
            # NOTE(review): a 'return' is missing here.

        # NOTE(review): 'try:' is missing here.
        self.pmkdir(filename)
        except (OSError, IOError), err:
            self.trouble(u'ERROR: unable to create directories: %s' % str(err))
            # NOTE(review): a 'return' is missing here.

        # NOTE(review): 'try:' is missing here.
        success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
        except (OSError, IOError), err:
            raise UnavailableVideoError
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self.trouble(u'ERROR: unable to download video data: %s' % str(err))
            # NOTE(review): a 'return' is missing here.
        except (ContentTooShortError, ), err:
            self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
            # NOTE(review): a 'return' is missing here.

        # NOTE(review): the 'if success:' check and the 'try:' around
        # postprocessing are missing from this copy.
        self.post_process(filename, info_dict)
        except (PostProcessingError), err:
            self.trouble(u'ERROR: postprocessing: %s' % str(err))
    def download(self, url_list):
        """Download a given list of URLs."""
        # A fixed template can only name one file, so reject multiple URLs.
        if len(url_list) > 1 and self.fixed_template():
            raise SameFileError(self.params['outtmpl'])

        # NOTE(review): the 'for url in url_list:' / 'for ie in self._ies:'
        # loop headers are missing from this copy.
        suitable_found = False
        # Go to next InfoExtractor if not suitable
        if not ie.suitable(url):
            # NOTE(review): 'continue' is missing here.

        # Suitable InfoExtractor found
        suitable_found = True

        # Extract information from URL and process it
        # NOTE(review): the 'ie.extract(url)' call is missing here.

        # Suitable InfoExtractor had been found; go to next URL
        # NOTE(review): 'break' is missing here.

        if not suitable_found:
            self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)

        return self._download_retcode
    def post_process(self, filename, ie_info):
        """Run the postprocessing chain on the given file."""
        # NOTE(review): 'info = dict(ie_info)' and the loop invoking each
        # PostProcessor's run() are missing from this copy.
        info['filepath'] = filename
    def _download_with_rtmpdump(self, filename, url, player_url):
        # Download an rtmp:// URL via the external rtmpdump tool, writing to
        # a .part temporary file and renaming it into place on success.
        self.report_destination(filename)
        tmpfilename = self.temp_name(filename)

        # Check for rtmpdump first
        # NOTE(review): the enclosing 'try:' is missing from this copy.
        subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
        except (OSError, IOError):
            self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
            # NOTE(review): 'return False' is missing here.

        # Download using rtmpdump. rtmpdump returns exit code 2 when
        # the connection was interrupted and resuming appears to be
        # possible. This is part of rtmpdump's normal usage, AFAIK.
        basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
        retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
        while retval == 2 or retval == 1:
            prevsize = os.path.getsize(tmpfilename)
            self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
            time.sleep(5.0) # This seems to be needed
            retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
            cursize = os.path.getsize(tmpfilename)
            if prevsize == cursize and retval == 1:
                # NOTE(review): 'break' (stalled download) is missing here.
        # NOTE(review): the 'if retval == 0:' success check is missing; the
        # trouble() call below belongs to the failure ('else') path.
        self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
        self.try_rename(tmpfilename, filename)
        self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
        # NOTE(review): the 'return True' / 'return False' lines are missing.
    def _do_download(self, filename, url, player_url):
        # Download url into filename over HTTP (rtmp URLs are delegated to
        # rtmpdump), with resume and retry support.
        # Check file already present
        if self.params.get('continuedl', False) and os.path.isfile(filename):
            self.report_file_already_downloaded(filename)
            # NOTE(review): 'return True' is missing here.

        # Attempt to download using rtmpdump
        if url.startswith('rtmp'):
            return self._download_with_rtmpdump(filename, url, player_url)

        tmpfilename = self.temp_name(filename)
        # NOTE(review): initialization of stream/open_mode is missing here.
        basic_request = urllib2.Request(url, None, std_headers)
        request = urllib2.Request(url, None, std_headers)

        # Establish possible resume length
        if os.path.isfile(tmpfilename):
            resume_len = os.path.getsize(tmpfilename)
        # NOTE(review): 'else: resume_len = 0' is missing here.

        # Request parameters in case of being able to resume
        if self.params.get('continuedl', False) and resume_len != 0:
            self.report_resuming_byte(resume_len)
            request.add_header('Range','bytes=%d-' % resume_len)
            # NOTE(review): setting append open mode is missing here.

        # NOTE(review): 'count = 0' is missing here.
        retries = self.params.get('retries', 0)
        while count <= retries:
            # Establish connection
            # NOTE(review): 'try:' and the success 'break' are missing here.
            data = urllib2.urlopen(request)
            except (urllib2.HTTPError, ), err:
                if (err.code < 500 or err.code >= 600) and err.code != 416:
                    # Unexpected HTTP error
                    # NOTE(review): 'raise' is missing here.
                elif err.code == 416:
                    # Unable to resume (requested range not satisfiable)
                    # NOTE(review): 'try:' is missing here.
                    # Open the connection again without the range header
                    data = urllib2.urlopen(basic_request)
                    content_length = data.info()['Content-Length']
                    except (urllib2.HTTPError, ), err:
                        if err.code < 500 or err.code >= 600:
                            # NOTE(review): 'raise' is missing here.
                    # Examine the reported length
                    if (content_length is not None and
                            (resume_len - 100 < long(content_length) < resume_len + 100)):
                        # The file had already been fully downloaded.
                        # Explanation to the above condition: in issue #175 it was revealed that
                        # YouTube sometimes adds or removes a few bytes from the end of the file,
                        # changing the file size slightly and causing problems for some users. So
                        # I decided to implement a suggested change and consider the file
                        # completely downloaded if the file size differs less than 100 bytes from
                        # the one in the hard drive.
                        self.report_file_already_downloaded(filename)
                        self.try_rename(tmpfilename, filename)
                        # NOTE(review): 'return True' and the 'else:' header
                        # for the restart path below are missing here.
                    # The length does not match, we start the download over
                    self.report_unable_to_resume()
            # Retry
            # NOTE(review): the count increment and loop bookkeeping are
            # missing from this copy.
            self.report_retry(count, retries)

        # NOTE(review): the 'if count > retries:' guard is missing here.
        self.trouble(u'ERROR: giving up after %s retries' % retries)
        # NOTE(review): 'return False' is missing here.

        data_len = data.info().get('Content-length', None)
        data_len_str = self.format_bytes(data_len)
        # NOTE(review): byte_counter/block_size initialization and the
        # 'while True:' read-loop header are missing from this copy.
        data_block = data.read(block_size)
        # NOTE(review): the before/after timing lines are missing here.
        data_block_len = len(data_block)
        if data_block_len == 0:
            # NOTE(review): 'break' is missing here.
        byte_counter += data_block_len

        # Open file just in time
        # NOTE(review): the 'if stream is None:' guard and 'try:' are missing.
        (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
        self.report_destination(filename)
        except (OSError, IOError), err:
            self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
            # NOTE(review): 'return False' is missing here.
        # NOTE(review): 'try:' is missing here.
        stream.write(data_block)
        except (IOError, OSError), err:
            self.trouble(u'\nERROR: unable to write data: %s' % str(err))
            # NOTE(review): 'return False' is missing here.
        # Adapt the read size to the observed throughput.
        block_size = self.best_block_size(after - before, data_block_len)

        # Progress message
        percent_str = self.calc_percent(byte_counter, data_len)
        eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
        speed_str = self.calc_speed(start, time.time(), byte_counter)
        self.report_progress(percent_str, data_len_str, speed_str, eta_str)

        # Apply rate limit
        self.slow_down(start, byte_counter)

        # NOTE(review): 'stream.close()' and 'self.report_finish()' are
        # missing from this copy.
        if data_len is not None and str(byte_counter) != data_len:
            raise ContentTooShortError(byte_counter, long(data_len))
        self.try_rename(tmpfilename, filename)
        # NOTE(review): 'return True' is missing here.
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information from the video (or videos) the URL refers to. This
    information includes the real video URL, the video title and simplified
    title, author and others. The information is stored in a dictionary
    which is then passed to the FileDownloader. The FileDownloader
    processes this information possibly downloading the video to the file
    system, among other possible outcomes. The dictionaries must include
    the following fields:

    id: Video identifier.
    url: Final video URL.
    uploader: Nickname of the video uploader.
    title: Literal title.
    stitle: Simplified title.
    ext: Video filename extension.
    format: Video format.
    player_url: SWF Player URL (may be None).

    The following fields are optional. Their primary purpose is to allow
    youtube-dl to serve as the backend for a video search function, such
    as the one in youtube2mp3. They are only used when their respective
    forced printing functions are called:

    thumbnail: Full URL to a video thumbnail image.
    description: One-line video description.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods, as well as the suitable() static method.
    Probably, they should also be instantiated and added to the main
    downloader.
    """
    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        # NOTE(review): 'self._ready = False' is missing from this copy.
        self.set_downloader(downloader)
        # NOTE(review): the '@staticmethod def suitable(url):' header for the
        # docstring below is missing from this copy.
        """Receives a URL and returns True if suitable for this IE."""

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # NOTE(review): the 'if not self._ready:' guard and the line setting
        # self._ready = True are missing from this copy.
        self._real_initialize()
    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        # NOTE(review): the 'self.initialize()' call is missing from this copy.
        return self._real_extract(url)
709 def set_downloader(self, downloader):
710 """Sets the downloader for this IE."""
711 self._downloader = downloader
    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        # NOTE(review): the 'pass' body is missing from this copy.
    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        # NOTE(review): the 'pass' body is missing from this copy.
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Matches youtu.be short links and the various watch-page URL shapes;
    # group 2 captures the video id.
    _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/(?:(?:v/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))))?([0-9A-Za-z_-]+)(?(1).+)?$'
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13']
    _video_extensions = {
        # NOTE(review): most entries of this format->extension mapping, and
        # the closing brace, are missing from this copy.
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
    # NOTE(review): the '@staticmethod def suitable(url):' header for the
    # return statement below is missing from this copy.
        return (re.match(YoutubeIE._VALID_URL, url) is not None)
746 def report_lang(self):
747 """Report attempt to set language."""
748 self._downloader.to_screen(u'[youtube] Setting language')
750 def report_login(self):
751 """Report attempt to log in."""
752 self._downloader.to_screen(u'[youtube] Logging in')
754 def report_age_confirmation(self):
755 """Report attempt to confirm age."""
756 self._downloader.to_screen(u'[youtube] Confirming age')
758 def report_video_webpage_download(self, video_id):
759 """Report attempt to download video webpage."""
760 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
762 def report_video_info_webpage_download(self, video_id):
763 """Report attempt to download video info webpage."""
764 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
766 def report_information_extraction(self, video_id):
767 """Report attempt to extract video information."""
768 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
770 def report_unavailable_format(self, video_id, format):
771 """Report extracted video URL."""
772 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
774 def report_rtmp_download(self):
775 """Indicate the download will use the RTMP protocol."""
776 self._downloader.to_screen(u'[youtube] RTMP download detected')
    def _real_initialize(self):
        # Set language, log in and confirm age on youtube.com so that
        # restricted videos can be extracted.
        if self._downloader is None:
            # NOTE(review): the early 'return' is missing from this copy.

        # NOTE(review): the 'username = None' / 'password = None' defaults
        # are missing from this copy.
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            # NOTE(review): the 'try:' and the branch unpacking the netrc
            # (login, account, password) tuple are missing from this copy.
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
                # NOTE(review): a 'return' is missing here.

        # Set language
        request = urllib2.Request(self._LANG_URL, None, std_headers)
        # NOTE(review): 'try:' and the self.report_lang() call are missing.
        urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
            # NOTE(review): a 'return' is missing here.

        # No authentication to be performed
        # NOTE(review): the 'if username is None: return' guard is missing.

        # Log in
        # NOTE(review): the 'login_form = {' opener, the 'next' entry and the
        # closing brace are missing from this copy.
        'current_form': 'loginForm',
        'action_login': 'Log In',
        'username': username,
        'password': password,
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
        # NOTE(review): 'try:' and the self.report_login() call are missing.
        login_results = urllib2.urlopen(request).read()
        if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
            self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
            # NOTE(review): a 'return' is missing here.
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
            # NOTE(review): a 'return' is missing here.

        # Confirm age
        # NOTE(review): the 'age_form = {' opener and further entries are
        # missing from this copy.
        'action_confirm': 'Confirm',
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
        # NOTE(review): 'try:' is missing here.
        self.report_age_confirmation()
        age_results = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
            # NOTE(review): a 'return' is missing here.
    def _real_extract(self, url):
        # Extract all downloadable information for a single YouTube URL.
        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the 'if mobj is None:' guard and its 'return' are
        # missing from this copy.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(2)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id, None, std_headers)
        # NOTE(review): 'try:' is missing here.
        video_webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
            # NOTE(review): a 'return' is missing here.

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        # NOTE(review): the 'if mobj is not None:' guard and the
        # 'player_url = None' fallback are missing from this copy.
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Get video info
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = urllib2.Request(video_info_url, None, std_headers)
            # NOTE(review): 'try:' is missing here.
            video_info_webpage = urllib2.urlopen(request).read()
            video_info = parse_qs(video_info_webpage)
            if 'token' in video_info:
                # NOTE(review): 'break' is missing here.
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
                # NOTE(review): a 'return' is missing here.
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
            # NOTE(review): the 'else:' header for the line below and the
            # final 'return' are missing from this copy.
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            # NOTE(review): a 'return' is missing here.
        video_uploader = urllib.unquote_plus(video_info['author'][0])

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            # NOTE(review): a 'return' is missing here.
        video_title = urllib.unquote_plus(video_info['title'][0])
        video_title = video_title.decode('utf-8')
        video_title = sanitize_title(video_title)

        # simplified title
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
        simple_title = simple_title.strip(ur'_')

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            # NOTE(review): the empty-string fallback assignment is missing.
        else: # don't panic if we can't find it
            video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

        # upload date (best effort; the 'upload_date = None' default and the
        # 'if mobj is not None:' guard are part of the missing lines)
        mobj = re.search(r'id="eow-date".*?>(.*?)</span>', video_webpage, re.DOTALL)
        upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
        format_expressions = ['%d %B %Y', '%B %d %Y']
        for expression in format_expressions:
            # NOTE(review): the 'try:' / 'except ValueError: pass' wrapping
            # is missing from this copy.
            upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

        # description
        video_description = 'No description available.'
        if self._downloader.params.get('forcedescription', False):
            mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
            # NOTE(review): the 'if mobj is not None:' guard is missing here.
            video_description = mobj.group(1)

        # token
        video_token = urllib.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        requested_format = self._downloader.params.get('format', None)
        get_video_template = 'http://www.youtube.com/get_video?video_id=%s&t=%s&eurl=&el=&ps=&asv=&fmt=%%s' % (video_id, video_token)

        if 'fmt_url_map' in video_info:
            url_map = dict(tuple(pair.split('|')) for pair in video_info['fmt_url_map'][0].split(','))
            format_limit = self._downloader.params.get('format_limit', None)
            if format_limit is not None and format_limit in self._available_formats:
                format_list = self._available_formats[self._available_formats.index(format_limit):]
            # NOTE(review): the 'else:' header for the line below is missing.
                format_list = self._available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                # NOTE(review): a 'return' is missing here.
            if requested_format is None:
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif requested_format == '-1':
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            # NOTE(review): the 'else:' header for the line below is missing.
                video_url_list = [(requested_format, get_video_template % requested_format)] # Specific format

        elif 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]

        # NOTE(review): the 'else:' header and the 'return' around the error
        # below are missing from this copy.
            self._downloader.trouble(u'ERROR: no fmt_url_map or conn information found in video info')

        for format_param, video_real_url in video_url_list:
            # At this point we have a new video
            self._downloader.increment_downloads()

            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            # Find the video URL in fmt_url_map or conn parameters
            # NOTE(review): the 'try:' wrapping process_info is missing here.
            # Process video information
            self._downloader.process_info({
                'id': video_id.decode('utf-8'),
                'url': video_real_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date': upload_date,
                'title': video_title,
                'stitle': simple_title,
                'ext': video_extension.decode('utf-8'),
                'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                'thumbnail': video_thumbnail.decode('utf-8'),
                'description': video_description.decode('utf-8'),
                'player_url': player_url,
            # NOTE(review): the closing '})' is missing from this copy.
            except UnavailableVideoError, err:
                self._downloader.trouble(u'ERROR: unable to download video (format may not be available)')
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    # Group 1: video id (may be 'yt-<id>' for YouTube-hosted videos);
    # group 2: URL-embedded simplified title.
    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    # NOTE(review): the '_youtube_ie = None' class attribute appears to be
    # missing from this copy.
1007 def __init__(self, youtube_ie, downloader=None):
1008 InfoExtractor.__init__(self, downloader)
1009 self._youtube_ie = youtube_ie
    # NOTE(review): the '@staticmethod def suitable(url):' header for the
    # return statement below is missing from this copy.
        return (re.match(MetacafeIE._VALID_URL, url) is not None)
1015 def report_disclaimer(self):
1016 """Report disclaimer retrieval."""
1017 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1019 def report_age_confirmation(self):
1020 """Report attempt to confirm age."""
1021 self._downloader.to_screen(u'[metacafe] Confirming age')
1023 def report_download_webpage(self, video_id):
1024 """Report webpage download."""
1025 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1027 def report_extraction(self, video_id):
1028 """Report information extraction."""
1029 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1031 def _real_initialize(self):
1032 # Retrieve disclaimer
1033 request = urllib2.Request(self._DISCLAIMER, None, std_headers)
1035 self.report_disclaimer()
1036 disclaimer = urllib2.urlopen(request).read()
1037 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1038 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1044 'submit': "Continue - I'm over 18",
1046 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
1048 self.report_age_confirmation()
1049 disclaimer = urllib2.urlopen(request).read()
1050 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1051 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1054 def _real_extract(self, url):
1055 # Extract id and simplified title from URL
1056 mobj = re.match(self._VALID_URL, url)
1058 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1061 video_id = mobj.group(1)
1063 # Check if video comes from YouTube
1064 mobj2 = re.match(r'^yt-(.*)$', video_id)
1065 if mobj2 is not None:
1066 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1069 # At this point we have a new video
1070 self._downloader.increment_downloads()
1072 simple_title = mobj.group(2).decode('utf-8')
1074 # Retrieve video webpage to extract further information
1075 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1077 self.report_download_webpage(video_id)
1078 webpage = urllib2.urlopen(request).read()
1079 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1080 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1083 # Extract URL, uploader and title from webpage
1084 self.report_extraction(video_id)
1085 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1086 if mobj is not None:
1087 mediaURL = urllib.unquote(mobj.group(1))
1088 video_extension = mediaURL[-3:]
1090 # Extract gdaKey if available
1091 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1093 video_url = mediaURL
1095 gdaKey = mobj.group(1)
1096 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1098 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1100 self._downloader.trouble(u'ERROR: unable to extract media URL')
1102 vardict = parse_qs(mobj.group(1))
1103 if 'mediaData' not in vardict:
1104 self._downloader.trouble(u'ERROR: unable to extract media URL')
1106 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1108 self._downloader.trouble(u'ERROR: unable to extract media URL')
1110 mediaURL = mobj.group(1).replace('\\/', '/')
1111 video_extension = mediaURL[-3:]
1112 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1114 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1116 self._downloader.trouble(u'ERROR: unable to extract title')
1118 video_title = mobj.group(1).decode('utf-8')
1119 video_title = sanitize_title(video_title)
1121 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1123 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1125 video_uploader = mobj.group(1)
1128 # Process video information
1129 self._downloader.process_info({
1130 'id': video_id.decode('utf-8'),
1131 'url': video_url.decode('utf-8'),
1132 'uploader': video_uploader.decode('utf-8'),
1133 'upload_date': u'NA',
1134 'title': video_title,
1135 'stitle': simple_title,
1136 'ext': video_extension.decode('utf-8'),
1140 except UnavailableVideoError:
1141 self._downloader.trouble(u'ERROR: unable to download video')
# Information extractor for Dailymotion watch pages: scrapes the page for the
# flash "video" variable (the media URL), the <title> and the uploader name.
# NOTE(review): leading numbers are original line numbers from this listing;
# gaps mean elided lines (`try:`, `if mobj is None:`, `return`), so the
# excerpt is not runnable as-is.
1144 class DailymotionIE(InfoExtractor):
1145 """Information Extractor for Dailymotion"""
1147 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1149 def __init__(self, downloader=None):
1150 InfoExtractor.__init__(self, downloader)
# (suitable() — its `def` line was elided from this listing.)
1154 return (re.match(DailymotionIE._VALID_URL, url) is not None)
1156 def report_download_webpage(self, video_id):
1157 """Report webpage download."""
1158 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1160 def report_extraction(self, video_id):
1161 """Report information extraction."""
1162 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
# No site-wide setup needed (body elided; presumably just `return` — verify).
1164 def _real_initialize(self):
1167 def _real_extract(self, url):
1168 # Extract id and simplified title from URL
1169 mobj = re.match(self._VALID_URL, url)
1171 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1174 # At this point we have a new video
1175 self._downloader.increment_downloads()
1176 video_id = mobj.group(1)
# Second URL group is the slug, used as the "simple" (filename-safe) title.
1178 simple_title = mobj.group(2).decode('utf-8')
1179 video_extension = 'flv'
1181 # Retrieve video webpage to extract further information
1182 request = urllib2.Request(url)
1184 self.report_download_webpage(video_id)
1185 webpage = urllib2.urlopen(request).read()
1186 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
# NOTE(review): "unable retrieve" — missing "to" (runtime string, untouched).
1187 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1190 # Extract URL, uploader and title from webpage
1191 self.report_extraction(video_id)
# The player is fed via addVariable("video", "<percent-encoded URL>").
1192 mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
1194 self._downloader.trouble(u'ERROR: unable to extract media URL')
1196 mediaURL = urllib.unquote(mobj.group(1))
1198 # if needed add http://www.dailymotion.com/ if relative URL
1200 video_url = mediaURL
1202 # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
1203 mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
1205 self._downloader.trouble(u'ERROR: unable to extract title')
1207 video_title = mobj.group(1).decode('utf-8')
1208 video_title = sanitize_title(video_title)
1210 mobj = re.search(r'(?im)<div class="dmco_html owner">.*?<a class="name" href="/.+?">(.+?)</a>', webpage)
1212 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1214 video_uploader = mobj.group(1)
1217 # Process video information
1218 self._downloader.process_info({
1219 'id': video_id.decode('utf-8'),
1220 'url': video_url.decode('utf-8'),
1221 'uploader': video_uploader.decode('utf-8'),
1222 'upload_date': u'NA',
1223 'title': video_title,
1224 'stitle': simple_title,
1225 'ext': video_extension.decode('utf-8'),
1229 except UnavailableVideoError:
1230 self._downloader.trouble(u'ERROR: unable to download video')
# Information extractor for Google Video (video.google.com) play pages.
# Prefers the mp4 download_url; falls back to the escaped flv videoUrl.
# NOTE(review): leading numbers are original line numbers from this listing;
# gaps mean elided lines (`try:`, `if mobj is None:`, `return`), so the
# excerpt is not runnable as-is.
1232 class GoogleIE(InfoExtractor):
1233 """Information extractor for video.google.com."""
1235 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1237 def __init__(self, downloader=None):
1238 InfoExtractor.__init__(self, downloader)
# (suitable() — its `def` line was elided from this listing.)
1242 return (re.match(GoogleIE._VALID_URL, url) is not None)
1244 def report_download_webpage(self, video_id):
1245 """Report webpage download."""
1246 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1248 def report_extraction(self, video_id):
1249 """Report information extraction."""
1250 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
# No site-wide setup needed (body elided; presumably just `return` — verify).
1252 def _real_initialize(self):
1255 def _real_extract(self, url):
1256 # Extract id from URL
1257 mobj = re.match(self._VALID_URL, url)
1259 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1262 # At this point we have a new video
1263 self._downloader.increment_downloads()
1264 video_id = mobj.group(1)
1266 video_extension = 'mp4'
1268 # Retrieve video webpage to extract further information
1269 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1271 self.report_download_webpage(video_id)
1272 webpage = urllib2.urlopen(request).read()
1273 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1274 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1277 # Extract URL, uploader, and title from webpage
1278 self.report_extraction(video_id)
# Preferred: direct mp4 download_url. If absent, fall back to flv below.
1279 mobj = re.search(r"download_url:'([^']+)'", webpage)
1281 video_extension = 'flv'
# Fallback flv URL is embedded with \x3d / \x26 escapes for '=' and '&'.
1282 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1284 self._downloader.trouble(u'ERROR: unable to extract media URL')
1286 mediaURL = urllib.unquote(mobj.group(1))
1287 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1288 mediaURL = mediaURL.replace('\\x26', '\x26')
1290 video_url = mediaURL
1292 mobj = re.search(r'<title>(.*)</title>', webpage)
1294 self._downloader.trouble(u'ERROR: unable to extract title')
1296 video_title = mobj.group(1).decode('utf-8')
1297 video_title = sanitize_title(video_title)
# Collapse any character outside the "simple title" alphabet into '_'.
1298 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1300 # Extract video description
1301 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1303 self._downloader.trouble(u'ERROR: unable to extract video description')
1305 video_description = mobj.group(1).decode('utf-8')
1306 if not video_description:
1307 video_description = 'No description available.'
1309 # Extract video thumbnail
# Thumbnail requires a second request (a search page), so only fetch it when
# the user asked for it via --get-thumbnail.
1310 if self._downloader.params.get('forcethumbnail', False):
1311 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1313 webpage = urllib2.urlopen(request).read()
1314 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1315 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1317 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1319 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1321 video_thumbnail = mobj.group(1)
1322 else: # we need something to pass to process_info
1323 video_thumbnail = ''
1327 # Process video information
1328 self._downloader.process_info({
1329 'id': video_id.decode('utf-8'),
1330 'url': video_url.decode('utf-8'),
1332 'upload_date': u'NA',
1333 'title': video_title,
1334 'stitle': simple_title,
1335 'ext': video_extension.decode('utf-8'),
1339 except UnavailableVideoError:
1340 self._downloader.trouble(u'ERROR: unable to download video')
# Information extractor for photobucket.com flv media pages.  Title and
# uploader both come from a single <title> regex (groups 1 and 2).
# NOTE(review): leading numbers are original line numbers from this listing;
# gaps mean elided lines (`try:`, `if mobj is None:`, `return`), so the
# excerpt is not runnable as-is.
1343 class PhotobucketIE(InfoExtractor):
1344 """Information extractor for photobucket.com."""
1346 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1348 def __init__(self, downloader=None):
1349 InfoExtractor.__init__(self, downloader)
# (suitable() — its `def` line was elided from this listing.)
1353 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1355 def report_download_webpage(self, video_id):
1356 """Report webpage download."""
1357 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1359 def report_extraction(self, video_id):
1360 """Report information extraction."""
1361 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
# No site-wide setup needed (body elided; presumably just `return` — verify).
1363 def _real_initialize(self):
1366 def _real_extract(self, url):
1367 # Extract id from URL
1368 mobj = re.match(self._VALID_URL, url)
1370 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1373 # At this point we have a new video
1374 self._downloader.increment_downloads()
1375 video_id = mobj.group(1)
1377 video_extension = 'flv'
1379 # Retrieve video webpage to extract further information
1380 request = urllib2.Request(url)
1382 self.report_download_webpage(video_id)
1383 webpage = urllib2.urlopen(request).read()
1384 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1385 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1388 # Extract URL, uploader, and title from webpage
1389 self.report_extraction(video_id)
1390 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1392 self._downloader.trouble(u'ERROR: unable to extract media URL')
1394 mediaURL = urllib.unquote(mobj.group(1))
1396 video_url = mediaURL
1398 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1400 self._downloader.trouble(u'ERROR: unable to extract title')
1402 video_title = mobj.group(1).decode('utf-8')
1403 video_title = sanitize_title(video_title)
# Collapse any character outside the "simple title" alphabet into '_'.
1404 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
# Uploader is the second group of the <title> regex above.
1406 video_uploader = mobj.group(2).decode('utf-8')
1409 # Process video information
1410 self._downloader.process_info({
1411 'id': video_id.decode('utf-8'),
1412 'url': video_url.decode('utf-8'),
1413 'uploader': video_uploader,
1414 'upload_date': u'NA',
1415 'title': video_title,
1416 'stitle': simple_title,
1417 'ext': video_extension.decode('utf-8'),
1421 except UnavailableVideoError:
1422 self._downloader.trouble(u'ERROR: unable to download video')
# Information extractor for Yahoo! Video.  Non-/watch/ URLs are rewritten to
# the canonical /watch/<vid>/<id> form via one recursive _real_extract call
# (new_video=False), then the watch page and a playlist XML are scraped.
# NOTE(review): leading numbers are original line numbers from this listing;
# gaps mean elided lines (`try:`, `if mobj is None:`, `return`), so the
# excerpt is not runnable as-is.
1425 class YahooIE(InfoExtractor):
1426 """Information extractor for video.yahoo.com."""
1428 # _VALID_URL matches all Yahoo! Video URLs
1429 # _VPAGE_URL matches only the extractable '/watch/' URLs
1430 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1431 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1433 def __init__(self, downloader=None):
1434 InfoExtractor.__init__(self, downloader)
# (suitable() — its `def` line was elided from this listing.)
1438 return (re.match(YahooIE._VALID_URL, url) is not None)
1440 def report_download_webpage(self, video_id):
1441 """Report webpage download."""
1442 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1444 def report_extraction(self, video_id):
1445 """Report information extraction."""
1446 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
# No site-wide setup needed (body elided; presumably just `return` — verify).
1448 def _real_initialize(self):
# new_video=False marks the recursive second pass after URL canonicalization.
1451 def _real_extract(self, url, new_video=True):
1452 # Extract ID from URL
1453 mobj = re.match(self._VALID_URL, url)
1455 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1458 # At this point we have a new video
1459 self._downloader.increment_downloads()
1460 video_id = mobj.group(2)
1461 video_extension = 'flv'
1463 # Rewrite valid but non-extractable URLs as
1464 # extractable English language /watch/ URLs
1465 if re.match(self._VPAGE_URL, url) is None:
1466 request = urllib2.Request(url)
1468 webpage = urllib2.urlopen(request).read()
1469 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1470 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1473 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1475 self._downloader.trouble(u'ERROR: Unable to extract id field')
1477 yahoo_id = mobj.group(1)
1479 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1481 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1483 yahoo_vid = mobj.group(1)
1485 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1486 return self._real_extract(url, new_video=False)
1488 # Retrieve video webpage to extract further information
1489 request = urllib2.Request(url)
1491 self.report_download_webpage(video_id)
1492 webpage = urllib2.urlopen(request).read()
1493 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1494 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1497 # Extract uploader and title from webpage
1498 self.report_extraction(video_id)
1499 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1501 self._downloader.trouble(u'ERROR: unable to extract video title')
1503 video_title = mobj.group(1).decode('utf-8')
# Collapse any character outside the "simple title" alphabet into '_'.
1504 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1506 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1508 self._downloader.trouble(u'ERROR: unable to extract video uploader')
# NOTE(review): group(1) of the regex above captures the literal
# "people"/"profile" path segment; the uploader's display name is group(2).
# This looks like a bug — confirm against the live page before changing.
1510 video_uploader = mobj.group(1).decode('utf-8')
1512 # Extract video thumbnail
1513 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1515 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1517 video_thumbnail = mobj.group(1).decode('utf-8')
1519 # Extract video description
1520 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1522 self._downloader.trouble(u'ERROR: unable to extract video description')
1524 video_description = mobj.group(1).decode('utf-8')
1525 if not video_description: video_description = 'No description available.'
1527 # Extract video height and width
1528 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1530 self._downloader.trouble(u'ERROR: unable to extract video height')
1532 yv_video_height = mobj.group(1)
1534 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1536 self._downloader.trouble(u'ERROR: unable to extract video width')
1538 yv_video_width = mobj.group(1)
1540 # Retrieve video playlist to extract media URL
1541 # I'm not completely sure what all these options are, but we
1542 # seem to need most of them, otherwise the server sends a 401.
1543 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1544 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1545 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1546 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1547 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1549 self.report_download_webpage(video_id)
1550 webpage = urllib2.urlopen(request).read()
1551 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1552 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1555 # Extract media URL from playlist XML
1556 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1558 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1560 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1561 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1564 # Process video information
1565 self._downloader.process_info({
1566 'id': video_id.decode('utf-8'),
1568 'uploader': video_uploader,
1569 'upload_date': u'NA',
1570 'title': video_title,
1571 'stitle': simple_title,
1572 'ext': video_extension.decode('utf-8'),
# NOTE(review): 'thumbnail' and 'description' are each given twice in this
# dict literal; in Python the later (non-decoded) entries at 1575/1576 win.
# The duplicate pairs should be collapsed to one each.
1573 'thumbnail': video_thumbnail.decode('utf-8'),
1574 'description': video_description,
1575 'thumbnail': video_thumbnail,
1576 'description': video_description,
1579 except UnavailableVideoError:
1580 self._downloader.trouble(u'ERROR: unable to download video')
# Last-resort extractor: downloads an arbitrary page and heuristically looks
# for a JW-Player-style flashvars `file=`/`source=` HTTP URL, using the page
# <title> as the video title and the URL's host as the "uploader".
# NOTE(review): leading numbers are original line numbers from this listing;
# gaps mean elided lines (`try:`, `if mobj is None:`, `return`), so the
# excerpt is not runnable as-is.
1583 class GenericIE(InfoExtractor):
1584 """Generic last-resort information extractor."""
1586 def __init__(self, downloader=None):
1587 InfoExtractor.__init__(self, downloader)
1593 def report_download_webpage(self, video_id):
1594 """Report webpage download."""
# Warn first: generic extraction is a best-effort fallback.
1595 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1596 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1598 def report_extraction(self, video_id):
1599 """Report information extraction."""
1600 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
# No site-wide setup needed (body elided; presumably just `return` — verify).
1602 def _real_initialize(self):
1605 def _real_extract(self, url):
1606 # At this point we have a new video
1607 self._downloader.increment_downloads()
# Provisional id: last path component (replaced after the media URL is found).
1609 video_id = url.split('/')[-1]
1610 request = urllib2.Request(url)
1612 self.report_download_webpage(video_id)
1613 webpage = urllib2.urlopen(request).read()
1614 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1615 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1617 except ValueError, err:
1618 # since this is the last-resort InfoExtractor, if
1619 # this error is thrown, it'll be thrown here
1620 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1623 self.report_extraction(video_id)
1624 # Start with something easy: JW Player in SWFObject
1625 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1627 # Broaden the search a little bit
1628 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1630 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1633 # It's possible that one of the regexes
1634 # matched, but returned an empty group:
1635 if mobj.group(1) is None:
1636 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1639 video_url = urllib.unquote(mobj.group(1))
1640 video_id = os.path.basename(video_url)
1642 # here's a fun little line of code for you:
1643 video_extension = os.path.splitext(video_id)[1][1:]
1644 video_id = os.path.splitext(video_id)[0]
1646 # it's tempting to parse this further, but you would
1647 # have to take into account all the variations like
1648 # Video Title - Site Name
1649 # Site Name | Video Title
1650 # Video Title - Tagline | Site Name
1651 # and so on and so forth; it's just not practical
1652 mobj = re.search(r'<title>(.*)</title>', webpage)
1654 self._downloader.trouble(u'ERROR: unable to extract title')
1656 video_title = mobj.group(1).decode('utf-8')
1657 video_title = sanitize_title(video_title)
# Collapse any character outside the "simple title" alphabet into '_'.
1658 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1660 # video uploader is domain name
1661 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
# NOTE(review): this error message says "title" but the step extracts the
# uploader/domain — the string should read "unable to extract uploader"
# (runtime string, left untouched in this doc pass).
1663 self._downloader.trouble(u'ERROR: unable to extract title')
1665 video_uploader = mobj.group(1).decode('utf-8')
1668 # Process video information
1669 self._downloader.process_info({
1670 'id': video_id.decode('utf-8'),
1671 'url': video_url.decode('utf-8'),
1672 'uploader': video_uploader,
1673 'upload_date': u'NA',
1674 'title': video_title,
1675 'stitle': simple_title,
1676 'ext': video_extension.decode('utf-8'),
1680 except UnavailableVideoError, err:
1681 self._downloader.trouble(u'ERROR: unable to download video')
# Search extractor for "ytsearch[N|all]:<query>" pseudo-URLs: pages through
# YouTube result pages, collects up to N unique video ids, then hands each
# watch URL to the wrapped YoutubeIE.
# NOTE(review): leading numbers are original line numbers from this listing;
# gaps mean elided lines (`try:`, loop headers, `return`), so the excerpt is
# not runnable as-is.
1684 class YoutubeSearchIE(InfoExtractor):
1685 """Information Extractor for YouTube search queries."""
1686 _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
1687 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
1688 _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
1689 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
# Hard cap on results a single ytsearchN query may request.
1691 _max_youtube_results = 1000
1693 def __init__(self, youtube_ie, downloader=None):
1694 InfoExtractor.__init__(self, downloader)
1695 self._youtube_ie = youtube_ie
# (suitable() — its `def` line was elided from this listing.)
1699 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
1701 def report_download_page(self, query, pagenum):
1702 """Report attempt to download playlist page with given number."""
1703 query = query.decode(preferredencoding())
1704 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1706 def _real_initialize(self):
1707 self._youtube_ie.initialize()
# Parse the "ytsearch[N|all]:" prefix and dispatch to _download_n_results.
1709 def _real_extract(self, query):
1710 mobj = re.match(self._VALID_QUERY, query)
1712 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1715 prefix, query = query.split(':')
1717 query = query.encode('utf-8')
# Bare "ytsearch:" → first result only.
1719 self._download_n_results(query, 1)
1721 elif prefix == 'all':
1722 self._download_n_results(query, self._max_youtube_results)
# (numeric-prefix parsing elided; n comes from int(prefix) — see 1735.)
1728 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1730 elif n > self._max_youtube_results:
1731 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1732 n = self._max_youtube_results
1733 self._download_n_results(query, n)
1735 except ValueError: # parsing prefix as integer fails
1736 self._download_n_results(query, 1)
1739 def _download_n_results(self, query, n):
1740 """Downloads a specified number of results for a query"""
1743 already_seen = set()
1747 self.report_download_page(query, pagenum)
1748 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1749 request = urllib2.Request(result_url, None, std_headers)
1751 page = urllib2.urlopen(request).read()
1752 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1753 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1756 # Extract video identifiers
1757 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
# NOTE(review): id parsing by split('=')[2][:-1] on the matched href is
# fragile (depends on exactly one '&'-free v= parameter and the trailing
# quote); a capture group in _VIDEO_INDICATOR would be sturdier — compare
# GoogleSearchIE/YahooSearchIE which already use mobj.group(1).
1758 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
1759 if video_id not in already_seen:
1760 video_ids.append(video_id)
1761 already_seen.add(video_id)
1762 if len(video_ids) == n:
1763 # Specified n videos reached
1764 for id in video_ids:
1765 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
# No "Next" link → last page: extract whatever was collected and stop.
1768 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1769 for id in video_ids:
1770 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1773 pagenum = pagenum + 1
# Search extractor for "gvsearch[N|all]:<query>" pseudo-URLs: pages through
# Google Video search results, collects up to N unique docids, then hands
# each videoplay URL to the wrapped GoogleIE.  Structure mirrors
# YoutubeSearchIE.
# NOTE(review): leading numbers are original line numbers from this listing;
# gaps mean elided lines (`try:`, loop headers, `return`), so the excerpt is
# not runnable as-is.
1775 class GoogleSearchIE(InfoExtractor):
1776 """Information Extractor for Google Video search queries."""
1777 _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
1778 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1779 _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
1780 _MORE_PAGES_INDICATOR = r'<span>Next</span>'
# Hard cap on results a single gvsearchN query may request.
1782 _max_google_results = 1000
1784 def __init__(self, google_ie, downloader=None):
1785 InfoExtractor.__init__(self, downloader)
1786 self._google_ie = google_ie
# (suitable() — its `def` line was elided from this listing.)
1790 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
1792 def report_download_page(self, query, pagenum):
1793 """Report attempt to download playlist page with given number."""
1794 query = query.decode(preferredencoding())
1795 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1797 def _real_initialize(self):
1798 self._google_ie.initialize()
# Parse the "gvsearch[N|all]:" prefix and dispatch to _download_n_results.
1800 def _real_extract(self, query):
1801 mobj = re.match(self._VALID_QUERY, query)
1803 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1806 prefix, query = query.split(':')
1808 query = query.encode('utf-8')
# Bare "gvsearch:" → first result only.
1810 self._download_n_results(query, 1)
1812 elif prefix == 'all':
1813 self._download_n_results(query, self._max_google_results)
# (numeric-prefix parsing elided; n comes from int(prefix) — see 1826.)
1819 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1821 elif n > self._max_google_results:
1822 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1823 n = self._max_google_results
1824 self._download_n_results(query, n)
1826 except ValueError: # parsing prefix as integer fails
1827 self._download_n_results(query, 1)
1830 def _download_n_results(self, query, n):
1831 """Downloads a specified number of results for a query"""
1834 already_seen = set()
1838 self.report_download_page(query, pagenum)
1839 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1840 request = urllib2.Request(result_url, None, std_headers)
1842 page = urllib2.urlopen(request).read()
1843 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1844 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1847 # Extract video identifiers
1848 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1849 video_id = mobj.group(1)
1850 if video_id not in already_seen:
1851 video_ids.append(video_id)
1852 already_seen.add(video_id)
1853 if len(video_ids) == n:
1854 # Specified n videos reached
1855 for id in video_ids:
1856 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
# No "Next" link → last page: extract whatever was collected and stop.
1859 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1860 for id in video_ids:
1861 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1864 pagenum = pagenum + 1
# Search extractor for "yvsearch[N|all]:<query>" pseudo-URLs: pages through
# Yahoo! Video search results, collects up to N unique ids, then hands each
# watch URL to the wrapped YahooIE.  Structure mirrors YoutubeSearchIE /
# GoogleSearchIE.
# NOTE(review): leading numbers are original line numbers from this listing;
# gaps mean elided lines (`try:`, loop headers, `return`), so the excerpt is
# not runnable as-is.
1866 class YahooSearchIE(InfoExtractor):
1867 """Information Extractor for Yahoo! Video search queries."""
1868 _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
1869 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1870 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1871 _MORE_PAGES_INDICATOR = r'\s*Next'
# Hard cap on results a single yvsearchN query may request.
1873 _max_yahoo_results = 1000
1875 def __init__(self, yahoo_ie, downloader=None):
1876 InfoExtractor.__init__(self, downloader)
1877 self._yahoo_ie = yahoo_ie
# (suitable() — its `def` line was elided from this listing.)
1881 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
1883 def report_download_page(self, query, pagenum):
1884 """Report attempt to download playlist page with given number."""
1885 query = query.decode(preferredencoding())
1886 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1888 def _real_initialize(self):
1889 self._yahoo_ie.initialize()
# Parse the "yvsearch[N|all]:" prefix and dispatch to _download_n_results.
1891 def _real_extract(self, query):
1892 mobj = re.match(self._VALID_QUERY, query)
1894 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1897 prefix, query = query.split(':')
1899 query = query.encode('utf-8')
# Bare "yvsearch:" → first result only.
1901 self._download_n_results(query, 1)
1903 elif prefix == 'all':
1904 self._download_n_results(query, self._max_yahoo_results)
# (numeric-prefix parsing elided; n comes from int(prefix) — see 1917.)
1910 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1912 elif n > self._max_yahoo_results:
1913 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1914 n = self._max_yahoo_results
1915 self._download_n_results(query, n)
1917 except ValueError: # parsing prefix as integer fails
1918 self._download_n_results(query, 1)
1921 def _download_n_results(self, query, n):
1922 """Downloads a specified number of results for a query"""
1925 already_seen = set()
1929 self.report_download_page(query, pagenum)
1930 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1931 request = urllib2.Request(result_url, None, std_headers)
1933 page = urllib2.urlopen(request).read()
1934 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1935 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1938 # Extract video identifiers
1939 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
# Captured id has the "vid/id" two-segment form expected by YahooIE.
1940 video_id = mobj.group(1)
1941 if video_id not in already_seen:
1942 video_ids.append(video_id)
1943 already_seen.add(video_id)
1944 if len(video_ids) == n:
1945 # Specified n videos reached
1946 for id in video_ids:
1947 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
# No "Next" link → last page: extract whatever was collected and stop.
1950 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1951 for id in video_ids:
1952 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1955 pagenum = pagenum + 1
# Extractor for YouTube playlist / my_playlists / legacy user URLs: pages
# through view_play_list pages collecting video ids, applies the user's
# playliststart/playlistend window, then delegates each id to YoutubeIE.
# NOTE(review): leading numbers are original line numbers from this listing;
# gaps mean elided lines (`try:`, loop headers, `return`), so the excerpt is
# not runnable as-is.
1957 class YoutubePlaylistIE(InfoExtractor):
1958 """Information Extractor for YouTube playlists."""
1960 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists)\?.*?p=|user/.*?/user/)([^&]+).*'
1961 _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
1962 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
1963 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1966 def __init__(self, youtube_ie, downloader=None):
1967 InfoExtractor.__init__(self, downloader)
1968 self._youtube_ie = youtube_ie
# (suitable() — its `def` line was elided from this listing.)
1972 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
1974 def report_download_page(self, playlist_id, pagenum):
1975 """Report attempt to download playlist page with given number."""
1976 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1978 def _real_initialize(self):
1979 self._youtube_ie.initialize()
1981 def _real_extract(self, url):
1982 # Extract playlist id
1983 mobj = re.match(self._VALID_URL, url)
1985 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1988 # Download playlist pages
1989 playlist_id = mobj.group(1)
1994 self.report_download_page(playlist_id, pagenum)
1995 request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
1997 page = urllib2.urlopen(request).read()
1998 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1999 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2002 # Extract video identifiers
# ids_in_page de-duplicates within one page before extending the global list.
2004 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2005 if mobj.group(1) not in ids_in_page:
2006 ids_in_page.append(mobj.group(1))
2007 video_ids.extend(ids_in_page)
2009 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2011 pagenum = pagenum + 1
# Apply the user-requested playlist window (1-based start → 0-based index).
2013 playliststart = self._downloader.params.get('playliststart', 1) - 1
2014 playlistend = self._downloader.params.get('playlistend', -1)
# NOTE(review): with the default playlistend of -1 this slice is
# video_ids[start:-1], which silently drops the playlist's last video;
# an explicit "-1 means no end" check (slice end of None) looks intended —
# confirm the default before changing.
2015 video_ids = video_ids[playliststart:playlistend]
2017 for id in video_ids:
2018 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2021 class YoutubeUserIE(InfoExtractor):
2022 """Information Extractor for YouTube users."""
2024 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)'
2025 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2026 _VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this.
2029 def __init__(self, youtube_ie, downloader=None):
2030 InfoExtractor.__init__(self, downloader)
2031 self._youtube_ie = youtube_ie
2035 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
2037 def report_download_page(self, username):
2038 """Report attempt to download user page."""
2039 self._downloader.to_screen(u'[youtube] user %s: Downloading page ' % (username))
2041 def _real_initialize(self):
2042 self._youtube_ie.initialize()
2044 def _real_extract(self, url):
2046 mobj = re.match(self._VALID_URL, url)
2048 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2051 # Download user page
2052 username = mobj.group(1)
2056 self.report_download_page(username)
2057 request = urllib2.Request(self._TEMPLATE_URL % (username), None, std_headers)
2059 page = urllib2.urlopen(request).read()
2060 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2061 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2064 # Extract video identifiers
2067 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2068 if mobj.group(1) not in ids_in_page:
2069 ids_in_page.append(mobj.group(1))
2070 video_ids.extend(ids_in_page)
2072 playliststart = self._downloader.params.get('playliststart', 1) - 1
2073 playlistend = self._downloader.params.get('playlistend', -1)
2074 video_ids = video_ids[playliststart:playlistend]
2076 for id in video_ids:
2077 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2080 class DepositFilesIE(InfoExtractor):
2081 """Information extractor for depositfiles.com"""
2083 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
2085 def __init__(self, downloader=None):
2086 InfoExtractor.__init__(self, downloader)
2090 return (re.match(DepositFilesIE._VALID_URL, url) is not None)
2092 def report_download_webpage(self, file_id):
2093 """Report webpage download."""
2094 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2096 def report_extraction(self, file_id):
2097 """Report information extraction."""
2098 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2100 def _real_initialize(self):
2103 def _real_extract(self, url):
2104 # At this point we have a new file
2105 self._downloader.increment_downloads()
2107 file_id = url.split('/')[-1]
2108 # Rebuild url in english locale
2109 url = 'http://depositfiles.com/en/files/' + file_id
2111 # Retrieve file webpage with 'Free download' button pressed
2112 free_download_indication = { 'gateway_result' : '1' }
2113 request = urllib2.Request(url, urllib.urlencode(free_download_indication), std_headers)
2115 self.report_download_webpage(file_id)
2116 webpage = urllib2.urlopen(request).read()
2117 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2118 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2121 # Search for the real file URL
2122 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2123 if (mobj is None) or (mobj.group(1) is None):
2124 # Try to figure out reason of the error.
2125 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2126 if (mobj is not None) and (mobj.group(1) is not None):
2127 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2128 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2130 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2133 file_url = mobj.group(1)
2134 file_extension = os.path.splitext(file_url)[1][1:]
2136 # Search for file title
2137 mobj = re.search(r'<b title="(.*?)">', webpage)
2139 self._downloader.trouble(u'ERROR: unable to extract title')
2141 file_title = mobj.group(1).decode('utf-8')
2144 # Process file information
2145 self._downloader.process_info({
2146 'id': file_id.decode('utf-8'),
2147 'url': file_url.decode('utf-8'),
2149 'upload_date': u'NA',
2150 'title': file_title,
2151 'stitle': file_title,
2152 'ext': file_extension.decode('utf-8'),
2156 except UnavailableVideoError, err:
2157 self._downloader.trouble(u'ERROR: unable to download file')
class PostProcessor(object):
	"""Base class for post-processing steps.

	Instances are registered on a downloader via its add_post_processor()
	method. After each successful download the downloader walks its chain
	of PostProcessors, invoking run() on each: the first call receives the
	initial information dictionary, and every later call receives whatever
	the previous PostProcessor returned.

	Returning None from run() stops the chain early; otherwise it proceeds
	until the last registered PostProcessor has run.

	Like InfoExtractor, this class uses a "mutual registration" scheme:
	the downloader and the PostProcessor keep references to each other.
	"""

	_downloader = None

	def __init__(self, downloader=None):
		# Route through set_downloader() so construction and later
		# re-registration share a single code path.
		self.set_downloader(downloader)

	def set_downloader(self, downloader):
		"""Sets the downloader for this PP."""
		self._downloader = downloader

	def run(self, information):
		"""Run the PostProcessor.

		"information" is a dictionary of the kind produced by
		InfoExtractors, extended with a "filepath" key naming the
		downloaded file on disk.

		Returning None halts the post-processing chain; returning a
		dictionary (possibly this one, possibly with fields changed)
		passes it on to the next PostProcessor.

		Implementations may also raise PostProcessingError, which the
		downloader handles.
		"""
		return information  # default behaviour: pass through unchanged
2205 ### MAIN PROGRAM ###
2206 if __name__ == '__main__':
2208 # Modules needed only when running the main program
2212 # Function to update the program file with the latest version from bitbucket.org
2213 def update_self(downloader, filename):
2214 # Note: downloader only used for options
2215 if not os.access (filename, os.W_OK):
2216 sys.exit('ERROR: no write permissions on %s' % filename)
2218 downloader.to_screen('Updating to latest stable version...')
2219 latest_url = 'http://github.com/rg3/youtube-dl/raw/master/LATEST_VERSION'
2220 latest_version = urllib.urlopen(latest_url).read().strip()
2221 prog_url = 'http://github.com/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
2222 newcontent = urllib.urlopen(prog_url).read()
2223 stream = open(filename, 'w')
2224 stream.write(newcontent)
2226 downloader.to_screen('Updated to version %s' % latest_version)
2228 # Parse command line
2229 parser = optparse.OptionParser(
2230 usage='Usage: %prog [options] url...',
2231 version='2010.11.19',
2232 conflict_handler='resolve',
2235 parser.add_option('-h', '--help',
2236 action='help', help='print this help text and exit')
2237 parser.add_option('-v', '--version',
2238 action='version', help='print program version and exit')
2239 parser.add_option('-U', '--update',
2240 action='store_true', dest='update_self', help='update this program to latest stable version')
2241 parser.add_option('-i', '--ignore-errors',
2242 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
2243 parser.add_option('-r', '--rate-limit',
2244 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
2245 parser.add_option('-R', '--retries',
2246 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
2247 parser.add_option('--playlist-start',
2248 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
2249 parser.add_option('--playlist-end',
2250 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
2252 authentication = optparse.OptionGroup(parser, 'Authentication Options')
2253 authentication.add_option('-u', '--username',
2254 dest='username', metavar='USERNAME', help='account username')
2255 authentication.add_option('-p', '--password',
2256 dest='password', metavar='PASSWORD', help='account password')
2257 authentication.add_option('-n', '--netrc',
2258 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
2259 parser.add_option_group(authentication)
2261 video_format = optparse.OptionGroup(parser, 'Video Format Options')
2262 video_format.add_option('-f', '--format',
2263 action='store', dest='format', metavar='FORMAT', help='video format code')
2264 video_format.add_option('-m', '--mobile-version',
2265 action='store_const', dest='format', help='alias for -f 17', const='17')
2266 video_format.add_option('--all-formats',
2267 action='store_const', dest='format', help='download all available video formats', const='-1')
2268 video_format.add_option('--max-quality',
2269 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
2270 video_format.add_option('-b', '--best-quality',
2271 action='store_true', dest='bestquality', help='download the best video quality (DEPRECATED)')
2272 parser.add_option_group(video_format)
2274 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
2275 verbosity.add_option('-q', '--quiet',
2276 action='store_true', dest='quiet', help='activates quiet mode', default=False)
2277 verbosity.add_option('-s', '--simulate',
2278 action='store_true', dest='simulate', help='do not download video', default=False)
2279 verbosity.add_option('-g', '--get-url',
2280 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
2281 verbosity.add_option('-e', '--get-title',
2282 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
2283 verbosity.add_option('--get-thumbnail',
2284 action='store_true', dest='getthumbnail', help='simulate, quiet but print thumbnail URL', default=False)
2285 verbosity.add_option('--get-description',
2286 action='store_true', dest='getdescription', help='simulate, quiet but print video description', default=False)
2287 verbosity.add_option('--no-progress',
2288 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
2289 parser.add_option_group(verbosity)
2291 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
2292 filesystem.add_option('-t', '--title',
2293 action='store_true', dest='usetitle', help='use title in file name', default=False)
2294 filesystem.add_option('-l', '--literal',
2295 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
2296 filesystem.add_option('-A', '--auto-number',
2297 action='store_true', dest='autonumber', help='number downloaded files starting from 00000', default=False)
2298 filesystem.add_option('-o', '--output',
2299 dest='outtmpl', metavar='TEMPLATE', help='output filename template')
2300 filesystem.add_option('-a', '--batch-file',
2301 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
2302 filesystem.add_option('-w', '--no-overwrites',
2303 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
2304 filesystem.add_option('-c', '--continue',
2305 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
2306 filesystem.add_option('--cookies',
2307 dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
2308 parser.add_option_group(filesystem)
2310 (opts, args) = parser.parse_args()
2312 # Open appropriate CookieJar
2313 if opts.cookiefile is None:
2314 jar = cookielib.CookieJar()
2317 jar = cookielib.MozillaCookieJar(opts.cookiefile)
2318 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
2320 except (IOError, OSError), err:
2321 sys.exit(u'ERROR: unable to open cookie file')
2323 # General configuration
2324 cookie_processor = urllib2.HTTPCookieProcessor(jar)
2325 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
2326 urllib2.install_opener(urllib2.build_opener(cookie_processor))
2327 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
2329 # Batch file verification
2331 if opts.batchfile is not None:
2333 if opts.batchfile == '-':
2336 batchfd = open(opts.batchfile, 'r')
2337 batchurls = batchfd.readlines()
2338 batchurls = [x.strip() for x in batchurls]
2339 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
2341 sys.exit(u'ERROR: batch file could not be read')
2342 all_urls = batchurls + args
2344 # Conflicting, missing and erroneous options
2345 if opts.bestquality:
2346 print >>sys.stderr, u'\nWARNING: -b/--best-quality IS DEPRECATED AS IT IS THE DEFAULT BEHAVIOR NOW\n'
2347 if opts.usenetrc and (opts.username is not None or opts.password is not None):
2348 parser.error(u'using .netrc conflicts with giving username/password')
2349 if opts.password is not None and opts.username is None:
2350 parser.error(u'account username missing')
2351 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
2352 parser.error(u'using output template conflicts with using title, literal title or auto number')
2353 if opts.usetitle and opts.useliteral:
2354 parser.error(u'using title conflicts with using literal title')
2355 if opts.username is not None and opts.password is None:
2356 opts.password = getpass.getpass(u'Type account password and press return:')
2357 if opts.ratelimit is not None:
2358 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
2359 if numeric_limit is None:
2360 parser.error(u'invalid rate limit specified')
2361 opts.ratelimit = numeric_limit
2362 if opts.retries is not None:
2364 opts.retries = long(opts.retries)
2365 except (TypeError, ValueError), err:
2366 parser.error(u'invalid retry count specified')
2368 opts.playliststart = long(opts.playliststart)
2369 if opts.playliststart <= 0:
2371 except (TypeError, ValueError), err:
2372 parser.error(u'invalid playlist start number specified')
2374 opts.playlistend = long(opts.playlistend)
2375 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
2377 except (TypeError, ValueError), err:
2378 parser.error(u'invalid playlist end number specified')
2380 # Information extractors
2381 youtube_ie = YoutubeIE()
2382 metacafe_ie = MetacafeIE(youtube_ie)
2383 dailymotion_ie = DailymotionIE()
2384 youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
2385 youtube_user_ie = YoutubeUserIE(youtube_ie)
2386 youtube_search_ie = YoutubeSearchIE(youtube_ie)
2387 google_ie = GoogleIE()
2388 google_search_ie = GoogleSearchIE(google_ie)
2389 photobucket_ie = PhotobucketIE()
2390 yahoo_ie = YahooIE()
2391 yahoo_search_ie = YahooSearchIE(yahoo_ie)
2392 deposit_files_ie = DepositFilesIE()
2393 generic_ie = GenericIE()
2396 fd = FileDownloader({
2397 'usenetrc': opts.usenetrc,
2398 'username': opts.username,
2399 'password': opts.password,
2400 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
2401 'forceurl': opts.geturl,
2402 'forcetitle': opts.gettitle,
2403 'forcethumbnail': opts.getthumbnail,
2404 'forcedescription': opts.getdescription,
2405 'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
2406 'format': opts.format,
2407 'format_limit': opts.format_limit,
2408 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
2409 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
2410 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
2411 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
2412 or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
2413 or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
2414 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
2415 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
2416 or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
2417 or u'%(id)s.%(ext)s'),
2418 'ignoreerrors': opts.ignoreerrors,
2419 'ratelimit': opts.ratelimit,
2420 'nooverwrites': opts.nooverwrites,
2421 'retries': opts.retries,
2422 'continuedl': opts.continue_dl,
2423 'noprogress': opts.noprogress,
2424 'playliststart': opts.playliststart,
2425 'playlistend': opts.playlistend,
2426 'logtostderr': opts.outtmpl == '-',
2428 fd.add_info_extractor(youtube_search_ie)
2429 fd.add_info_extractor(youtube_pl_ie)
2430 fd.add_info_extractor(youtube_user_ie)
2431 fd.add_info_extractor(metacafe_ie)
2432 fd.add_info_extractor(dailymotion_ie)
2433 fd.add_info_extractor(youtube_ie)
2434 fd.add_info_extractor(google_ie)
2435 fd.add_info_extractor(google_search_ie)
2436 fd.add_info_extractor(photobucket_ie)
2437 fd.add_info_extractor(yahoo_ie)
2438 fd.add_info_extractor(yahoo_search_ie)
2439 fd.add_info_extractor(deposit_files_ie)
2441 # This must come last since it's the
2442 # fallback if none of the others work
2443 fd.add_info_extractor(generic_ie)
2446 if opts.update_self:
2447 update_self(fd, sys.argv[0])
2450 if len(all_urls) < 1:
2451 if not opts.update_self:
2452 parser.error(u'you must provide at least one URL')
2455 retcode = fd.download(all_urls)
2457 # Dump cookie jar if requested
2458 if opts.cookiefile is not None:
2461 except (IOError, OSError), err:
2462 sys.exit(u'ERROR: unable to save cookie jar')
2466 except DownloadError:
2468 except SameFileError:
2469 sys.exit(u'ERROR: fixed output name but more than one file to download')
2470 except KeyboardInterrupt:
2471 sys.exit(u'\nERROR: Interrupted by user')