2 # -*- coding: utf-8 -*-
5 'Ricardo Garcia Gonzalez',
13 'Philipp Hagemeister',
17 __license__ = 'Public Domain'
18 __version__ = '2011.09.14'
20 UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
49 except ImportError: # Python 2.4
52 import cStringIO as StringIO
56 # parse_qs was moved from the cgi module to the urlparse module recently.
58 from urlparse import parse_qs
60 from cgi import parse_qs
68 import xml.etree.ElementTree
69 except ImportError: # Python<2.5
70 pass # Not officially supported, but let it slip
73 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
74 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
75 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
76 'Accept-Encoding': 'gzip, deflate',
77 'Accept-Language': 'en-us,en;q=0.5',
80 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
84 except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
90 def raiseError(msg, i):
91 raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
92 def skipSpace(i, expectMore=True):
93 while i < len(s) and s[i] in ' \t\r\n':
97 raiseError('Premature end', i)
99 def decodeEscape(match):
115 return unichr(int(esc[1:5], 16))
116 if len(esc) == 5+6 and esc[5:7] == '\\u':
117 hi = int(esc[1:5], 16)
118 low = int(esc[7:11], 16)
119 return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
120 raise ValueError('Unknown escape ' + str(esc))
127 while s[e-bslashes-1] == '\\':
129 if bslashes % 2 == 1:
133 rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
134 stri = rexp.sub(decodeEscape, s[i:e])
140 if s[i] == '}': # Empty dictionary
144 raiseError('Expected a string object key', i)
145 i,key = parseString(i)
147 if i >= len(s) or s[i] != ':':
148 raiseError('Expected a colon', i)
155 raiseError('Expected comma or closing curly brace', i)
160 if s[i] == ']': # Empty array
165 i = skipSpace(i) # Raise exception if premature end
169 raiseError('Expected a comma or closing bracket', i)
171 def parseDiscrete(i):
172 for k,v in {'true': True, 'false': False, 'null': None}.items():
173 if s.startswith(k, i):
175 raiseError('Not a boolean (or null)', i)
177 mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
179 raiseError('Not a number', i)
181 if '.' in nums or 'e' in nums or 'E' in nums:
182 return (i+len(nums), float(nums))
183 return (i+len(nums), int(nums))
184 CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
187 i,res = CHARMAP.get(s[i], parseNumber)(i)
188 i = skipSpace(i, False)
192 raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
195 def preferredencoding():
196 """Get preferred encoding.
198 Returns the best encoding scheme for the system, based on
199 locale.getpreferredencoding() and some further tweaks.
201 def yield_preferredencoding():
203 pref = locale.getpreferredencoding()
209 return yield_preferredencoding().next()
212 def htmlentity_transform(matchobj):
213 """Transforms an HTML entity to a Unicode character.
215 This function receives a match object and is intended to be used with
216 the re.sub() function.
218 entity = matchobj.group(1)
220 # Known non-numeric HTML entity
221 if entity in htmlentitydefs.name2codepoint:
222 return unichr(htmlentitydefs.name2codepoint[entity])
225 mobj = re.match(ur'(?u)#(x?\d+)', entity)
227 numstr = mobj.group(1)
228 if numstr.startswith(u'x'):
230 numstr = u'0%s' % numstr
233 return unichr(long(numstr, base))
235 # Unknown entity in name, return its literal representation
236 return (u'&%s;' % entity)
239 def sanitize_title(utitle):
240 """Sanitizes a video title so it could be used as part of a filename."""
241 utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
242 return utitle.replace(unicode(os.sep), u'%')
245 def sanitize_open(filename, open_mode):
246 """Try to open the given filename, and slightly tweak it if this fails.
248 Attempts to open the given filename. If this fails, it tries to change
249 the filename slightly, step by step, until it's either able to open it
250 or it fails and raises a final exception, like the standard open()
253 It returns the tuple (stream, definitive_file_name).
257 if sys.platform == 'win32':
259 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
260 return (sys.stdout, filename)
261 stream = open(filename, open_mode)
262 return (stream, filename)
263 except (IOError, OSError), err:
264 # In case of error, try to remove win32 forbidden chars
265 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
267 # An exception here should be caught in the caller
268 stream = open(filename, open_mode)
269 return (stream, filename)
272 def timeconvert(timestr):
273 """Convert RFC 2822 defined time string into system timestamp"""
275 timetuple = email.utils.parsedate_tz(timestr)
276 if timetuple is not None:
277 timestamp = email.utils.mktime_tz(timetuple)
281 class DownloadError(Exception):
282 """Download Error exception.
284 This exception may be thrown by FileDownloader objects if they are not
285 configured to continue on errors. They will contain the appropriate
291 class SameFileError(Exception):
292 """Same File exception.
294 This exception will be thrown by FileDownloader objects if they detect
295 multiple files would have to be downloaded to the same file on disk.
300 class PostProcessingError(Exception):
301 """Post Processing exception.
303 This exception may be raised by PostProcessor's .run() method to
304 indicate an error in the postprocessing task.
309 class UnavailableVideoError(Exception):
310 """Unavailable Format exception.
312 This exception will be thrown when a video is requested
313 in a format that is not available for that video.
318 class ContentTooShortError(Exception):
319 """Content Too Short exception.
321 This exception may be raised by FileDownloader objects when a file they
322 download is too small for what the server announced first, indicating
323 the connection was probably interrupted.
329 def __init__(self, downloaded, expected):
330 self.downloaded = downloaded
331 self.expected = expected
334 class YoutubeDLHandler(urllib2.HTTPHandler):
335 """Handler for HTTP requests and responses.
337 This class, when installed with an OpenerDirector, automatically adds
338 the standard headers to every HTTP request and handles gzipped and
339 deflated responses from web servers. If compression is to be avoided in
340 a particular request, the original request in the program code only has
341 to include the HTTP header "Youtubedl-No-Compression", which will be
342 removed before making the real request.
344 Part of this code was copied from:
346 http://techknack.net/python-urllib2-handlers/
348 Andrew Rowls, the author of that code, agreed to release it to the
355 return zlib.decompress(data, -zlib.MAX_WBITS)
357 return zlib.decompress(data)
360 def addinfourl_wrapper(stream, headers, url, code):
361 if hasattr(urllib2.addinfourl, 'getcode'):
362 return urllib2.addinfourl(stream, headers, url, code)
363 ret = urllib2.addinfourl(stream, headers, url)
367 def http_request(self, req):
368 for h in std_headers:
371 req.add_header(h, std_headers[h])
372 if 'Youtubedl-no-compression' in req.headers:
373 if 'Accept-encoding' in req.headers:
374 del req.headers['Accept-encoding']
375 del req.headers['Youtubedl-no-compression']
378 def http_response(self, req, resp):
381 if resp.headers.get('Content-encoding', '') == 'gzip':
382 gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
383 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
384 resp.msg = old_resp.msg
386 if resp.headers.get('Content-encoding', '') == 'deflate':
387 gz = StringIO.StringIO(self.deflate(resp.read()))
388 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
389 resp.msg = old_resp.msg
393 class FileDownloader(object):
394 """File Downloader class.
396 File downloader objects are the ones responsible of downloading the
397 actual video file and writing it to disk if the user has requested
398 it, among some other tasks. In most cases there should be one per
399 program. As, given a video URL, the downloader doesn't know how to
400 extract all the needed information, task that InfoExtractors do, it
401 has to pass the URL to one of them.
403 For this, file downloader objects have a method that allows
404 InfoExtractors to be registered in a given order. When it is passed
405 a URL, the file downloader handles it to the first InfoExtractor it
406 finds that reports being able to handle it. The InfoExtractor extracts
407 all the information about the video or videos the URL refers to, and
408 asks the FileDownloader to process the video information, possibly
409 downloading the video.
411 File downloaders accept a lot of parameters. In order not to saturate
412 the object constructor with arguments, it receives a dictionary of
413 options instead. These options are available through the params
414 attribute for the InfoExtractors to use. The FileDownloader also
415 registers itself as the downloader in charge for the InfoExtractors
416 that are added to it, so this is a "mutual registration".
420 username: Username for authentication purposes.
421 password: Password for authentication purposes.
422 usenetrc: Use netrc for authentication instead.
423 quiet: Do not print messages to stdout.
424 forceurl: Force printing final URL.
425 forcetitle: Force printing title.
426 forcethumbnail: Force printing thumbnail URL.
427 forcedescription: Force printing description.
428 forcefilename: Force printing final filename.
429 simulate: Do not download the video files.
430 format: Video format code.
431 format_limit: Highest quality format to try.
432 outtmpl: Template for output names.
433 ignoreerrors: Do not stop on download errors.
434 ratelimit: Download speed limit, in bytes/sec.
435 nooverwrites: Prevent overwriting files.
436 retries: Number of times to retry for HTTP error 5xx
437 continuedl: Try to continue downloads if possible.
438 noprogress: Do not print the progress bar.
439 playliststart: Playlist item to start at.
440 playlistend: Playlist item to end at.
441 logtostderr: Log messages to stderr instead of stdout.
442 consoletitle: Display progress in console window's titlebar.
443 nopart: Do not use temporary .part files.
444 updatetime: Use the Last-modified header to set output file timestamps.
445 writedescription: Write the video description to a .description file
446 writeinfojson: Write the video description to a .info.json file
452 _download_retcode = None
453 _num_downloads = None
456 def __init__(self, params):
457 """Create a FileDownloader object with the given options."""
460 self._download_retcode = 0
461 self._num_downloads = 0
462 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
466 def format_bytes(bytes):
469 if type(bytes) is str:
474 exponent = long(math.log(bytes, 1024.0))
475 suffix = 'bkMGTPEZY'[exponent]
476 converted = float(bytes) / float(1024 ** exponent)
477 return '%.2f%s' % (converted, suffix)
480 def calc_percent(byte_counter, data_len):
483 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
486 def calc_eta(start, now, total, current):
490 if current == 0 or dif < 0.001: # One millisecond
492 rate = float(current) / dif
493 eta = long((float(total) - float(current)) / rate)
494 (eta_mins, eta_secs) = divmod(eta, 60)
497 return '%02d:%02d' % (eta_mins, eta_secs)
500 def calc_speed(start, now, bytes):
502 if bytes == 0 or dif < 0.001: # One millisecond
503 return '%10s' % '---b/s'
504 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
507 def best_block_size(elapsed_time, bytes):
508 new_min = max(bytes / 2.0, 1.0)
509 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
510 if elapsed_time < 0.001:
512 rate = bytes / elapsed_time
520 def parse_bytes(bytestr):
521 """Parse a string indicating a byte quantity into a long integer."""
522 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
525 number = float(matchobj.group(1))
526 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
527 return long(round(number * multiplier))
529 def add_info_extractor(self, ie):
530 """Add an InfoExtractor object to the end of the list."""
532 ie.set_downloader(self)
534 def add_post_processor(self, pp):
535 """Add a PostProcessor object to the end of the chain."""
537 pp.set_downloader(self)
539 def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
540 """Print message to stdout if not in quiet mode."""
542 if not self.params.get('quiet', False):
543 terminator = [u'\n', u''][skip_eol]
544 print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
545 self._screen_file.flush()
546 except (UnicodeEncodeError), err:
547 if not ignore_encoding_errors:
def to_stderr(self, message):
    """Print message to stderr, encoded for the current locale."""
    encoded = message.encode(preferredencoding())
    print >>sys.stderr, encoded
554 def to_cons_title(self, message):
555 """Set console/terminal window title to message."""
556 if not self.params.get('consoletitle', False):
558 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
559 # c_wchar_p() might not be necessary if `message` is
560 # already of type unicode()
561 ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
562 elif 'TERM' in os.environ:
563 sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
565 def fixed_template(self):
566 """Checks if the output template is fixed."""
567 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
569 def trouble(self, message=None):
570 """Determine action to take when a download problem appears.
572 Depending on if the downloader has been configured to ignore
573 download errors or not, this method may throw an exception or
574 not when errors are found, after printing the message.
576 if message is not None:
577 self.to_stderr(message)
578 if not self.params.get('ignoreerrors', False):
579 raise DownloadError(message)
580 self._download_retcode = 1
582 def slow_down(self, start_time, byte_counter):
583 """Sleep if the download speed is over the rate limit."""
584 rate_limit = self.params.get('ratelimit', None)
585 if rate_limit is None or byte_counter == 0:
588 elapsed = now - start_time
591 speed = float(byte_counter) / elapsed
592 if speed > rate_limit:
593 time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
595 def temp_name(self, filename):
596 """Returns a temporary filename for the given filename."""
597 if self.params.get('nopart', False) or filename == u'-' or \
598 (os.path.exists(filename) and not os.path.isfile(filename)):
600 return filename + u'.part'
602 def undo_temp_name(self, filename):
603 if filename.endswith(u'.part'):
604 return filename[:-len(u'.part')]
607 def try_rename(self, old_filename, new_filename):
609 if old_filename == new_filename:
611 os.rename(old_filename, new_filename)
612 except (IOError, OSError), err:
613 self.trouble(u'ERROR: unable to rename file')
615 def try_utime(self, filename, last_modified_hdr):
616 """Try to set the last-modified time of the given file."""
617 if last_modified_hdr is None:
619 if not os.path.isfile(filename):
621 timestr = last_modified_hdr
624 filetime = timeconvert(timestr)
628 os.utime(filename, (time.time(), filetime))
def report_writedescription(self, descfn):
    """Announce that the description file is being written."""
    message = u'[info] Writing video description to: %s' % descfn
    self.to_screen(message, ignore_encoding_errors=True)
def report_writeinfojson(self, infofn):
    """Announce that the JSON metadata file is being written."""
    message = u'[info] Video description metadata as JSON to: %s' % infofn
    self.to_screen(message, ignore_encoding_errors=True)
def report_destination(self, filename):
    """Announce the destination filename of the download."""
    message = u'[download] Destination: %s' % filename
    self.to_screen(message, ignore_encoding_errors=True)
644 def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
645 """Report download progress."""
646 if self.params.get('noprogress', False):
648 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
649 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
650 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
651 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
def report_resuming_byte(self, resume_len):
    """Announce an attempt to resume the download at the given byte."""
    message = u'[download] Resuming download at byte %s' % resume_len
    self.to_screen(message)
def report_retry(self, count, retries):
    """Announce a retry after a server-side HTTP 5xx error."""
    message = u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries)
    self.to_screen(message)
661 def report_file_already_downloaded(self, file_name):
662 """Report file has already been fully downloaded."""
664 self.to_screen(u'[download] %s has already been downloaded' % file_name)
665 except (UnicodeEncodeError), err:
666 self.to_screen(u'[download] The file has already been downloaded')
def report_unable_to_resume(self):
    """Announce that resuming the download was impossible."""
    message = u'[download] Unable to resume'
    self.to_screen(message)
672 def report_finish(self):
673 """Report download finished."""
674 if self.params.get('noprogress', False):
675 self.to_screen(u'[download] Download completed')
def increment_downloads(self):
    """Increment the ordinal that assigns a number to each file."""
    # Feeds the %(autonumber)s field of the output template.
    self._num_downloads = self._num_downloads + 1
683 def prepare_filename(self, info_dict):
684 """Generate the output filename."""
686 template_dict = dict(info_dict)
687 template_dict['epoch'] = unicode(long(time.time()))
688 template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
689 filename = self.params['outtmpl'] % template_dict
691 except (ValueError, KeyError), err:
692 self.trouble(u'ERROR: invalid system charset or erroneous output template')
695 def process_info(self, info_dict):
696 """Process a single dictionary returned by an InfoExtractor."""
697 filename = self.prepare_filename(info_dict)
698 # Do nothing else if in simulate mode
699 if self.params.get('simulate', False):
701 if self.params.get('forcetitle', False):
702 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
703 if self.params.get('forceurl', False):
704 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
705 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
706 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
707 if self.params.get('forcedescription', False) and 'description' in info_dict:
708 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
709 if self.params.get('forcefilename', False) and filename is not None:
710 print filename.encode(preferredencoding(), 'xmlcharrefreplace')
716 if self.params.get('nooverwrites', False) and os.path.exists(filename):
717 self.to_stderr(u'WARNING: file exists and will be skipped')
721 dn = os.path.dirname(filename)
722 if dn != '' and not os.path.exists(dn):
724 except (OSError, IOError), err:
725 self.trouble(u'ERROR: unable to create directory ' + unicode(err))
728 if self.params.get('writedescription', False):
730 descfn = filename + '.description'
731 self.report_writedescription(descfn)
732 descfile = open(descfn, 'wb')
734 descfile.write(info_dict['description'].encode('utf-8'))
737 except (OSError, IOError):
738 self.trouble(u'ERROR: Cannot write description file ' + descfn)
741 if self.params.get('writeinfojson', False):
742 infofn = filename + '.info.json'
743 self.report_writeinfojson(infofn)
746 except (NameError,AttributeError):
747 self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
750 infof = open(infofn, 'wb')
752 json.dump(info_dict, infof)
755 except (OSError, IOError):
756 self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
760 success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
761 except (OSError, IOError), err:
762 raise UnavailableVideoError
763 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
764 self.trouble(u'ERROR: unable to download video data: %s' % str(err))
766 except (ContentTooShortError, ), err:
767 self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
772 self.post_process(filename, info_dict)
773 except (PostProcessingError), err:
774 self.trouble(u'ERROR: postprocessing: %s' % str(err))
777 def download(self, url_list):
778 """Download a given list of URLs."""
779 if len(url_list) > 1 and self.fixed_template():
780 raise SameFileError(self.params['outtmpl'])
783 suitable_found = False
785 # Go to next InfoExtractor if not suitable
786 if not ie.suitable(url):
789 # Suitable InfoExtractor found
790 suitable_found = True
792 # Extract information from URL and process it
795 # Suitable InfoExtractor had been found; go to next URL
798 if not suitable_found:
799 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
801 return self._download_retcode
803 def post_process(self, filename, ie_info):
804 """Run the postprocessing chain on the given file."""
806 info['filepath'] = filename
812 def _download_with_rtmpdump(self, filename, url, player_url):
813 self.report_destination(filename)
814 tmpfilename = self.temp_name(filename)
816 # Check for rtmpdump first
818 subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
819 except (OSError, IOError):
820 self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
823 # Download using rtmpdump. rtmpdump returns exit code 2 when
824 # the connection was interrumpted and resuming appears to be
825 # possible. This is part of rtmpdump's normal usage, AFAIK.
826 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
827 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
828 while retval == 2 or retval == 1:
829 prevsize = os.path.getsize(tmpfilename)
830 self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
831 time.sleep(5.0) # This seems to be needed
832 retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
833 cursize = os.path.getsize(tmpfilename)
834 if prevsize == cursize and retval == 1:
836 # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
837 if prevsize == cursize and retval == 2 and cursize > 1024:
838 self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
842 self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
843 self.try_rename(tmpfilename, filename)
846 self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
849 def _do_download(self, filename, url, player_url):
850 # Check file already present
851 if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
852 self.report_file_already_downloaded(filename)
855 # Attempt to download using rtmpdump
856 if url.startswith('rtmp'):
857 return self._download_with_rtmpdump(filename, url, player_url)
859 tmpfilename = self.temp_name(filename)
863 # Do not include the Accept-Encoding header
864 headers = {'Youtubedl-no-compression': 'True'}
865 basic_request = urllib2.Request(url, None, headers)
866 request = urllib2.Request(url, None, headers)
868 # Establish possible resume length
869 if os.path.isfile(tmpfilename):
870 resume_len = os.path.getsize(tmpfilename)
874 # Request parameters in case of being able to resume
875 if self.params.get('continuedl', False) and resume_len != 0:
876 self.report_resuming_byte(resume_len)
877 request.add_header('Range', 'bytes=%d-' % resume_len)
881 retries = self.params.get('retries', 0)
882 while count <= retries:
883 # Establish connection
885 data = urllib2.urlopen(request)
887 except (urllib2.HTTPError, ), err:
888 if (err.code < 500 or err.code >= 600) and err.code != 416:
889 # Unexpected HTTP error
891 elif err.code == 416:
892 # Unable to resume (requested range not satisfiable)
894 # Open the connection again without the range header
895 data = urllib2.urlopen(basic_request)
896 content_length = data.info()['Content-Length']
897 except (urllib2.HTTPError, ), err:
898 if err.code < 500 or err.code >= 600:
901 # Examine the reported length
902 if (content_length is not None and
903 (resume_len - 100 < long(content_length) < resume_len + 100)):
904 # The file had already been fully downloaded.
905 # Explanation to the above condition: in issue #175 it was revealed that
906 # YouTube sometimes adds or removes a few bytes from the end of the file,
907 # changing the file size slightly and causing problems for some users. So
908 # I decided to implement a suggested change and consider the file
909 # completely downloaded if the file size differs less than 100 bytes from
910 # the one in the hard drive.
911 self.report_file_already_downloaded(filename)
912 self.try_rename(tmpfilename, filename)
915 # The length does not match, we start the download over
916 self.report_unable_to_resume()
922 self.report_retry(count, retries)
925 self.trouble(u'ERROR: giving up after %s retries' % retries)
928 data_len = data.info().get('Content-length', None)
929 if data_len is not None:
930 data_len = long(data_len) + resume_len
931 data_len_str = self.format_bytes(data_len)
932 byte_counter = 0 + resume_len
938 data_block = data.read(block_size)
940 if len(data_block) == 0:
942 byte_counter += len(data_block)
944 # Open file just in time
947 (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
948 assert stream is not None
949 filename = self.undo_temp_name(tmpfilename)
950 self.report_destination(filename)
951 except (OSError, IOError), err:
952 self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
955 stream.write(data_block)
956 except (IOError, OSError), err:
957 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
959 block_size = self.best_block_size(after - before, len(data_block))
962 percent_str = self.calc_percent(byte_counter, data_len)
963 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
964 speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
965 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
968 self.slow_down(start, byte_counter - resume_len)
971 self.trouble(u'\nERROR: Did not get any data blocks')
975 if data_len is not None and byte_counter != data_len:
976 raise ContentTooShortError(byte_counter, long(data_len))
977 self.try_rename(tmpfilename, filename)
979 # Update file modification time
980 if self.params.get('updatetime', True):
981 self.try_utime(filename, data.info().get('last-modified', None))
986 class InfoExtractor(object):
987 """Information Extractor class.
989 Information extractors are the classes that, given a URL, extract
990 information from the video (or videos) the URL refers to. This
991 information includes the real video URL, the video title and simplified
992 title, author and others. The information is stored in a dictionary
993 which is then passed to the FileDownloader. The FileDownloader
994 processes this information possibly downloading the video to the file
995 system, among other possible outcomes. The dictionaries must include
996 the following fields:
998 id: Video identifier.
999 url: Final video URL.
1000 uploader: Nickname of the video uploader.
1001 title: Literal title.
1002 stitle: Simplified title.
1003 ext: Video filename extension.
1004 format: Video format.
1005 player_url: SWF Player URL (may be None).
1007 The following fields are optional. Their primary purpose is to allow
1008 youtube-dl to serve as the backend for a video search function, such
1009 as the one in youtube2mp3. They are only used when their respective
1010 forced printing functions are called:
1012 thumbnail: Full URL to a video thumbnail image.
1013 description: One-line video description.
1015 Subclasses of this one should re-define the _real_initialize() and
1016 _real_extract() methods, as well as the suitable() static method.
1017 Probably, they should also be instantiated and added to the main
1024 def __init__(self, downloader=None):
1025 """Constructor. Receives an optional downloader."""
1027 self.set_downloader(downloader)
1031 """Receives a URL and returns True if suitable for this IE."""
1034 def initialize(self):
1035 """Initializes an instance (authentication, etc)."""
1037 self._real_initialize()
1040 def extract(self, url):
1041 """Extracts URL information and returns it in list of dicts."""
1043 return self._real_extract(url)
def set_downloader(self, downloader):
    """Sets the downloader for this IE."""
    # Remember which FileDownloader drives this extractor; may be None.
    self._downloader = downloader
1049 def _real_initialize(self):
1050 """Real initialization process. Redefine in subclasses."""
1053 def _real_extract(self, url):
1054 """Real extraction process. Redefine in subclasses."""
1058 class YoutubeIE(InfoExtractor):
1059 """Information extractor for youtube.com."""
1061 _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
1062 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
1063 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
1064 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
1065 _NETRC_MACHINE = 'youtube'
1066 # Listed in order of quality
1067 _available_formats = ['38', '37', '45', '22', '43', '35', '34', '18', '6', '5', '17', '13']
1068 _video_extensions = {
1074 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
1081 return (re.match(YoutubeIE._VALID_URL, url) is not None)
def report_lang(self):
    """Announce the attempt to switch the site language."""
    message = u'[youtube] Setting language'
    self._downloader.to_screen(message)
def report_login(self):
    """Announce the attempt to log in."""
    message = u'[youtube] Logging in'
    self._downloader.to_screen(message)
def report_age_confirmation(self):
    """Announce the attempt to confirm the user's age."""
    message = u'[youtube] Confirming age'
    self._downloader.to_screen(message)
def report_video_webpage_download(self, video_id):
    """Announce the attempt to download the video webpage."""
    message = u'[youtube] %s: Downloading video webpage' % video_id
    self._downloader.to_screen(message)
def report_video_info_webpage_download(self, video_id):
    """Announce the attempt to download the video info webpage."""
    message = u'[youtube] %s: Downloading video info webpage' % video_id
    self._downloader.to_screen(message)
def report_information_extraction(self, video_id):
    """Announce the attempt to extract video information."""
    message = u'[youtube] %s: Extracting video information' % video_id
    self._downloader.to_screen(message)
def report_unavailable_format(self, video_id, format):
    """Announce that the requested format is not available for this video."""
    message = u'[youtube] %s: Format %s not available' % (video_id, format)
    self._downloader.to_screen(message)
def report_rtmp_download(self):
    """Announce that the download will use the RTMP protocol."""
    message = u'[youtube] RTMP download detected'
    self._downloader.to_screen(message)
1115 def _real_initialize(self):
# Pre-extraction session setup: force the English-language UI, log in
# (explicit credentials or .netrc), and pass the age gate.  Network failures
# are reported as warnings, except age confirmation which is fatal (trouble).
1116 if self._downloader is None:
1121 downloader_params = self._downloader.params
1123 # Attempt to use provided username and password or .netrc data
1124 if downloader_params.get('username', None) is not None:
1125 username = downloader_params['username']
1126 password = downloader_params['password']
1127 elif downloader_params.get('usenetrc', False):
# Fall back to credentials stored under the youtube machine entry in ~/.netrc.
1129 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1130 if info is not None:
1134 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1135 except (IOError, netrc.NetrcParseError), err:
1136 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
# Set language: the request to _LANG_URL is made only for its cookie/session
# side effect; the response body is discarded.
1140 request = urllib2.Request(self._LANG_URL)
1143 urllib2.urlopen(request).read()
1144 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1145 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
1148 # No authentication to be performed
1149 if username is None:
1154 'current_form': 'loginForm',
1156 'action_login': 'Log In',
1157 'username': username,
1158 'password': password,
1160 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1163 login_results = urllib2.urlopen(request).read()
# If the login form is still present in the response, the login failed.
1164 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
1165 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
1167 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1168 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
# Confirm age by POSTing the confirmation form to _AGE_URL.
1174 'action_confirm': 'Confirm',
1176 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
1178 self.report_age_confirmation()
1179 age_results = urllib2.urlopen(request).read()
1180 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1181 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1184 def _real_extract(self, url):
# Download the watch page and get_video_info, pick the format(s) to fetch,
# then hand one info dict per chosen format to the downloader.
1185 # Extract video id from URL
1186 mobj = re.match(self._VALID_URL, url)
1188 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1190 video_id = mobj.group(2)
1193 self.report_video_webpage_download(video_id)
1194 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
1196 video_webpage = urllib2.urlopen(request).read()
1197 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1198 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1201 # Attempt to extract SWF player URL
# The swfConfig URL is JS-escaped ("http:\/\/..."); strip the backslashes.
1202 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1203 if mobj is not None:
1204 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1209 self.report_video_info_webpage_download(video_id)
# Try several 'el' query variants; stop at the first response with a token.
1210 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1211 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1212 % (video_id, el_type))
1213 request = urllib2.Request(video_info_url)
1215 video_info_webpage = urllib2.urlopen(request).read()
1216 video_info = parse_qs(video_info_webpage)
1217 if 'token' in video_info:
1219 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1220 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
# No token in any variant: surface YouTube's own reason when provided.
1222 if 'token' not in video_info:
1223 if 'reason' in video_info:
1224 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1226 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1229 # Start extracting information
1230 self.report_information_extraction(video_id)
1233 if 'author' not in video_info:
1234 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1236 video_uploader = urllib.unquote_plus(video_info['author'][0])
1239 if 'title' not in video_info:
1240 self._downloader.trouble(u'ERROR: unable to extract video title')
1242 video_title = urllib.unquote_plus(video_info['title'][0])
1243 video_title = video_title.decode('utf-8')
1244 video_title = sanitize_title(video_title)
# Filesystem-safe title: collapse runs of non-simple chars to '_'.
1247 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1248 simple_title = simple_title.strip(ur'_')
1251 if 'thumbnail_url' not in video_info:
1252 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1253 video_thumbnail = ''
1254 else: # don't panic if we can't find it
1255 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
# Upload date: scraped from the watch page and tried against several
# human-readable formats, normalized to YYYYMMDD.
1259 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1260 if mobj is not None:
1261 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1262 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1263 for expression in format_expressions:
1265 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1273 video_description = u'No description available.'
# Description is only scraped when the user asked for it.
1274 if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
1275 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1276 if mobj is not None:
1277 video_description = mobj.group(1).decode('utf-8')
1279 html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1280 vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1281 video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1282 # TODO use another parser
1285 video_token = urllib.unquote_plus(video_info['token'][0])
1287 # Decide which formats to download
1288 req_format = self._downloader.params.get('format', None)
# RTMP streams carry a single opaque 'conn' URL with no itag.
1290 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1291 self.report_rtmp_download()
1292 video_url_list = [(None, video_info['conn'][0])]
1293 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
# stream map is a comma-separated list of querystring-encoded entries.
1294 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1295 url_data = [parse_qs(uds) for uds in url_data_strs]
1296 url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1297 url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
1299 format_limit = self._downloader.params.get('format_limit', None)
1300 if format_limit is not None and format_limit in self._available_formats:
1301 format_list = self._available_formats[self._available_formats.index(format_limit):]
1303 format_list = self._available_formats
1304 existing_formats = [x for x in format_list if x in url_map]
1305 if len(existing_formats) == 0:
1306 self._downloader.trouble(u'ERROR: no known formats available for video')
1308 if req_format is None:
1309 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1310 elif req_format == '-1':
1311 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1314 if req_format not in url_map:
1315 self._downloader.trouble(u'ERROR: requested format not available')
1317 video_url_list = [(req_format, url_map[req_format])] # Specific format
1319 self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
1322 for format_param, video_real_url in video_url_list:
1323 # At this point we have a new video
1324 self._downloader.increment_downloads()
1327 video_extension = self._video_extensions.get(format_param, 'flv')
1330 # Process video information
1331 self._downloader.process_info({
1332 'id': video_id.decode('utf-8'),
1333 'url': video_real_url.decode('utf-8'),
1334 'uploader': video_uploader.decode('utf-8'),
1335 'upload_date': upload_date,
1336 'title': video_title,
1337 'stitle': simple_title,
1338 'ext': video_extension.decode('utf-8'),
# 'a and b or c' idiom: yields u'NA' when format_param is None.
1339 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1340 'thumbnail': video_thumbnail.decode('utf-8'),
1341 'description': video_description,
1342 'player_url': player_url,
1344 except UnavailableVideoError, err:
1345 self._downloader.trouble(u'\nERROR: unable to download video')
1348 class MetacafeIE(InfoExtractor):
1349 """Information Extractor for metacafe.com."""
# _VALID_URL: group(1) = video id, group(2) = simplified title slug.
1351 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1352 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1353 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
# Keeps a YoutubeIE around because Metacafe hosts 'yt-' prefixed YouTube embeds.
1356 def __init__(self, youtube_ie, downloader=None):
1357 InfoExtractor.__init__(self, downloader)
1358 self._youtube_ie = youtube_ie
1362 return (re.match(MetacafeIE._VALID_URL, url) is not None)
1364 def report_disclaimer(self):
1365 """Report disclaimer retrieval."""
1366 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1368 def report_age_confirmation(self):
1369 """Report attempt to confirm age."""
1370 self._downloader.to_screen(u'[metacafe] Confirming age')
1372 def report_download_webpage(self, video_id):
1373 """Report webpage download."""
1374 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1376 def report_extraction(self, video_id):
1377 """Report information extraction."""
1378 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
# Session setup: fetch the disclaimer page, then POST the family-filter form
# so age-restricted videos are reachable.
1380 def _real_initialize(self):
1381 # Retrieve disclaimer
1382 request = urllib2.Request(self._DISCLAIMER)
1384 self.report_disclaimer()
1385 disclaimer = urllib2.urlopen(request).read()
1386 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1387 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1393 'submit': "Continue - I'm over 18",
1395 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1397 self.report_age_confirmation()
1398 disclaimer = urllib2.urlopen(request).read()
1399 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1400 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1403 def _real_extract(self, url):
1404 # Extract id and simplified title from URL
1405 mobj = re.match(self._VALID_URL, url)
1407 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1410 video_id = mobj.group(1)
1412 # Check if video comes from YouTube
# 'yt-' prefixed ids are YouTube embeds; delegate to the YouTube extractor.
1413 mobj2 = re.match(r'^yt-(.*)$', video_id)
1414 if mobj2 is not None:
1415 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1418 # At this point we have a new video
1419 self._downloader.increment_downloads()
1421 simple_title = mobj.group(2).decode('utf-8')
1423 # Retrieve video webpage to extract further information
1424 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1426 self.report_download_webpage(video_id)
1427 webpage = urllib2.urlopen(request).read()
1428 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1429 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1432 # Extract URL, uploader and title from webpage
1433 self.report_extraction(video_id)
# Preferred path: direct mediaURL with an optional gdaKey access token.
1434 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1435 if mobj is not None:
1436 mediaURL = urllib.unquote(mobj.group(1))
1437 video_extension = mediaURL[-3:]
1439 # Extract gdaKey if available
1440 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1442 video_url = mediaURL
1444 gdaKey = mobj.group(1)
1445 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
# Fallback path: parse the flashvars attribute for a mediaData blob.
1447 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1449 self._downloader.trouble(u'ERROR: unable to extract media URL')
1451 vardict = parse_qs(mobj.group(1))
1452 if 'mediaData' not in vardict:
1453 self._downloader.trouble(u'ERROR: unable to extract media URL')
1455 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1457 self._downloader.trouble(u'ERROR: unable to extract media URL')
# mediaURL is JSON-escaped ('\\/'); undo the slash escaping.
1459 mediaURL = mobj.group(1).replace('\\/', '/')
1460 video_extension = mediaURL[-3:]
1461 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1463 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1465 self._downloader.trouble(u'ERROR: unable to extract title')
1467 video_title = mobj.group(1).decode('utf-8')
1468 video_title = sanitize_title(video_title)
1470 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1472 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1474 video_uploader = mobj.group(1)
1477 # Process video information
1478 self._downloader.process_info({
1479 'id': video_id.decode('utf-8'),
1480 'url': video_url.decode('utf-8'),
1481 'uploader': video_uploader.decode('utf-8'),
1482 'upload_date': u'NA',
1483 'title': video_title,
1484 'stitle': simple_title,
1485 'ext': video_extension.decode('utf-8'),
1489 except UnavailableVideoError:
1490 self._downloader.trouble(u'\nERROR: unable to download video')
1493 class DailymotionIE(InfoExtractor):
1494 """Information Extractor for Dailymotion"""
# _VALID_URL: group(1) = video id, group(2) = simplified title slug.
1496 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1498 def __init__(self, downloader=None):
1499 InfoExtractor.__init__(self, downloader)
1503 return (re.match(DailymotionIE._VALID_URL, url) is not None)
1505 def report_download_webpage(self, video_id):
1506 """Report webpage download."""
1507 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1509 def report_extraction(self, video_id):
1510 """Report information extraction."""
1511 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1513 def _real_initialize(self):
1516 def _real_extract(self, url):
1517 # Extract id and simplified title from URL
1518 mobj = re.match(self._VALID_URL, url)
1520 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1523 # At this point we have a new video
1524 self._downloader.increment_downloads()
1525 video_id = mobj.group(1)
1527 simple_title = mobj.group(2).decode('utf-8')
1528 video_extension = 'flv'
1530 # Retrieve video webpage to extract further information
1531 request = urllib2.Request(url)
# Disable the content filter so age-restricted pages are served.
1532 request.add_header('Cookie', 'family_filter=off')
1534 self.report_download_webpage(video_id)
1535 webpage = urllib2.urlopen(request).read()
1536 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1537 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1540 # Extract URL, uploader and title from webpage
1541 self.report_extraction(video_id)
# The player's 'sequence' flashvar holds the stream descriptors.
1542 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
1544 self._downloader.trouble(u'ERROR: unable to extract media URL')
1546 sequence = urllib.unquote(mobj.group(1))
# 'sdURL' = standard-definition stream URL inside the sequence blob.
1547 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
1549 self._downloader.trouble(u'ERROR: unable to extract media URL')
1551 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
1553 # if needed add http://www.dailymotion.com/ if relative URL
1555 video_url = mediaURL
1557 mobj = re.search(r'(?im)<title>Dailymotion\s*-\s*(.+)\s*-\s*[^<]+?</title>', webpage)
1559 self._downloader.trouble(u'ERROR: unable to extract title')
1561 video_title = mobj.group(1).decode('utf-8')
1562 video_title = sanitize_title(video_title)
1564 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
1566 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1568 video_uploader = mobj.group(1)
1571 # Process video information
1572 self._downloader.process_info({
1573 'id': video_id.decode('utf-8'),
1574 'url': video_url.decode('utf-8'),
1575 'uploader': video_uploader.decode('utf-8'),
1576 'upload_date': u'NA',
1577 'title': video_title,
1578 'stitle': simple_title,
1579 'ext': video_extension.decode('utf-8'),
1583 except UnavailableVideoError:
1584 self._downloader.trouble(u'\nERROR: unable to download video')
1587 class GoogleIE(InfoExtractor):
1588 """Information extractor for video.google.com."""
1590 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1592 def __init__(self, downloader=None):
1593 InfoExtractor.__init__(self, downloader)
1597 return (re.match(GoogleIE._VALID_URL, url) is not None)
1599 def report_download_webpage(self, video_id):
1600 """Report webpage download."""
1601 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1603 def report_extraction(self, video_id):
1604 """Report information extraction."""
1605 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1607 def _real_initialize(self):
1610 def _real_extract(self, url):
1611 # Extract id from URL
1612 mobj = re.match(self._VALID_URL, url)
1614 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1617 # At this point we have a new video
1618 self._downloader.increment_downloads()
1619 video_id = mobj.group(1)
1621 video_extension = 'mp4'
1623 # Retrieve video webpage to extract further information
1624 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1626 self.report_download_webpage(video_id)
1627 webpage = urllib2.urlopen(request).read()
1628 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1629 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1632 # Extract URL, uploader, and title from webpage
1633 self.report_extraction(video_id)
# Prefer the mp4 download_url; otherwise fall back to the flv videoUrl,
# which is embedded with JS hex escapes (\x3d '=', \x26 '&') to undo.
1634 mobj = re.search(r"download_url:'([^']+)'", webpage)
1636 video_extension = 'flv'
1637 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1639 self._downloader.trouble(u'ERROR: unable to extract media URL')
1641 mediaURL = urllib.unquote(mobj.group(1))
1642 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1643 mediaURL = mediaURL.replace('\\x26', '\x26')
1645 video_url = mediaURL
1647 mobj = re.search(r'<title>(.*)</title>', webpage)
1649 self._downloader.trouble(u'ERROR: unable to extract title')
1651 video_title = mobj.group(1).decode('utf-8')
1652 video_title = sanitize_title(video_title)
1653 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1655 # Extract video description
1656 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1658 self._downloader.trouble(u'ERROR: unable to extract video description')
1660 video_description = mobj.group(1).decode('utf-8')
1661 if not video_description:
1662 video_description = 'No description available.'
1664 # Extract video thumbnail
# Thumbnail requires an extra search-page request, so only do it on demand.
1665 if self._downloader.params.get('forcethumbnail', False):
# abs(int(...)): docids can be negative; the search query wants the magnitude.
1666 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1668 webpage = urllib2.urlopen(request).read()
1669 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1670 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1672 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1674 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1676 video_thumbnail = mobj.group(1)
1677 else: # we need something to pass to process_info
1678 video_thumbnail = ''
1681 # Process video information
1682 self._downloader.process_info({
1683 'id': video_id.decode('utf-8'),
1684 'url': video_url.decode('utf-8'),
1686 'upload_date': u'NA',
1687 'title': video_title,
1688 'stitle': simple_title,
1689 'ext': video_extension.decode('utf-8'),
1693 except UnavailableVideoError:
1694 self._downloader.trouble(u'\nERROR: unable to download video')
1697 class PhotobucketIE(InfoExtractor):
1698 """Information extractor for photobucket.com."""
# _VALID_URL: group(1) = the .flv filename from the 'current' query parameter.
1700 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1702 def __init__(self, downloader=None):
1703 InfoExtractor.__init__(self, downloader)
1707 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1709 def report_download_webpage(self, video_id):
1710 """Report webpage download."""
1711 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1713 def report_extraction(self, video_id):
1714 """Report information extraction."""
1715 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1717 def _real_initialize(self):
1720 def _real_extract(self, url):
1721 # Extract id from URL
1722 mobj = re.match(self._VALID_URL, url)
1724 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1727 # At this point we have a new video
1728 self._downloader.increment_downloads()
1729 video_id = mobj.group(1)
1731 video_extension = 'flv'
1733 # Retrieve video webpage to extract further information
1734 request = urllib2.Request(url)
1736 self.report_download_webpage(video_id)
1737 webpage = urllib2.urlopen(request).read()
1738 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1739 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1742 # Extract URL, uploader, and title from webpage
1743 self.report_extraction(video_id)
# The media URL lives in the video_src <link> tag's 'file' parameter.
1744 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1746 self._downloader.trouble(u'ERROR: unable to extract media URL')
1748 mediaURL = urllib.unquote(mobj.group(1))
1750 video_url = mediaURL
# One <title> match yields both title (group 1) and uploader (group 2).
1752 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1754 self._downloader.trouble(u'ERROR: unable to extract title')
1756 video_title = mobj.group(1).decode('utf-8')
1757 video_title = sanitize_title(video_title)
1758 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1760 video_uploader = mobj.group(2).decode('utf-8')
1763 # Process video information
1764 self._downloader.process_info({
1765 'id': video_id.decode('utf-8'),
1766 'url': video_url.decode('utf-8'),
1767 'uploader': video_uploader,
1768 'upload_date': u'NA',
1769 'title': video_title,
1770 'stitle': simple_title,
1771 'ext': video_extension.decode('utf-8'),
1775 except UnavailableVideoError:
1776 self._downloader.trouble(u'\nERROR: unable to download video')
1779 class YahooIE(InfoExtractor):
1780 """Information extractor for video.yahoo.com."""
1782 # _VALID_URL matches all Yahoo! Video URLs
1783 # _VPAGE_URL matches only the extractable '/watch/' URLs
1784 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1785 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1787 def __init__(self, downloader=None):
1788 InfoExtractor.__init__(self, downloader)
1792 return (re.match(YahooIE._VALID_URL, url) is not None)
1794 def report_download_webpage(self, video_id):
1795 """Report webpage download."""
1796 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1798 def report_extraction(self, video_id):
1799 """Report information extraction."""
1800 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1802 def _real_initialize(self):
1805 def _real_extract(self, url, new_video=True):
1806 # Extract ID from URL
1807 mobj = re.match(self._VALID_URL, url)
1809 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1812 # At this point we have a new video
1813 self._downloader.increment_downloads()
1814 video_id = mobj.group(2)
1815 video_extension = 'flv'
1817 # Rewrite valid but non-extractable URLs as
1818 # extractable English language /watch/ URLs
# Non-/watch/ URLs: scrape the page for the real ("id", "vid") pair,
# rebuild a canonical /watch/ URL and recurse once (new_video=False).
1819 if re.match(self._VPAGE_URL, url) is None:
1820 request = urllib2.Request(url)
1822 webpage = urllib2.urlopen(request).read()
1823 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1824 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1827 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1829 self._downloader.trouble(u'ERROR: Unable to extract id field')
1831 yahoo_id = mobj.group(1)
1833 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1835 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1837 yahoo_vid = mobj.group(1)
1839 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1840 return self._real_extract(url, new_video=False)
1842 # Retrieve video webpage to extract further information
1843 request = urllib2.Request(url)
1845 self.report_download_webpage(video_id)
1846 webpage = urllib2.urlopen(request).read()
1847 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1848 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1851 # Extract uploader and title from webpage
1852 self.report_extraction(video_id)
1853 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1855 self._downloader.trouble(u'ERROR: unable to extract video title')
1857 video_title = mobj.group(1).decode('utf-8')
1858 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1860 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1862 self._downloader.trouble(u'ERROR: unable to extract video uploader')
# NOTE(review): group(1) captures 'people'|'profile' from the href; the
# uploader display name appears to be group(2) — confirm and fix upstream.
1864 video_uploader = mobj.group(1).decode('utf-8')
1866 # Extract video thumbnail
1867 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1869 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1871 video_thumbnail = mobj.group(1).decode('utf-8')
1873 # Extract video description
1874 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1876 self._downloader.trouble(u'ERROR: unable to extract video description')
1878 video_description = mobj.group(1).decode('utf-8')
1879 if not video_description:
1880 video_description = 'No description available.'
1882 # Extract video height and width
1883 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1885 self._downloader.trouble(u'ERROR: unable to extract video height')
1887 yv_video_height = mobj.group(1)
1889 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1891 self._downloader.trouble(u'ERROR: unable to extract video width')
1893 yv_video_width = mobj.group(1)
1895 # Retrieve video playlist to extract media URL
1896 # I'm not completely sure what all these options are, but we
1897 # seem to need most of them, otherwise the server sends a 401.
1898 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1899 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1900 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1901 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1902 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1904 self.report_download_webpage(video_id)
1905 webpage = urllib2.urlopen(request).read()
1906 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1907 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1910 # Extract media URL from playlist XML
1911 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1913 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1915 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1916 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1919 # Process video information
1920 self._downloader.process_info({
1921 'id': video_id.decode('utf-8'),
1923 'uploader': video_uploader,
1924 'upload_date': u'NA',
1925 'title': video_title,
1926 'stitle': simple_title,
1927 'ext': video_extension.decode('utf-8'),
1928 'thumbnail': video_thumbnail.decode('utf-8'),
1929 'description': video_description,
# NOTE(review): duplicate 'thumbnail' key — this later, undecoded value is
# the one the dict literal keeps.
1930 'thumbnail': video_thumbnail,
1933 except UnavailableVideoError:
1934 self._downloader.trouble(u'\nERROR: unable to download video')
1937 class VimeoIE(InfoExtractor):
1938 """Information extractor for vimeo.com."""
1940 # _VALID_URL matches Vimeo URLs
1941 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
1943 def __init__(self, downloader=None):
1944 InfoExtractor.__init__(self, downloader)
1948 return (re.match(VimeoIE._VALID_URL, url) is not None)
1950 def report_download_webpage(self, video_id):
1951 """Report webpage download."""
1952 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1954 def report_extraction(self, video_id):
1955 """Report information extraction."""
1956 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1958 def _real_initialize(self):
1961 def _real_extract(self, url, new_video=True):
1962 # Extract ID from URL
1963 mobj = re.match(self._VALID_URL, url)
1965 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1968 # At this point we have a new video
1969 self._downloader.increment_downloads()
1970 video_id = mobj.group(1)
1972 # Retrieve video webpage to extract further information
# The moogaloop 'load' endpoint returns XML metadata for the clip.
1973 request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
1975 self.report_download_webpage(video_id)
1976 webpage = urllib2.urlopen(request).read()
1977 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1978 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1981 # Now we begin extracting as much information as we can from what we
1982 # retrieved. First we extract the information common to all extractors,
1983 # and latter we extract those that are Vimeo specific.
1984 self.report_extraction(video_id)
1987 mobj = re.search(r'<caption>(.*?)</caption>', webpage)
1989 self._downloader.trouble(u'ERROR: unable to extract video title')
1991 video_title = mobj.group(1).decode('utf-8')
1992 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1995 mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
1997 self._downloader.trouble(u'ERROR: unable to extract video uploader')
1999 video_uploader = mobj.group(1).decode('utf-8')
2001 # Extract video thumbnail
2002 mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
2004 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2006 video_thumbnail = mobj.group(1).decode('utf-8')
2008 # # Extract video description
2009 # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
2011 # self._downloader.trouble(u'ERROR: unable to extract video description')
2013 # video_description = mobj.group(1).decode('utf-8')
2014 # if not video_description: video_description = 'No description available.'
# NOTE(review): hard-coded placeholder — real description extraction above is
# commented out, so every Vimeo video gets the literal description 'Foo.'.
2015 video_description = 'Foo.'
2017 # Vimeo specific: extract request signature
2018 mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
2020 self._downloader.trouble(u'ERROR: unable to extract request signature')
2022 sig = mobj.group(1).decode('utf-8')
2024 # Vimeo specific: Extract request signature expiration
2025 mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
2027 self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
2029 sig_exp = mobj.group(1).decode('utf-8')
# Play URL requires the clip id plus the signature and its expiry timestamp.
2031 video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s" % (video_id, sig, sig_exp)
2034 # Process video information
2035 self._downloader.process_info({
2036 'id': video_id.decode('utf-8'),
2038 'uploader': video_uploader,
2039 'upload_date': u'NA',
2040 'title': video_title,
2041 'stitle': simple_title,
2043 'thumbnail': video_thumbnail.decode('utf-8'),
2044 'description': video_description,
# NOTE(review): duplicate 'thumbnail'/'description' keys — these later
# entries (thumbnail undecoded) are what the dict literal keeps.
2045 'thumbnail': video_thumbnail,
2046 'description': video_description,
2049 except UnavailableVideoError:
2050 self._downloader.trouble(u'ERROR: unable to download video')
2053 class GenericIE(InfoExtractor):
2054 """Generic last-resort information extractor."""
2056 def __init__(self, downloader=None):
2057 InfoExtractor.__init__(self, downloader)
2063 def report_download_webpage(self, video_id):
2064 """Report webpage download."""
# Warn first: reaching this extractor means no site-specific one matched.
2065 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2066 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2068 def report_extraction(self, video_id):
2069 """Report information extraction."""
2070 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2072 def _real_initialize(self):
2075 def _real_extract(self, url):
2076 # At this point we have a new video
2077 self._downloader.increment_downloads()
2079 video_id = url.split('/')[-1]
2080 request = urllib2.Request(url)
2082 self.report_download_webpage(video_id)
2083 webpage = urllib2.urlopen(request).read()
2084 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2085 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2087 except ValueError, err:
2088 # since this is the last-resort InfoExtractor, if
2089 # this error is thrown, it'll be thrown here
2090 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2093 self.report_extraction(video_id)
2094 # Start with something easy: JW Player in SWFObject
2095 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2097 # Broaden the search a little bit
2098 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2100 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2103 # It's possible that one of the regexes
2104 # matched, but returned an empty group:
2105 if mobj.group(1) is None:
2106 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2109 video_url = urllib.unquote(mobj.group(1))
2110 video_id = os.path.basename(video_url)
2112 # here's a fun little line of code for you:
2113 video_extension = os.path.splitext(video_id)[1][1:]
2114 video_id = os.path.splitext(video_id)[0]
2116 # it's tempting to parse this further, but you would
2117 # have to take into account all the variations like
2118 # Video Title - Site Name
2119 # Site Name | Video Title
2120 # Video Title - Tagline | Site Name
2121 # and so on and so forth; it's just not practical
2122 mobj = re.search(r'<title>(.*)</title>', webpage)
2124 self._downloader.trouble(u'ERROR: unable to extract title')
2126 video_title = mobj.group(1).decode('utf-8')
2127 video_title = sanitize_title(video_title)
2128 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2130 # video uploader is domain name
2131 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2133 self._downloader.trouble(u'ERROR: unable to extract title')
2135 video_uploader = mobj.group(1).decode('utf-8')
2138 # Process video information
2139 self._downloader.process_info({
2140 'id': video_id.decode('utf-8'),
2141 'url': video_url.decode('utf-8'),
2142 'uploader': video_uploader,
2143 'upload_date': u'NA',
2144 'title': video_title,
2145 'stitle': simple_title,
2146 'ext': video_extension.decode('utf-8'),
2150 except UnavailableVideoError, err:
2151 self._downloader.trouble(u'\nERROR: unable to download video')
2154 class YoutubeSearchIE(InfoExtractor):
2155 """Information Extractor for YouTube search queries."""
# Query syntax: "ytsearch:<terms>", "ytsearchN:<terms>", or "ytsearchall:<terms>".
2156 _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
2157 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2158 _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
2159 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
# Hard cap on how many results "all" / large N can request.
2161 _max_youtube_results = 1000
2163 def __init__(self, youtube_ie, downloader=None):
2164 InfoExtractor.__init__(self, downloader)
# Actual per-video extraction is delegated to this YoutubeIE instance.
2165 self._youtube_ie = youtube_ie
2169 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
2171 def report_download_page(self, query, pagenum):
2172 """Report attempt to download playlist page with given number."""
2173 query = query.decode(preferredencoding())
2174 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2176 def _real_initialize(self):
2177 self._youtube_ie.initialize()
2179 def _real_extract(self, query):
2180 mobj = re.match(self._VALID_QUERY, query)
2182 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
# Split the "ytsearch[N|all]" prefix from the search terms.
2185 prefix, query = query.split(':')
2187 query = query.encode('utf-8')
2189 self._download_n_results(query, 1)
2191 elif prefix == 'all':
2192 self._download_n_results(query, self._max_youtube_results)
2198 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2200 elif n > self._max_youtube_results:
2201 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2202 n = self._max_youtube_results
2203 self._download_n_results(query, n)
2205 except ValueError: # parsing prefix as integer fails
2206 self._download_n_results(query, 1)
2209 def _download_n_results(self, query, n):
2210 """Downloads a specified number of results for a query"""
# Dedupe ids across result pages (the same video can appear more than once).
2213 already_seen = set()
2217 self.report_download_page(query, pagenum)
2218 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2219 request = urllib2.Request(result_url)
2221 page = urllib2.urlopen(request).read()
2222 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2223 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2226 # Extract video identifiers
2227 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
# Matched text is an href attribute like href="/watch?v=ID"; split on '='
# and drop the trailing quote to recover the bare video id.
2228 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2229 if video_id not in already_seen:
2230 video_ids.append(video_id)
2231 already_seen.add(video_id)
2232 if len(video_ids) == n:
2233 # Specified n videos reached
2234 for id in video_ids:
2235 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
# No "Next" link means this was the last results page: extract what we have.
2238 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2239 for id in video_ids:
2240 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2243 pagenum = pagenum + 1
2246 class GoogleSearchIE(InfoExtractor):
2247 """Information Extractor for Google Video search queries."""
# Query syntax: "gvsearch:<terms>", "gvsearchN:<terms>", or "gvsearchall:<terms>".
2248 _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
2249 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2250 _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2251 _MORE_PAGES_INDICATOR = r'<span>Next</span>'
# Hard cap on how many results "all" / large N can request.
2253 _max_google_results = 1000
2255 def __init__(self, google_ie, downloader=None):
2256 InfoExtractor.__init__(self, downloader)
# Actual per-video extraction is delegated to this GoogleIE instance.
2257 self._google_ie = google_ie
2261 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
2263 def report_download_page(self, query, pagenum):
2264 """Report attempt to download playlist page with given number."""
2265 query = query.decode(preferredencoding())
2266 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2268 def _real_initialize(self):
2269 self._google_ie.initialize()
2271 def _real_extract(self, query):
2272 mobj = re.match(self._VALID_QUERY, query)
2274 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
# Split the "gvsearch[N|all]" prefix from the search terms.
2277 prefix, query = query.split(':')
2279 query = query.encode('utf-8')
2281 self._download_n_results(query, 1)
2283 elif prefix == 'all':
2284 self._download_n_results(query, self._max_google_results)
2290 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2292 elif n > self._max_google_results:
2293 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2294 n = self._max_google_results
2295 self._download_n_results(query, n)
2297 except ValueError: # parsing prefix as integer fails
2298 self._download_n_results(query, 1)
2301 def _download_n_results(self, query, n):
2302 """Downloads a specified number of results for a query"""
# Dedupe ids across result pages.
2305 already_seen = set()
2309 self.report_download_page(query, pagenum)
2310 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2311 request = urllib2.Request(result_url)
2313 page = urllib2.urlopen(request).read()
2314 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2315 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2318 # Extract video identifiers
2319 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2320 video_id = mobj.group(1)
2321 if video_id not in already_seen:
2322 video_ids.append(video_id)
2323 already_seen.add(video_id)
2324 if len(video_ids) == n:
2325 # Specified n videos reached
2326 for id in video_ids:
2327 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
# No "Next" link means this was the last results page: extract what we have.
2330 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2331 for id in video_ids:
2332 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2335 pagenum = pagenum + 1
2338 class YahooSearchIE(InfoExtractor):
2339 """Information Extractor for Yahoo! Video search queries."""
# Query syntax: "yvsearch:<terms>", "yvsearchN:<terms>", or "yvsearchall:<terms>".
2340 _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
2341 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2342 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2343 _MORE_PAGES_INDICATOR = r'\s*Next'
# Hard cap on how many results "all" / large N can request.
2345 _max_yahoo_results = 1000
2347 def __init__(self, yahoo_ie, downloader=None):
2348 InfoExtractor.__init__(self, downloader)
# Actual per-video extraction is delegated to this YahooIE instance.
2349 self._yahoo_ie = yahoo_ie
2353 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
2355 def report_download_page(self, query, pagenum):
2356 """Report attempt to download playlist page with given number."""
2357 query = query.decode(preferredencoding())
2358 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2360 def _real_initialize(self):
2361 self._yahoo_ie.initialize()
2363 def _real_extract(self, query):
2364 mobj = re.match(self._VALID_QUERY, query)
2366 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
# Split the "yvsearch[N|all]" prefix from the search terms.
2369 prefix, query = query.split(':')
2371 query = query.encode('utf-8')
2373 self._download_n_results(query, 1)
2375 elif prefix == 'all':
2376 self._download_n_results(query, self._max_yahoo_results)
2382 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2384 elif n > self._max_yahoo_results:
2385 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2386 n = self._max_yahoo_results
2387 self._download_n_results(query, n)
2389 except ValueError: # parsing prefix as integer fails
2390 self._download_n_results(query, 1)
2393 def _download_n_results(self, query, n):
2394 """Downloads a specified number of results for a query"""
# Dedupe ids across result pages.
2397 already_seen = set()
2401 self.report_download_page(query, pagenum)
2402 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2403 request = urllib2.Request(result_url)
2405 page = urllib2.urlopen(request).read()
2406 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2407 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2410 # Extract video identifiers
2411 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
# Yahoo ids are two numbers separated by a slash, e.g. "1234/56789".
2412 video_id = mobj.group(1)
2413 if video_id not in already_seen:
2414 video_ids.append(video_id)
2415 already_seen.add(video_id)
2416 if len(video_ids) == n:
2417 # Specified n videos reached
2418 for id in video_ids:
2419 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
# No "Next" link means this was the last results page: extract what we have.
2422 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2423 for id in video_ids:
2424 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2427 pagenum = pagenum + 1
2430 class YoutubePlaylistIE(InfoExtractor):
2431 """Information Extractor for YouTube playlists."""
# Group 1: playlist-type discriminator ('p', 'a', or 'list'); group 2: playlist id;
# group 3 (optional): an individual video id inside the playlist URL.
2433 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2434 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2435 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2436 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2439 def __init__(self, youtube_ie, downloader=None):
2440 InfoExtractor.__init__(self, downloader)
# Actual per-video extraction is delegated to this YoutubeIE instance.
2441 self._youtube_ie = youtube_ie
2445 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
2447 def report_download_page(self, playlist_id, pagenum):
2448 """Report attempt to download playlist page with given number."""
2449 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2451 def _real_initialize(self):
2452 self._youtube_ie.initialize()
2454 def _real_extract(self, url):
2455 # Extract playlist id
2456 mobj = re.match(self._VALID_URL, url)
2458 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
# URL points at a single video within the playlist: extract just that one.
2462 if mobj.group(3) is not None:
2463 self._youtube_ie.extract(mobj.group(3))
2466 # Download playlist pages
2467 # prefix is 'p' as default for playlists but there are other types that need extra care
2468 playlist_prefix = mobj.group(1)
2469 if playlist_prefix == 'a':
2470 playlist_access = 'artist'
# Default case: ordinary playlist, served via view_play_list with prefix 'p'.
2472 playlist_prefix = 'p'
2473 playlist_access = 'view_play_list'
2474 playlist_id = mobj.group(2)
2479 self.report_download_page(playlist_id, pagenum)
2480 request = urllib2.Request(self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum))
2482 page = urllib2.urlopen(request).read()
2483 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2484 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2487 # Extract video identifiers
2489 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2490 if mobj.group(1) not in ids_in_page:
2491 ids_in_page.append(mobj.group(1))
2492 video_ids.extend(ids_in_page)
# No "Next" link means this was the last playlist page.
2494 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2496 pagenum = pagenum + 1
# Honor --playlist-start/--playlist-end: params are 1-based, slicing is 0-based.
2498 playliststart = self._downloader.params.get('playliststart', 1) - 1
2499 playlistend = self._downloader.params.get('playlistend', -1)
2500 video_ids = video_ids[playliststart:playlistend]
2502 for id in video_ids:
2503 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2507 class YoutubeUserIE(InfoExtractor):
2508 """Information Extractor for YouTube users."""
# Accepts either a youtube.com/user/<name> URL or the "ytuser:<name>" shorthand.
2510 _VALID_URL = r'(?:(?:(?:http://)?(?:\w+\.)?youtube.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2511 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
# GData API caps each uploads query at 50 entries, so we page through results.
2512 _GDATA_PAGE_SIZE = 50
2513 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2514 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2517 def __init__(self, youtube_ie, downloader=None):
2518 InfoExtractor.__init__(self, downloader)
# Actual per-video extraction is delegated to this YoutubeIE instance.
2519 self._youtube_ie = youtube_ie
2523 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
2525 def report_download_page(self, username, start_index):
2526 """Report attempt to download user page."""
2527 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2528 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2530 def _real_initialize(self):
2531 self._youtube_ie.initialize()
2533 def _real_extract(self, url):
2535 mobj = re.match(self._VALID_URL, url)
2537 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2540 username = mobj.group(1)
2542 # Download video ids using YouTube Data API. Result size per
2543 # query is limited (currently to 50 videos) so we need to query
2544 # page by page until there are no video ids - it means we got
# GData start-index is 1-based.
2551 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2552 self.report_download_page(username, start_index)
2554 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2557 page = urllib2.urlopen(request).read()
2558 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2559 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2562 # Extract video identifiers
2565 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2566 if mobj.group(1) not in ids_in_page:
2567 ids_in_page.append(mobj.group(1))
2569 video_ids.extend(ids_in_page)
2571 # A little optimization - if current page is not
2572 # "full", ie. does not contain PAGE_SIZE video ids then
2573 # we can assume that this page is the last one - there
2574 # are no more ids on further pages - no need to query
2577 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2582 all_ids_count = len(video_ids)
# Honor --playlist-start/--playlist-end: params are 1-based, slicing is 0-based.
2583 playliststart = self._downloader.params.get('playliststart', 1) - 1
2584 playlistend = self._downloader.params.get('playlistend', -1)
# playlistend == -1 means "no upper bound"; slicing with -1 would drop the
# last id, so that case is handled with an open-ended slice.
2586 if playlistend == -1:
2587 video_ids = video_ids[playliststart:]
2589 video_ids = video_ids[playliststart:playlistend]
2591 self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2592 (username, all_ids_count, len(video_ids)))
2594 for video_id in video_ids:
2595 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2598 class DepositFilesIE(InfoExtractor):
2599 """Information extractor for depositfiles.com"""
# The (?#locale) comment documents that the optional path component is a locale code.
2601 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
2603 def __init__(self, downloader=None):
2604 InfoExtractor.__init__(self, downloader)
2608 return (re.match(DepositFilesIE._VALID_URL, url) is not None)
2610 def report_download_webpage(self, file_id):
2611 """Report webpage download."""
2612 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2614 def report_extraction(self, file_id):
2615 """Report information extraction."""
2616 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2618 def _real_initialize(self):
2621 def _real_extract(self, url):
2622 # At this point we have a new file
2623 self._downloader.increment_downloads()
2625 file_id = url.split('/')[-1]
2626 # Rebuild url in english locale
2627 url = 'http://depositfiles.com/en/files/' + file_id
2629 # Retrieve file webpage with 'Free download' button pressed
# POSTing gateway_result=1 simulates clicking the "Free download" button.
2630 free_download_indication = { 'gateway_result' : '1' }
2631 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2633 self.report_download_webpage(file_id)
2634 webpage = urllib2.urlopen(request).read()
2635 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2636 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2639 # Search for the real file URL
2640 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2641 if (mobj is None) or (mobj.group(1) is None):
2642 # Try to figure out reason of the error.
# The site reports restrictions (e.g. download limits) in an <strong>Attention...</strong> block.
2643 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2644 if (mobj is not None) and (mobj.group(1) is not None):
# Collapse whitespace in the multi-line restriction text before reporting it.
2645 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2646 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2648 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2651 file_url = mobj.group(1)
2652 file_extension = os.path.splitext(file_url)[1][1:]
2654 # Search for file title
2655 mobj = re.search(r'<b title="(.*?)">', webpage)
2657 self._downloader.trouble(u'ERROR: unable to extract title')
2659 file_title = mobj.group(1).decode('utf-8')
2662 # Process file information
2663 self._downloader.process_info({
2664 'id': file_id.decode('utf-8'),
2665 'url': file_url.decode('utf-8'),
2667 'upload_date': u'NA',
2668 'title': file_title,
2669 'stitle': file_title,
2670 'ext': file_extension.decode('utf-8'),
2674 except UnavailableVideoError, err:
2675 self._downloader.trouble(u'ERROR: unable to download file')
2678 class FacebookIE(InfoExtractor):
2679 """Information Extractor for Facebook"""
2681 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook.com/video/video.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2682 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2683 _NETRC_MACHINE = 'facebook'
# Formats in preference order: best quality first.
2684 _available_formats = ['highqual', 'lowqual']
2685 _video_extensions = {
2690 def __init__(self, downloader=None):
2691 InfoExtractor.__init__(self, downloader)
2695 return (re.match(FacebookIE._VALID_URL, url) is not None)
2697 def _reporter(self, message):
2698 """Add header and report message."""
2699 self._downloader.to_screen(u'[facebook] %s' % message)
2701 def report_login(self):
2702 """Report attempt to log in."""
2703 self._reporter(u'Logging in')
2705 def report_video_webpage_download(self, video_id):
2706 """Report attempt to download video webpage."""
2707 self._reporter(u'%s: Downloading video webpage' % video_id)
2709 def report_information_extraction(self, video_id):
2710 """Report attempt to extract video information."""
2711 self._reporter(u'%s: Extracting video information' % video_id)
2713 def _parse_page(self, video_webpage):
2714 """Extract video information from page"""
# Table of metadata field name -> regex used to scrape it from the page HTML.
2716 data = {'title': r'class="video_title datawrap">(.*?)</',
2717 'description': r'<div class="datawrap">(.*?)</div>',
2718 'owner': r'\("video_owner_name", "(.*?)"\)',
2719 'upload_date': r'data-date="(.*?)"',
2720 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
# Fields whose regex does not match are simply omitted from video_info.
2723 for piece in data.keys():
2724 mobj = re.search(data[piece], video_webpage)
2725 if mobj is not None:
2726 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
# Collect one media URL per available format ("highqual"/"lowqual").
2730 for fmt in self._available_formats:
2731 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2732 if mobj is not None:
2733 # URL is in a Javascript segment inside an escaped Unicode format within
2734 # the generally utf-8 page
2735 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2736 video_info['video_urls'] = video_urls
2740 def _real_initialize(self):
# Without a downloader there is nowhere to get credentials from; nothing to do.
2741 if self._downloader is None:
2746 downloader_params = self._downloader.params
2748 # Attempt to use provided username and password or .netrc data
2749 if downloader_params.get('username', None) is not None:
2750 useremail = downloader_params['username']
2751 password = downloader_params['password']
2752 elif downloader_params.get('usenetrc', False):
# Fall back to the user's ~/.netrc entry for machine "facebook".
2754 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2755 if info is not None:
2759 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2760 except (IOError, netrc.NetrcParseError), err:
2761 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2764 if useremail is None:
# POST the login form; a login <form> in the response means the login failed.
2773 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2776 login_results = urllib2.urlopen(request).read()
2777 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2778 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2780 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2781 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2784 def _real_extract(self, url):
2785 mobj = re.match(self._VALID_URL, url)
2787 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2789 video_id = mobj.group('ID')
2792 self.report_video_webpage_download(video_id)
2793 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2795 page = urllib2.urlopen(request)
2796 video_webpage = page.read()
2797 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2798 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2801 # Start extracting information
2802 self.report_information_extraction(video_id)
2804 # Extract information
2805 video_info = self._parse_page(video_webpage)
2808 if 'owner' not in video_info:
2809 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2811 video_uploader = video_info['owner']
2814 if 'title' not in video_info:
2815 self._downloader.trouble(u'ERROR: unable to extract video title')
2817 video_title = video_info['title']
2818 video_title = video_title.decode('utf-8')
2819 video_title = sanitize_title(video_title)
# Build a filesystem-safe "simple title" from the sanitized title.
2822 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2823 simple_title = simple_title.strip(ur'_')
# Thumbnail is optional: warn and continue with an empty value.
2826 if 'thumbnail' not in video_info:
2827 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2828 video_thumbnail = ''
2830 video_thumbnail = video_info['thumbnail']
# Upload date: parse the RFC-2822 style date into YYYYMMDD when possible.
2834 if 'upload_date' in video_info:
2835 upload_time = video_info['upload_date']
2836 timetuple = email.utils.parsedate_tz(upload_time)
2837 if timetuple is not None:
2839 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2844 video_description = video_info.get('description', 'No description available.')
2846 url_map = video_info['video_urls']
2847 if len(url_map.keys()) > 0:
2848 # Decide which formats to download
2849 req_format = self._downloader.params.get('format', None)
2850 format_limit = self._downloader.params.get('format_limit', None)
# format_limit caps quality: only formats at or below the limit are considered.
2852 if format_limit is not None and format_limit in self._available_formats:
2853 format_list = self._available_formats[self._available_formats.index(format_limit):]
2855 format_list = self._available_formats
2856 existing_formats = [x for x in format_list if x in url_map]
2857 if len(existing_formats) == 0:
2858 self._downloader.trouble(u'ERROR: no known formats available for video')
2860 if req_format is None:
2861 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2862 elif req_format == '-1':
2863 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2866 if req_format not in url_map:
2867 self._downloader.trouble(u'ERROR: requested format not available')
2869 video_url_list = [(req_format, url_map[req_format])] # Specific format
2871 for format_param, video_real_url in video_url_list:
2873 # At this point we have a new video
2874 self._downloader.increment_downloads()
# Extension is keyed on the format; default to mp4 for unknown formats.
2877 video_extension = self._video_extensions.get(format_param, 'mp4')
2880 # Process video information
2881 self._downloader.process_info({
2882 'id': video_id.decode('utf-8'),
2883 'url': video_real_url.decode('utf-8'),
2884 'uploader': video_uploader.decode('utf-8'),
2885 'upload_date': upload_date,
2886 'title': video_title,
2887 'stitle': simple_title,
2888 'ext': video_extension.decode('utf-8'),
2889 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2890 'thumbnail': video_thumbnail.decode('utf-8'),
2891 'description': video_description.decode('utf-8'),
2894 except UnavailableVideoError, err:
2895 self._downloader.trouble(u'\nERROR: unable to download video')
2897 class BlipTVIE(InfoExtractor):
2898 """Information extractor for blip.tv"""
2900 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
# Used to pull the filename extension off the media URL.
2901 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2905 return (re.match(BlipTVIE._VALID_URL, url) is not None)
2907 def report_extraction(self, file_id):
2908 """Report information extraction."""
2909 self._downloader.to_screen(u'[blip.tv] %s: Extracting information' % file_id)
2911 def _simplify_title(self, title):
# Replace every run of non-safe characters with '_' and trim leading/trailing '_'.
2912 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
2913 res = res.strip(ur'_')
2916 def _real_extract(self, url):
2917 mobj = re.match(self._VALID_URL, url)
2919 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Ask blip.tv for machine-readable metadata via its JSON "skin".
# cchar is the query separator ('?' or '&') chosen in the preceding lines.
2926 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2927 request = urllib2.Request(json_url)
2928 self.report_extraction(mobj.group(1))
2930 json_code = urllib2.urlopen(request).read()
2931 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2932 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
2935 json_data = json.loads(json_code)
# Some responses wrap the payload in a top-level 'Post' object.
2936 if 'Post' in json_data:
2937 data = json_data['Post']
# blip.tv datestamps look like '%m-%d-%y %H:%M%p'; normalize to YYYYMMDD.
2941 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2942 video_url = data['media']['url']
2943 umobj = re.match(self._URL_EXT, video_url)
2945 raise ValueError('Can not determine filename extension')
2946 ext = umobj.group(1)
2948 self._downloader.increment_downloads()
2951 'id': data['item_id'],
2953 'uploader': data['display_name'],
2954 'upload_date': upload_date,
2955 'title': data['title'],
2956 'stitle': self._simplify_title(data['title']),
2958 'format': data['media']['mimeType'],
2959 'thumbnail': data['thumbnailUrl'],
2960 'description': data['description'],
2961 'player_url': data['embedUrl']
# KeyError here means the JSON payload is missing an expected field.
2963 except (ValueError,KeyError), err:
2964 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2968 self._downloader.process_info(info)
2969 except UnavailableVideoError, err:
2970 self._downloader.trouble(u'\nERROR: unable to download video')
2973 class MyVideoIE(InfoExtractor):
2974 """Information Extractor for myvideo.de."""
2976 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2978 def __init__(self, downloader=None):
2979 InfoExtractor.__init__(self, downloader)
2983 return (re.match(MyVideoIE._VALID_URL, url) is not None)
2985 def report_download_webpage(self, video_id):
2986 """Report webpage download."""
2987 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2989 def report_extraction(self, video_id):
2990 """Report information extraction."""
2991 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2993 def _real_initialize(self):
2996 def _real_extract(self,url):
2997 mobj = re.match(self._VALID_URL, url)
2999 self._download.trouble(u'ERROR: invalid URL: %s' % url)
3002 video_id = mobj.group(1)
3003 simple_title = mobj.group(2).decode('utf-8')
3004 # should actually not be necessary
3005 simple_title = sanitize_title(simple_title)
3006 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', simple_title)
3009 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3011 self.report_download_webpage(video_id)
3012 webpage = urllib2.urlopen(request).read()
3013 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3014 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3017 self.report_extraction(video_id)
3018 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3021 self._downloader.trouble(u'ERROR: unable to extract media URL')
3023 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3025 mobj = re.search('<title>([^<]+)</title>', webpage)
3027 self._downloader.trouble(u'ERROR: unable to extract title')
3030 video_title = mobj.group(1)
3031 video_title = sanitize_title(video_title)
3035 self._downloader.process_info({
3039 'upload_date': u'NA',
3040 'title': video_title,
3041 'stitle': simple_title,
3046 except UnavailableVideoError:
3047 self._downloader.trouble(u'\nERROR: Unable to download video')
3049 class ComedyCentralIE(InfoExtractor):
3050 """Information extractor for The Daily Show and Colbert Report """
3052 _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
3056 return (re.match(ComedyCentralIE._VALID_URL, url) is not None)
3058 def report_extraction(self, episode_id):
3059 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
3061 def report_config_download(self, episode_id):
3062 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
3064 def report_index_download(self, episode_id):
3065 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
3067 def report_player_url(self, episode_id):
3068 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
3070 def _simplify_title(self, title):
3071 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
3072 res = res.strip(ur'_')
# Resolve a Daily Show / Colbert Report URL and download its episode
# segments.  NOTE(review): this listing appears elided -- several
# control-flow lines (try:/return/else:) are not visible; the comments
# below describe only the code shown.
3075 def _real_extract(self, url):
3076 mobj = re.match(self._VALID_URL, url)
# Invalid URL: report the problem to the downloader.
3078 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# A short alias such as ':tds' or ':colbert' is rewritten to the show's
# full-episodes index URL, then re-matched so the named groups line up.
3081 if mobj.group('shortname'):
3082 if mobj.group('shortname') in ('tds', 'thedailyshow'):
3083 url = 'http://www.thedailyshow.com/full-episodes/'
3085 url = 'http://www.colbertnation.com/full-episodes/'
3086 mobj = re.match(self._VALID_URL, url)
3087 assert mobj is not None
# No explicit episode in the URL => download the newest episode.
3089 dlNewest = not mobj.group('episode')
3091 epTitle = mobj.group('showname')
3093 epTitle = mobj.group('episode')
# Fetch the episode HTML page.
3095 req = urllib2.Request(url)
3096 self.report_extraction(epTitle)
3098 htmlHandle = urllib2.urlopen(req)
3099 html = htmlHandle.read()
3100 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3101 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
# Redirects may have landed us on a concrete episode URL; re-parse it
# to pick up the episode name.
3104 url = htmlHandle.geturl()
3105 mobj = re.match(self._VALID_URL, url)
3107 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
3109 if mobj.group('episode') == '':
3110 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
3112 epTitle = mobj.group('episode')
# The Flash <param name="movie"> embed carries both the player URL
# (group 0) and the mtvnservices media URI (group 1).
3114 mMovieParams = re.findall('<param name="movie" value="(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"/>', html)
3115 if len(mMovieParams) == 0:
3116 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
# Resolve the player URL through its redirect chain.
3119 playerUrl_raw = mMovieParams[0][0]
3120 self.report_player_url(epTitle)
3122 urlHandle = urllib2.urlopen(playerUrl_raw)
3123 playerUrl = urlHandle.geturl()
3124 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3125 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
# Download the MRSS index feed that lists the episode's video segments.
3128 uri = mMovieParams[0][1]
3129 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
3130 self.report_index_download(epTitle)
3132 indexXml = urllib2.urlopen(indexUrl).read()
3133 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3134 self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
# One <item> per video segment; the guid encodes show id and media id.
3137 idoc = xml.etree.ElementTree.fromstring(indexXml)
3138 itemEls = idoc.findall('.//item')
3139 for itemEl in itemEls:
3140 mediaId = itemEl.findall('./guid')[0].text
3141 shortMediaId = mediaId.split(':')[-1]
3142 showId = mediaId.split(':')[-2].replace('.com', '')
3143 officialTitle = itemEl.findall('./title')[0].text
3144 officialDate = itemEl.findall('./pubDate')[0].text
# Per-segment config XML lists the available renditions (bitrate, src).
3146 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
3147 urllib.urlencode({'uri': mediaId}))
3148 configReq = urllib2.Request(configUrl)
3149 self.report_config_download(epTitle)
3151 configXml = urllib2.urlopen(configReq).read()
3152 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3153 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
3156 cdoc = xml.etree.ElementTree.fromstring(configXml)
3158 for rendition in cdoc.findall('.//rendition'):
3159 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
3163 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
3166 # For now, just pick the highest bitrate
3167 format,video_url = turls[-1]
3169 self._downloader.increment_downloads()
# Compose the info dict that is handed to process_info() below.
3171 effTitle = showId + '-' + epTitle
3176 'upload_date': officialDate,
3178 'stitle': self._simplify_title(effTitle),
3182 'description': officialTitle,
3183 'player_url': playerUrl
3187 self._downloader.process_info(info)
3188 except UnavailableVideoError, err:
3189 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
# Extractor for videos on escapistmagazine.com.
# NOTE(review): this listing appears elided -- several control-flow lines
# (try:/return/@staticmethod decorators) are not visible; comments
# describe only the code shown.
3193 class EscapistIE(InfoExtractor):
3194 """Information extractor for The Escapist """
# URL must name both the show and the episode.
3196 _VALID_URL = r'^(https?://)?(www\.)escapistmagazine.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?].*$'
3200 return (re.match(EscapistIE._VALID_URL, url) is not None)
3202 def report_extraction(self, showName):
3203 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3205 def report_config_download(self, showName):
3206 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
# Filesystem-safe title: collapse chars outside simple_title_chars to '_'.
3208 def _simplify_title(self, title):
3209 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
3210 res = res.strip(ur'_')
3213 def _real_extract(self, url):
3214 htmlParser = HTMLParser.HTMLParser()
3216 mobj = re.match(self._VALID_URL, url)
3218 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3220 showName = mobj.group('showname')
3221 videoId = mobj.group('episode')
# Fetch the video page HTML.
3223 self.report_extraction(showName)
3225 webPage = urllib2.urlopen(url).read()
3226 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3227 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
# Scrape description, thumbnail and player URL from <meta> tags.
3230 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3231 description = htmlParser.unescape(descMatch.group(1))
3232 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3233 imgUrl = htmlParser.unescape(imgMatch.group(1))
3234 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3235 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
# The player URL carries the config-file location in its query string.
3236 configUrlMatch = re.search('config=(.*)$', playerUrl)
3237 configUrl = urllib2.unquote(configUrlMatch.group(1))
3239 self.report_config_download(showName)
3241 configJSON = urllib2.urlopen(configUrl).read()
3242 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3243 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3246 # Technically, it's JavaScript, not JSON
# Crude single->double quote fixup so json.loads can parse it.
3247 configJSON = configJSON.replace("'", '"')
3250 config = json.loads(configJSON)
3251 except (ValueError,), err:
3252 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
# The second playlist entry holds the actual video URL.
3255 playlist = config['playlist']
3256 videoUrl = playlist[1]['url']
3258 self._downloader.increment_downloads()
3262 'uploader': showName,
3263 'upload_date': None,
3265 'stitle': self._simplify_title(showName),
3268 'thumbnail': imgUrl,
3269 'description': description,
3270 'player_url': playerUrl,
3274 self._downloader.process_info(info)
3275 except UnavailableVideoError, err:
3276 self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
class PostProcessor(object):
    """Base class for post-download processing steps.

    Instances are registered on a downloader with its
    add_post_processor() method ("mutual registration", as with
    InfoExtractor objects).  After each successful download the
    downloader calls run() on every registered PostProcessor in turn,
    feeding each one the dictionary returned by its predecessor; the
    chain stops as soon as a processor returns None or the end of the
    chain is reached.
    """

    def __init__(self, downloader=None):
        self._downloader = downloader

    def set_downloader(self, downloader):
        """Attach the downloader this post-processor works for."""
        self._downloader = downloader

    def run(self, information):
        """Process one downloaded file.

        ``information`` is an InfoExtractor-style dictionary carrying an
        extra 'filepath' key that names the downloaded file.  Returning
        None stops the postprocessing chain; returning a (possibly
        modified) dictionary passes it on to the next processor in the
        chain.  A PostProcessingError exception may be raised to signal
        failure to the downloader.
        """
        return information  # default behaviour: pass through untouched
# Post-processor that converts a downloaded video into an audio-only file
# using the external ffmpeg/ffprobe tools.
# NOTE(review): listing appears elided (try:/return/else lines missing);
# comments describe only the visible code.
3327 class FFmpegExtractAudioPP(PostProcessor):
3329 def __init__(self, downloader=None, preferredcodec=None):
3330 PostProcessor.__init__(self, downloader)
# 'best' keeps the source codec when possible; otherwise 'aac' or 'mp3'.
3331 if preferredcodec is None:
3332 preferredcodec = 'best'
3333 self._preferredcodec = preferredcodec
# Probe the file's audio codec via `ffprobe -show_streams`.
3336 def get_audio_codec(path):
3338 cmd = ['ffprobe', '-show_streams', '--', path]
3339 handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
3340 output = handle.communicate()[0]
3341 if handle.wait() != 0:
3343 except (IOError, OSError):
# In ffprobe output a stream's codec_name= line precedes its
# codec_type= line, so remember the last codec_name seen.
3346 for line in output.split('\n'):
3347 if line.startswith('codec_name='):
3348 audio_codec = line.split('=')[1].strip()
3349 elif line.strip() == 'codec_type=audio' and audio_codec is not None:
# Run ffmpeg to extract/convert the audio stream into out_path.
3354 def run_ffmpeg(path, out_path, codec, more_opts):
3356 cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
3357 ret = subprocess.call(cmd, stdout=file(os.path.devnull, 'w'), stderr=subprocess.STDOUT)
3359 except (IOError, OSError):
3362 def run(self, information):
3363 path = information['filepath']
3365 filecodec = self.get_audio_codec(path)
3366 if filecodec is None:
3367 self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
# Copy the stream (lossless) when the source codec already matches the
# preference, or 'best' was requested on an aac/mp3 source.
3371 if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
3372 if filecodec == 'aac' or filecodec == 'mp3':
3373 # Lossless if possible
3375 extension = filecodec
# Raw AAC needs an ADTS container.
3376 if filecodec == 'aac':
3377 more_opts = ['-f', 'adts']
3380 acodec = 'libmp3lame'
3382 more_opts = ['-ab', '128k']
3384 # We convert the audio (lossy)
3385 acodec = {'mp3': 'libmp3lame', 'aac': 'aac'}[self._preferredcodec]
3386 extension = self._preferredcodec
3387 more_opts = ['-ab', '128k']
3388 if self._preferredcodec == 'aac':
3389 more_opts += ['-f', 'adts']
# Output goes next to the source with the new extension.
3391 (prefix, ext) = os.path.splitext(path)
3392 new_path = prefix + '.' + extension
3393 self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
3394 status = self.run_ffmpeg(path, new_path, acodec, more_opts)
3397 self._downloader.to_stderr(u'WARNING: error running ffmpeg')
3402 except (IOError, OSError):
3403 self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
# Hand the updated filepath on to the next post-processor.
3406 information['filepath'] = new_path
# Self-update: replace this script's file with the latest version fetched
# from UPDATE_URL.  NOTE(review): listing appears elided (try:/close()
# lines missing); comments describe only the visible code.
3410 def updateSelf(downloader, filename):
3411 ''' Update the program file with the latest version from the repository '''
3412 # Note: downloader only used for options
# Refuse to run when we cannot write the target file.
3413 if not os.access(filename, os.W_OK):
3414 sys.exit('ERROR: no write permissions on %s' % filename)
3416 downloader.to_screen('Updating to latest version...')
# Fetch the new script body.
3420 urlh = urllib.urlopen(UPDATE_URL)
3421 newcontent = urlh.read()
3424 except (IOError, OSError), err:
3425 sys.exit('ERROR: unable to download latest version')
# Overwrite ourselves in binary mode.
3428 outf = open(filename, 'wb')
3430 outf.write(newcontent)
3433 except (IOError, OSError), err:
3434 sys.exit('ERROR: unable to overwrite current version')
3436 downloader.to_screen('Updated youtube-dl. Restart to use the new version.')
# Render an option's flags for --help output, e.g. "-o, --output TEMPLATE".
# NOTE(review): relies on optparse internals (_short_opts/_long_opts); the
# line initializing `opts` appears elided from this listing.
3443 def _format_option_string(option):
3444 ''' ('-o', '--option') -> -o, --format METAVAR'''
3448 if option._short_opts: opts.append(option._short_opts[0])
3449 if option._long_opts: opts.append(option._long_opts[0])
# With both a short and a long form, separate them with ", ".
3450 if len(opts) > 1: opts.insert(1, ', ')
3452 if option.takes_value(): opts.append(' %s' % option.metavar)
3454 return "".join(opts)
# Best-effort terminal width: check $COLUMNS first, then fall back to
# parsing the output of `stty size` ("rows cols").
# NOTE(review): try/except and fallback returns appear elided here.
3456 def _find_term_columns():
3457 columns = os.environ.get('COLUMNS', None)
3462 sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
3463 out,err = sp.communicate()
# Second field of `stty size` output is the column count.
3464 return int(out.split()[1])
# Body of parseOpts(): build the optparse parser, all option groups, and
# return (parser, opts, args).  NOTE(review): the enclosing `def parseOpts`
# line and a few statements appear elided from this listing.
3470 max_help_position = 80
3472 # No need to wrap help messages if we're on a wide console
3473 columns = _find_term_columns()
3474 if columns: max_width = columns
# Custom formatter so flags render via _format_option_string above.
3476 fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
3477 fmt.format_option_strings = _format_option_string
3480 'version' : __version__,
3482 'usage' : '%prog [options] url [url...]',
3483 'conflict_handler' : 'resolve',
3486 parser = optparse.OptionParser(**kw)
# One OptionGroup per help-text section.
3489 general = optparse.OptionGroup(parser, 'General Options')
3490 authentication = optparse.OptionGroup(parser, 'Authentication Options')
3491 video_format = optparse.OptionGroup(parser, 'Video Format Options')
3492 postproc = optparse.OptionGroup(parser, 'Post-processing Options')
3493 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
3494 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
3496 general.add_option('-h', '--help',
3497 action='help', help='print this help text and exit')
3498 general.add_option('-v', '--version',
3499 action='version', help='print program version and exit')
3500 general.add_option('-U', '--update',
3501 action='store_true', dest='update_self', help='update this program to latest version')
3502 general.add_option('-i', '--ignore-errors',
3503 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
3504 general.add_option('-r', '--rate-limit',
3505 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
3506 general.add_option('-R', '--retries',
3507 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
3508 general.add_option('--playlist-start',
3509 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
3510 general.add_option('--playlist-end',
3511 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
3512 general.add_option('--dump-user-agent',
3513 action='store_true', dest='dump_user_agent',
3514 help='display the current browser identification', default=False)
3516 authentication.add_option('-u', '--username',
3517 dest='username', metavar='USERNAME', help='account username')
3518 authentication.add_option('-p', '--password',
3519 dest='password', metavar='PASSWORD', help='account password')
3520 authentication.add_option('-n', '--netrc',
3521 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
3524 video_format.add_option('-f', '--format',
3525 action='store', dest='format', metavar='FORMAT', help='video format code')
3526 video_format.add_option('--all-formats',
3527 action='store_const', dest='format', help='download all available video formats', const='-1')
3528 video_format.add_option('--max-quality',
3529 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
3532 verbosity.add_option('-q', '--quiet',
3533 action='store_true', dest='quiet', help='activates quiet mode', default=False)
3534 verbosity.add_option('-s', '--simulate',
3535 action='store_true', dest='simulate', help='do not download video', default=False)
3536 verbosity.add_option('-g', '--get-url',
3537 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
3538 verbosity.add_option('-e', '--get-title',
3539 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
3540 verbosity.add_option('--get-thumbnail',
3541 action='store_true', dest='getthumbnail',
3542 help='simulate, quiet but print thumbnail URL', default=False)
3543 verbosity.add_option('--get-description',
3544 action='store_true', dest='getdescription',
3545 help='simulate, quiet but print video description', default=False)
3546 verbosity.add_option('--get-filename',
3547 action='store_true', dest='getfilename',
3548 help='simulate, quiet but print output filename', default=False)
3549 verbosity.add_option('--no-progress',
3550 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
3551 verbosity.add_option('--console-title',
3552 action='store_true', dest='consoletitle',
3553 help='display progress in console titlebar', default=False)
3556 filesystem.add_option('-t', '--title',
3557 action='store_true', dest='usetitle', help='use title in file name', default=False)
3558 filesystem.add_option('-l', '--literal',
3559 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
3560 filesystem.add_option('-A', '--auto-number',
3561 action='store_true', dest='autonumber',
3562 help='number downloaded files starting from 00000', default=False)
3563 filesystem.add_option('-o', '--output',
3564 dest='outtmpl', metavar='TEMPLATE', help='output filename template')
3565 filesystem.add_option('-a', '--batch-file',
3566 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
3567 filesystem.add_option('-w', '--no-overwrites',
3568 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
3569 filesystem.add_option('-c', '--continue',
3570 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
3571 filesystem.add_option('--cookies',
3572 dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
3573 filesystem.add_option('--no-part',
3574 action='store_true', dest='nopart', help='do not use .part files', default=False)
3575 filesystem.add_option('--no-mtime',
3576 action='store_false', dest='updatetime',
3577 help='do not use the Last-modified header to set the file modification time', default=True)
3578 filesystem.add_option('--write-description',
3579 action='store_true', dest='writedescription',
3580 help='write video description to a .description file', default=False)
3581 filesystem.add_option('--write-info-json',
3582 action='store_true', dest='writeinfojson',
3583 help='write video metadata to a .info.json file', default=False)
3586 postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
3587 help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
3588 postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
3589 help='"best", "aac" or "mp3"; best by default')
# Group registration order controls --help section order.
3592 parser.add_option_group(general)
3593 parser.add_option_group(filesystem)
3594 parser.add_option_group(verbosity)
3595 parser.add_option_group(video_format)
3596 parser.add_option_group(authentication)
3597 parser.add_option_group(postproc)
3599 opts, args = parser.parse_args()
3601 return parser, opts, args
# Body of main(): parse options, validate them, wire up extractors and the
# FileDownloader, then download every requested URL.
# NOTE(review): the enclosing `def` line, try:/else: lines and several
# statements appear elided from this listing; comments describe only the
# visible code.
3604 parser, opts, args = parseOpts()
3606 # Open appropriate CookieJar
3607 if opts.cookiefile is None:
3608 jar = cookielib.CookieJar()
# A cookie file was given: use a Mozilla-format jar and preload it if
# the file is readable.
3611 jar = cookielib.MozillaCookieJar(opts.cookiefile)
3612 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
3614 except (IOError, OSError), err:
3615 sys.exit(u'ERROR: unable to open cookie file')
3618 if opts.dump_user_agent:
3619 print std_headers['User-Agent']
3622 # General configuration
3623 cookie_processor = urllib2.HTTPCookieProcessor(jar)
3624 opener = urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler())
3625 urllib2.install_opener(opener)
3626 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
3628 # Batch file verification
3630 if opts.batchfile is not None:
3632 if opts.batchfile == '-':
3635 batchfd = open(opts.batchfile, 'r')
3636 batchurls = batchfd.readlines()
3637 batchurls = [x.strip() for x in batchurls]
# Skip blank lines and lines starting with '#', '/' or ';' (comments).
3638 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
3640 sys.exit(u'ERROR: batch file could not be read')
3641 all_urls = batchurls + args
3643 # Conflicting, missing and erroneous options
3644 if opts.usenetrc and (opts.username is not None or opts.password is not None):
3645 parser.error(u'using .netrc conflicts with giving username/password')
3646 if opts.password is not None and opts.username is None:
3647 parser.error(u'account username missing')
3648 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
3649 parser.error(u'using output template conflicts with using title, literal title or auto number')
3650 if opts.usetitle and opts.useliteral:
3651 parser.error(u'using title conflicts with using literal title')
# Username without password: prompt interactively.
3652 if opts.username is not None and opts.password is None:
3653 opts.password = getpass.getpass(u'Type account password and press return:')
3654 if opts.ratelimit is not None:
3655 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
3656 if numeric_limit is None:
3657 parser.error(u'invalid rate limit specified')
3658 opts.ratelimit = numeric_limit
3659 if opts.retries is not None:
3661 opts.retries = long(opts.retries)
3662 except (TypeError, ValueError), err:
3663 parser.error(u'invalid retry count specified')
3665 opts.playliststart = int(opts.playliststart)
3666 if opts.playliststart <= 0:
3667 raise ValueError(u'Playlist start must be positive')
3668 except (TypeError, ValueError), err:
3669 parser.error(u'invalid playlist start number specified')
3671 opts.playlistend = int(opts.playlistend)
# -1 means "until the last video"; otherwise end must be >= start.
3672 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
3673 raise ValueError(u'Playlist end must be greater than playlist start')
3674 except (TypeError, ValueError), err:
3675 parser.error(u'invalid playlist end number specified')
3676 if opts.extractaudio:
3677 if opts.audioformat not in ['best', 'aac', 'mp3']:
3678 parser.error(u'invalid audio format specified')
3680 # Information extractors
3681 youtube_ie = YoutubeIE()
3682 google_ie = GoogleIE()
3683 yahoo_ie = YahooIE()
3684 extractors = [ # Order does matter
3686 MetacafeIE(youtube_ie),
3688 YoutubePlaylistIE(youtube_ie),
3689 YoutubeUserIE(youtube_ie),
3690 YoutubeSearchIE(youtube_ie),
3692 GoogleSearchIE(google_ie),
3695 YahooSearchIE(yahoo_ie),
# Translate the parsed options into FileDownloader parameters.
3708 fd = FileDownloader({
3709 'usenetrc': opts.usenetrc,
3710 'username': opts.username,
3711 'password': opts.password,
3712 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
3713 'forceurl': opts.geturl,
3714 'forcetitle': opts.gettitle,
3715 'forcethumbnail': opts.getthumbnail,
3716 'forcedescription': opts.getdescription,
3717 'forcefilename': opts.getfilename,
3718 'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
3719 'format': opts.format,
3720 'format_limit': opts.format_limit,
# First matching template wins: explicit -o, then format/title/number
# combinations, finally the bare '%(id)s.%(ext)s' default.
3721 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
3722 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
3723 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
3724 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
3725 or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
3726 or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
3727 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
3728 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
3729 or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
3730 or u'%(id)s.%(ext)s'),
3731 'ignoreerrors': opts.ignoreerrors,
3732 'ratelimit': opts.ratelimit,
3733 'nooverwrites': opts.nooverwrites,
3734 'retries': opts.retries,
3735 'continuedl': opts.continue_dl,
3736 'noprogress': opts.noprogress,
3737 'playliststart': opts.playliststart,
3738 'playlistend': opts.playlistend,
3739 'logtostderr': opts.outtmpl == '-',
3740 'consoletitle': opts.consoletitle,
3741 'nopart': opts.nopart,
3742 'updatetime': opts.updatetime,
3743 'writedescription': opts.writedescription,
3744 'writeinfojson': opts.writeinfojson,
3746 for extractor in extractors:
3747 fd.add_info_extractor(extractor)
3750 if opts.extractaudio:
3751 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat))
# --update replaces this very script with the latest released version.
3754 if opts.update_self:
3755 updateSelf(fd, sys.argv[0])
3758 if len(all_urls) < 1:
3759 if not opts.update_self:
3760 parser.error(u'you must provide at least one URL')
3763 retcode = fd.download(all_urls)
3765 # Dump cookie jar if requested
3766 if opts.cookiefile is not None:
3769 except (IOError, OSError), err:
3770 sys.exit(u'ERROR: unable to save cookie jar')
# Script entry point: run the program and translate expected exceptions
# into user-friendly exit messages.  NOTE(review): the try:/main() lines
# appear elided from this listing.
3775 if __name__ == '__main__':
3778 except DownloadError:
3780 except SameFileError:
3781 sys.exit(u'ERROR: fixed output name but more than one file to download')
3782 except KeyboardInterrupt:
3783 sys.exit(u'\nERROR: Interrupted by user')
3785 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: