2 # -*- coding: utf-8 -*-
4 from __future__ import unicode_literals
33 import xml.etree.ElementTree
40 compat_etree_fromstring,
46 compat_socket_create_connection,
51 compat_urllib_parse_urlencode,
52 compat_urllib_parse_urlparse,
53 compat_urllib_parse_unquote_plus,
54 compat_urllib_request,
def register_socks_protocols():
    """Teach urlparse that SOCKS schemes carry a netloc component.

    In Python < 2.6.5, urlsplit() suffers from bug
    https://bugs.python.org/issue7904: URLs whose scheme is not listed in
    urlparse.uses_netloc are not handled correctly.
    """
    known_schemes = compat_urlparse.uses_netloc
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in known_schemes:
            known_schemes.append(scheme)
# Type of a compiled regular expression pattern (re.Pattern on Python 3.7+).
# This is not clearly defined otherwise, so derive it from an instance.
compiled_regex_type = type(re.compile(''))
78 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/44.0 (Chrome)',
79 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
80 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
81 'Accept-Encoding': 'gzip, deflate',
82 'Accept-Language': 'en-us,en;q=0.5',
# English month names in calendar order; presumably used when parsing
# free-form English dates elsewhere in this module — confirm at call sites
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']
93 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
94 'flv', 'f4v', 'f4a', 'f4b',
95 'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
105 'f4f', 'f4m', 'm3u8', 'smil')
# needed for sanitizing filenames in restricted mode
# Maps each accented character to an ASCII replacement. itertools.chain
# interleaves runs of one-to-one replacements with multi-character ones
# ('AE', 'OE', 'ss', ...), wrapped in lists so chain yields them whole.
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØŒÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøœùúûüýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOO', ['OE'], 'UUUUYP', ['ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionoooooo', ['oe'], 'uuuuypy')))
113 def preferredencoding():
114 """Get preferred encoding.
116 Returns the best encoding scheme for the system, based on
117 locale.getpreferredencoding() and some further tweaks.
120 pref = locale.getpreferredencoding()
128 def write_json_file(obj, fn):
129 """ Encode obj as JSON and write it to fn, atomically if possible """
131 fn = encodeFilename(fn)
132 if sys.version_info < (3, 0) and sys.platform != 'win32':
133 encoding = get_filesystem_encoding()
134 # os.path.basename returns a bytes object, but NamedTemporaryFile
135 # will fail if the filename contains non ascii characters unless we
136 # use a unicode object
137 path_basename = lambda f: os.path.basename(fn).decode(encoding)
138 # the same for os.path.dirname
139 path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
141 path_basename = os.path.basename
142 path_dirname = os.path.dirname
146 'prefix': path_basename(fn) + '.',
147 'dir': path_dirname(fn),
151 # In Python 2.x, json.dump expects a bytestream.
152 # In Python 3.x, it writes to a character stream
153 if sys.version_info < (3, 0):
161 tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
166 if sys.platform == 'win32':
167 # Need to remove existing file on Windows, else os.rename raises
168 # WindowsError or FileExistsError.
173 os.rename(tf.name, fn)
182 if sys.version_info >= (2, 7):
183 def find_xpath_attr(node, xpath, key, val=None):
184 """ Find the xpath xpath[@key=val] """
185 assert re.match(r'^[a-zA-Z_-]+$', key)
186 expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
187 return node.find(expr)
189 def find_xpath_attr(node, xpath, key, val=None):
190 for f in node.findall(compat_xpath(xpath)):
191 if key not in f.attrib:
193 if val is None or f.attrib.get(key) == val:
197 # On python2.6 the xml.etree.ElementTree.Element methods don't support
198 # the namespace parameter
201 def xpath_with_ns(path, ns_map):
202 components = [c.split(':') for c in path.split('/')]
206 replaced.append(c[0])
209 replaced.append('{%s}%s' % (ns_map[ns], tag))
210 return '/'.join(replaced)
213 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
214 def _find_xpath(xpath):
215 return node.find(compat_xpath(xpath))
217 if isinstance(xpath, (str, compat_str)):
218 n = _find_xpath(xpath)
226 if default is not NO_DEFAULT:
229 name = xpath if name is None else name
230 raise ExtractorError('Could not find XML element %s' % name)
236 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
237 n = xpath_element(node, xpath, name, fatal=fatal, default=default)
238 if n is None or n == default:
241 if default is not NO_DEFAULT:
244 name = xpath if name is None else name
245 raise ExtractorError('Could not find XML element\'s text %s' % name)
251 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
252 n = find_xpath_attr(node, xpath, key)
254 if default is not NO_DEFAULT:
257 name = '%s[@%s]' % (xpath, key) if name is None else name
258 raise ExtractorError('Could not find XML attribute %s' % name)
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # NB the parameter name shadows the builtin id(); kept as-is for
    # backward compatibility with keyword callers.
    return get_element_by_attribute('id', id, html)
269 def get_element_by_attribute(attribute, value, html):
270 """Return the content of the tag with the specified attribute in the passed HTML document"""
272 m = re.search(r'''(?xs)
274 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
276 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
280 ''' % (re.escape(attribute), re.escape(value)), html)
284 res = m.group('content')
286 if res.startswith('"') or res.startswith("'"):
289 return unescapeHTML(res)
292 class HTMLAttributeParser(compat_HTMLParser):
293 """Trivial HTML parser to gather the attributes for a single element"""
296 compat_HTMLParser.__init__(self)
    def handle_starttag(self, tag, attrs):
        # attrs arrives from HTMLParser as a list of (name, value) pairs;
        # store them as a dict for easy lookup by the caller
        self.attrs = dict(attrs)
302 def extract_attributes(html_element):
303 """Given a string for an HTML element such as
305 a="foo" B="bar" c="&98;az" d=boz
306 empty= noval entity="&"
309 Decode and return a dictionary of attributes.
311 'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
312 'empty': '', 'noval': None, 'entity': '&',
313 'sq': '"', 'dq': '\''
315 NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
316 but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
318 parser = HTMLAttributeParser()
319 parser.feed(html_element)
324 def clean_html(html):
325 """Clean an HTML snippet into a readable string"""
327 if html is None: # Convenience for sanitizing descriptions etc.
331 html = html.replace('\n', ' ')
332 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
333 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
335 html = re.sub('<.*?>', '', html)
336 # Replace html entities
337 html = unescapeHTML(html)
341 def sanitize_open(filename, open_mode):
342 """Try to open the given filename, and slightly tweak it if this fails.
344 Attempts to open the given filename. If this fails, it tries to change
345 the filename slightly, step by step, until it's either able to open it
346 or it fails and raises a final exception, like the standard open()
349 It returns the tuple (stream, definitive_file_name).
353 if sys.platform == 'win32':
355 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
356 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
357 stream = open(encodeFilename(filename), open_mode)
358 return (stream, filename)
359 except (IOError, OSError) as err:
360 if err.errno in (errno.EACCES,):
363 # In case of error, try to remove win32 forbidden chars
364 alt_filename = sanitize_path(filename)
365 if alt_filename == filename:
368 # An exception here should be caught in the caller
369 stream = open(encodeFilename(alt_filename), open_mode)
370 return (stream, alt_filename)
373 def timeconvert(timestr):
374 """Convert RFC 2822 defined time string into system timestamp"""
376 timetuple = email.utils.parsedate_tz(timestr)
377 if timetuple is not None:
378 timestamp = email.utils.mktime_tz(timetuple)
382 def sanitize_filename(s, restricted=False, is_id=False):
383 """Sanitizes a string so it could be used as part of a filename.
384 If restricted is set, use a stricter subset of allowed characters.
385 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
387 def replace_insane(char):
388 if restricted and char in ACCENT_CHARS:
389 return ACCENT_CHARS[char]
390 if char == '?' or ord(char) < 32 or ord(char) == 127:
393 return '' if restricted else '\''
395 return '_-' if restricted else ' -'
396 elif char in '\\/|*<>':
398 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
400 if restricted and ord(char) > 127:
405 s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
406 result = ''.join(map(replace_insane, s))
408 while '__' in result:
409 result = result.replace('__', '_')
410 result = result.strip('_')
411 # Common case of "Foreign band name - English song title"
412 if restricted and result.startswith('-_'):
414 if result.startswith('-'):
415 result = '_' + result[len('-'):]
416 result = result.lstrip('.')
422 def sanitize_path(s):
423 """Sanitizes and normalizes path on Windows"""
424 if sys.platform != 'win32':
426 drive_or_unc, _ = os.path.splitdrive(s)
427 if sys.version_info < (2, 7) and not drive_or_unc:
428 drive_or_unc, _ = os.path.splitunc(s)
429 norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
433 path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|[\s.]$)', '#', path_part)
434 for path_part in norm_path]
436 sanitized_path.insert(0, drive_or_unc + os.path.sep)
437 return os.path.join(*sanitized_path)
440 # Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
441 # unwanted failures due to missing protocol
def sanitize_url(url):
    """Prepend the 'http:' scheme to protocol-relative URLs (//host/path).

    Mitigates unwanted failures caused by URLs that lack an explicit
    protocol; all other URLs are returned unchanged.
    """
    if url.startswith('//'):
        return 'http:%s' % url
    return url
def sanitized_Request(url, *args, **kwargs):
    """Build a compat_urllib_request.Request after sanitizing the URL."""
    clean_url = sanitize_url(url)
    return compat_urllib_request.Request(clean_url, *args, **kwargs)
450 def orderedSet(iterable):
451 """ Remove all duplicates from the input iterable """
459 def _htmlentity_transform(entity):
460 """Transforms an HTML entity to a character."""
461 # Known non-numeric HTML entity
462 if entity in compat_html_entities.name2codepoint:
463 return compat_chr(compat_html_entities.name2codepoint[entity])
465 mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
467 numstr = mobj.group(1)
468 if numstr.startswith('x'):
470 numstr = '0%s' % numstr
473 # See https://github.com/rg3/youtube-dl/issues/7518
475 return compat_chr(int(numstr, base))
479 # Unknown entity in name, return its literal representation
480 return '&%s;' % entity
486 assert type(s) == compat_str
489 r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
492 def get_subprocess_encoding():
493 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
494 # For subprocess calls, encode with locale encoding
495 # Refer to http://stackoverflow.com/a/9951851/35070
496 encoding = preferredencoding()
498 encoding = sys.getfilesystemencoding()
504 def encodeFilename(s, for_subprocess=False):
506 @param s The name of the file
509 assert type(s) == compat_str
511 # Python 3 has a Unicode API
512 if sys.version_info >= (3, 0):
515 # Pass '' directly to use Unicode APIs on Windows 2000 and up
516 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
517 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
518 if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
521 # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
522 if sys.platform.startswith('java'):
525 return s.encode(get_subprocess_encoding(), 'ignore')
528 def decodeFilename(b, for_subprocess=False):
530 if sys.version_info >= (3, 0):
533 if not isinstance(b, bytes):
536 return b.decode(get_subprocess_encoding(), 'ignore')
def encodeArgument(s):
    """Encode a value for use as a subprocess argument.

    Byte strings from legacy call sites are first promoted to text via
    ASCII decoding, then encoded like a filename (for_subprocess=True).
    """
    if not isinstance(s, compat_str):
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        s = s.decode('ascii')
    return encodeFilename(s, True)
def decodeArgument(b):
    """Decode a subprocess argument; mirror of encodeArgument (for_subprocess=True)."""
    return decodeFilename(b, True)
552 def decodeOption(optval):
555 if isinstance(optval, bytes):
556 optval = optval.decode(preferredencoding())
558 assert isinstance(optval, compat_str)
562 def formatSeconds(secs):
564 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
566 return '%d:%02d' % (secs // 60, secs % 60)
571 def make_HTTPS_handler(params, **kwargs):
572 opts_no_check_certificate = params.get('nocheckcertificate', False)
573 if hasattr(ssl, 'create_default_context'): # Python >= 3.4 or 2.7.9
574 context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
575 if opts_no_check_certificate:
576 context.check_hostname = False
577 context.verify_mode = ssl.CERT_NONE
579 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
582 # (create_default_context present but HTTPSHandler has no context=)
585 if sys.version_info < (3, 2):
586 return YoutubeDLHTTPSHandler(params, **kwargs)
588 context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
589 context.verify_mode = (ssl.CERT_NONE
590 if opts_no_check_certificate
591 else ssl.CERT_REQUIRED)
592 context.set_default_verify_paths()
593 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
596 def bug_reports_message():
597 if ytdl_is_updateable():
598 update_cmd = 'type youtube-dl -U to update'
600 update_cmd = 'see https://yt-dl.org/update on how to update'
601 msg = '; please report this issue on https://yt-dl.org/bug .'
602 msg += ' Make sure you are using the latest version; %s.' % update_cmd
603 msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
607 class ExtractorError(Exception):
608 """Error during info extraction."""
610 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
611 """ tb, if given, is the original traceback (so that it can be printed out).
612 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
615 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
617 if video_id is not None:
618 msg = video_id + ': ' + msg
620 msg += ' (caused by %r)' % cause
622 msg += bug_reports_message()
623 super(ExtractorError, self).__init__(msg)
626 self.exc_info = sys.exc_info() # preserve original exception
628 self.video_id = video_id
630 def format_traceback(self):
631 if self.traceback is None:
633 return ''.join(traceback.format_tb(self.traceback))
class UnsupportedError(ExtractorError):
    """Raised when no extractor can handle the supplied URL."""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        # expected=True: this is a normal condition, not a youtube-dl bug
        super(UnsupportedError, self).__init__(message, expected=True)
643 class RegexNotFoundError(ExtractorError):
644 """Error when a regex didn't match"""
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        # kept so callers can inspect/re-raise the underlying cause
        self.exc_info = exc_info
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
671 class PostProcessingError(Exception):
672 """Post Processing exception.
674 This exception may be raised by PostProcessor's .run() method to
675 indicate an error in the postprocessing task.
678 def __init__(self, msg):
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached; raised to stop further downloads. """
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        # both sizes are in bytes
        self.downloaded = downloaded
        self.expected = expected
710 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
711 # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
712 # expected HTTP responses to meet HTTP/1.0 or later (see also
713 # https://github.com/rg3/youtube-dl/issues/6727)
714 if sys.version_info < (3, 0):
715 kwargs[b'strict'] = True
716 hc = http_class(*args, **kwargs)
717 source_address = ydl_handler._params.get('source_address')
718 if source_address is not None:
719 sa = (source_address, 0)
720 if hasattr(hc, 'source_address'): # Python 2.7+
721 hc.source_address = sa
723 def _hc_connect(self, *args, **kwargs):
724 sock = compat_socket_create_connection(
725 (self.host, self.port), self.timeout, sa)
727 self.sock = ssl.wrap_socket(
728 sock, self.key_file, self.cert_file,
729 ssl_version=ssl.PROTOCOL_TLSv1)
732 hc.connect = functools.partial(_hc_connect, hc)
def handle_youtubedl_headers(headers):
    """Process internal pseudo headers before a real HTTP request is made.

    If the internal marker header 'Youtubedl-no-compression' is present,
    return a copy of the headers with every Accept-Encoding header (any
    letter case) removed and the marker itself stripped. Otherwise the
    original mapping is returned untouched.
    """
    if 'Youtubedl-no-compression' not in headers:
        return headers
    # build a fresh dict so the caller's mapping is never mutated
    cleaned = dict((name, value) for name, value in headers.items()
                   if name.lower() != 'accept-encoding')
    del cleaned['Youtubedl-no-compression']
    return cleaned
747 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
748 """Handler for HTTP requests and responses.
750 This class, when installed with an OpenerDirector, automatically adds
751 the standard headers to every HTTP request and handles gzipped and
752 deflated responses from web servers. If compression is to be avoided in
753 a particular request, the original request in the program code only has
754 to include the HTTP header "Youtubedl-no-compression", which will be
755 removed before making the real request.
757 Part of this code was copied from:
759 http://techknack.net/python-urllib2-handlers/
761 Andrew Rowls, the author of that code, agreed to release it to the
765 def __init__(self, params, *args, **kwargs):
766 compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
767 self._params = params
769 def http_open(self, req):
770 conn_class = compat_http_client.HTTPConnection
772 socks_proxy = req.headers.get('Ytdl-socks-proxy')
774 conn_class = make_socks_conn_class(conn_class, socks_proxy)
775 del req.headers['Ytdl-socks-proxy']
777 return self.do_open(functools.partial(
778 _create_http_connection, self, conn_class, False),
784 return zlib.decompress(data, -zlib.MAX_WBITS)
786 return zlib.decompress(data)
789 def addinfourl_wrapper(stream, headers, url, code):
790 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
791 return compat_urllib_request.addinfourl(stream, headers, url, code)
792 ret = compat_urllib_request.addinfourl(stream, headers, url)
796 def http_request(self, req):
797 # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
798 # always respected by websites, some tend to give out URLs with non percent-encoded
799 # non-ASCII characters (see telemb.py, ard.py [#3412])
800 # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
801 # To work around aforementioned issue we will replace request's original URL with
802 # percent-encoded one
803 # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
804 # the code of this workaround has been moved here from YoutubeDL.urlopen()
805 url = req.get_full_url()
806 url_escaped = escape_url(url)
808 # Substitute URL if any change after escaping
809 if url != url_escaped:
810 req = update_Request(req, url=url_escaped)
812 for h, v in std_headers.items():
813 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
814 # The dict keys are capitalized because of this bug by urllib
815 if h.capitalize() not in req.headers:
818 req.headers = handle_youtubedl_headers(req.headers)
820 if sys.version_info < (2, 7) and '#' in req.get_full_url():
821 # Python 2.6 is brain-dead when it comes to fragments
822 req._Request__original = req._Request__original.partition('#')[0]
823 req._Request__r_type = req._Request__r_type.partition('#')[0]
827 def http_response(self, req, resp):
830 if resp.headers.get('Content-encoding', '') == 'gzip':
831 content = resp.read()
832 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
834 uncompressed = io.BytesIO(gz.read())
835 except IOError as original_ioerror:
836 # There may be junk add the end of the file
837 # See http://stackoverflow.com/q/4928560/35070 for details
838 for i in range(1, 1024):
840 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
841 uncompressed = io.BytesIO(gz.read())
846 raise original_ioerror
847 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
848 resp.msg = old_resp.msg
849 del resp.headers['Content-encoding']
851 if resp.headers.get('Content-encoding', '') == 'deflate':
852 gz = io.BytesIO(self.deflate(resp.read()))
853 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
854 resp.msg = old_resp.msg
855 del resp.headers['Content-encoding']
856 # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
857 # https://github.com/rg3/youtube-dl/issues/6457).
858 if 300 <= resp.code < 400:
859 location = resp.headers.get('Location')
861 # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
862 if sys.version_info >= (3, 0):
863 location = location.encode('iso-8859-1').decode('utf-8')
865 location = location.decode('utf-8')
866 location_escaped = escape_url(location)
867 if location != location_escaped:
868 del resp.headers['Location']
869 resp.headers['Location'] = location_escaped
872 https_request = http_request
873 https_response = http_response
876 def make_socks_conn_class(base_class, socks_proxy):
877 assert issubclass(base_class, (
878 compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))
880 url_components = compat_urlparse.urlparse(socks_proxy)
881 if url_components.scheme.lower() == 'socks5':
882 socks_type = ProxyType.SOCKS5
883 elif url_components.scheme.lower() in ('socks', 'socks4'):
884 socks_type = ProxyType.SOCKS4
885 elif url_components.scheme.lower() == 'socks4a':
886 socks_type = ProxyType.SOCKS4A
888 def unquote_if_non_empty(s):
891 return compat_urllib_parse_unquote_plus(s)
895 url_components.hostname, url_components.port or 1080,
897 unquote_if_non_empty(url_components.username),
898 unquote_if_non_empty(url_components.password),
901 class SocksConnection(base_class):
903 self.sock = sockssocket()
904 self.sock.setproxy(*proxy_args)
905 if type(self.timeout) in (int, float):
906 self.sock.settimeout(self.timeout)
907 self.sock.connect((self.host, self.port))
909 if isinstance(self, compat_http_client.HTTPSConnection):
910 if hasattr(self, '_context'): # Python > 2.6
911 self.sock = self._context.wrap_socket(
912 self.sock, server_hostname=self.host)
914 self.sock = ssl.wrap_socket(self.sock)
916 return SocksConnection
919 class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
920 def __init__(self, params, https_conn_class=None, *args, **kwargs):
921 compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
922 self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
923 self._params = params
925 def https_open(self, req):
927 conn_class = self._https_conn_class
929 if hasattr(self, '_context'): # python > 2.6
930 kwargs['context'] = self._context
931 if hasattr(self, '_check_hostname'): # python 3.x
932 kwargs['check_hostname'] = self._check_hostname
934 socks_proxy = req.headers.get('Ytdl-socks-proxy')
936 conn_class = make_socks_conn_class(conn_class, socks_proxy)
937 del req.headers['Ytdl-socks-proxy']
939 return self.do_open(functools.partial(
940 _create_http_connection, self, conn_class, True),
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """Cookie processor that also routes HTTPS through the HTTP hooks."""

    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/rg3/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    # reuse the HTTP hooks for HTTPS traffic as well
    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
968 def parse_iso8601(date_str, delimiter='T', timezone=None):
969 """ Return a UNIX timestamp from the given date """
974 date_str = re.sub(r'\.[0-9]+', '', date_str)
978 r'(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
981 timezone = datetime.timedelta()
983 date_str = date_str[:-len(m.group(0))]
984 if not m.group('sign'):
985 timezone = datetime.timedelta()
987 sign = 1 if m.group('sign') == '+' else -1
988 timezone = datetime.timedelta(
989 hours=sign * int(m.group('hours')),
990 minutes=sign * int(m.group('minutes')))
992 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
993 dt = datetime.datetime.strptime(date_str, date_format) - timezone
994 return calendar.timegm(dt.timetuple())
999 def unified_strdate(date_str, day_first=True):
1000 """Return a string with the date in the format YYYYMMDD"""
1002 if date_str is None:
1006 date_str = date_str.replace(',', ' ')
1007 # %z (UTC offset) is only supported in python>=3.2
1008 if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str):
1009 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
1010 # Remove AM/PM + timezone
1011 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1013 format_expressions = [
1024 '%Y/%m/%d %H:%M:%S',
1025 '%Y-%m-%d %H:%M:%S',
1026 '%Y-%m-%d %H:%M:%S.%f',
1029 '%Y-%m-%dT%H:%M:%SZ',
1030 '%Y-%m-%dT%H:%M:%S.%fZ',
1031 '%Y-%m-%dT%H:%M:%S.%f0Z',
1032 '%Y-%m-%dT%H:%M:%S',
1033 '%Y-%m-%dT%H:%M:%S.%f',
1037 format_expressions.extend([
1043 '%d/%m/%Y %H:%M:%S',
1046 format_expressions.extend([
1051 '%m/%d/%Y %H:%M:%S',
1053 for expression in format_expressions:
1055 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
1058 if upload_date is None:
1059 timetuple = email.utils.parsedate_tz(date_str)
1062 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
1065 if upload_date is not None:
1066 return compat_str(upload_date)
1069 def determine_ext(url, default_ext='unknown_video'):
1072 guess = url.partition('?')[0].rpartition('.')[2]
1073 if re.match(r'^[A-Za-z0-9]+$', guess):
1075 # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
1076 elif guess.rstrip('/') in KNOWN_EXTENSIONS:
1077 return guess.rstrip('/')
def subtitles_filename(filename, sub_lang, sub_format):
    """Derive the subtitle file name for a media file.

    The media extension (text after the last dot) is replaced by
    '<sub_lang>.<sub_format>'; a name without any dot gets both appended.
    """
    base = filename.rsplit('.', 1)[0]
    return '%s.%s.%s' % (base, sub_lang, sub_format)
1086 def date_from_str(date_str):
1088 Return a datetime object from a string in the format YYYYMMDD or
1089 (now|today)[+-][0-9](day|week|month|year)(s)?"""
1090 today = datetime.date.today()
1091 if date_str in ('now', 'today'):
1093 if date_str == 'yesterday':
1094 return today - datetime.timedelta(days=1)
1095 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
1096 if match is not None:
1097 sign = match.group('sign')
1098 time = int(match.group('time'))
1101 unit = match.group('unit')
1102 # A bad approximation?
1106 elif unit == 'year':
1110 delta = datetime.timedelta(**{unit: time})
1111 return today + delta
1112 return datetime.datetime.strptime(date_str, '%Y%m%d').date()
1115 def hyphenate_date(date_str):
1117 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1118 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1119 if match is not None:
1120 return '-'.join(match.groups())
1125 class DateRange(object):
1126 """Represents a time interval between two dates"""
1128 def __init__(self, start=None, end=None):
1129 """start and end must be strings in the format accepted by date"""
1130 if start is not None:
1131 self.start = date_from_str(start)
1133 self.start = datetime.datetime.min.date()
1135 self.end = date_from_str(end)
1137 self.end = datetime.datetime.max.date()
1138 if self.start > self.end:
1139 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
1143 """Returns a range that only contains the given day"""
1144 return cls(day, day)
    def __contains__(self, date):
        """Check if the date is in the range"""
        # strings are parsed with date_from_str (YYYYMMDD or relative forms)
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        # inclusive on both ends
        return self.start <= date <= self.end
1153 return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
1156 def platform_name():
1157 """ Returns the platform name as a compat_str """
1158 res = platform.platform()
1159 if isinstance(res, bytes):
1160 res = res.decode(preferredencoding())
1162 assert isinstance(res, compat_str)
1166 def _windows_write_string(s, out):
1167 """ Returns True if the string was written using special methods,
1168 False if it has yet to be written out."""
1169 # Adapted from http://stackoverflow.com/a/3259271/35070
1172 import ctypes.wintypes
1180 fileno = out.fileno()
1181 except AttributeError:
1182 # If the output stream doesn't have a fileno, it's virtual
1184 except io.UnsupportedOperation:
1185 # Some strange Windows pseudo files?
1187 if fileno not in WIN_OUTPUT_IDS:
1190 GetStdHandle = ctypes.WINFUNCTYPE(
1191 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
1192 (b'GetStdHandle', ctypes.windll.kernel32))
1193 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
1195 WriteConsoleW = ctypes.WINFUNCTYPE(
1196 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
1197 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
1198 ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
1199 written = ctypes.wintypes.DWORD(0)
1201 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
1202 FILE_TYPE_CHAR = 0x0002
1203 FILE_TYPE_REMOTE = 0x8000
1204 GetConsoleMode = ctypes.WINFUNCTYPE(
1205 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
1206 ctypes.POINTER(ctypes.wintypes.DWORD))(
1207 (b'GetConsoleMode', ctypes.windll.kernel32))
1208 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
1210 def not_a_console(handle):
1211 if handle == INVALID_HANDLE_VALUE or handle is None:
1213 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
1214 GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
1216 if not_a_console(h):
1219 def next_nonbmp_pos(s):
1221 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
1222 except StopIteration:
1226 count = min(next_nonbmp_pos(s), 1024)
1228 ret = WriteConsoleW(
1229 h, s, count if count else 2, ctypes.byref(written), None)
1231 raise OSError('Failed to write string')
1232 if not count: # We just wrote a non-BMP character
1233 assert written.value == 2
1236 assert written.value > 0
1237 s = s[written.value:]
def write_string(s, out=None, encoding=None):
    """Write the text string *s* to stream *out*, handling Windows consoles
    and Python 2 byte/text stream quirks.

    NOTE(review): this excerpt appears to be missing lines (the branch after
    _windows_write_string() and the plain-write paths) — confirm against upstream.
    """
    assert type(s) == compat_str

    # On Windows, try the native wide-char console write first.
    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        if _windows_write_string(s, out):
    if ('b' in getattr(out, 'mode', '') or
            sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
        # Byte stream (or Python 2): encode before writing.
        byt = s.encode(encoding or preferredencoding(), 'ignore')
    elif hasattr(out, 'buffer'):
        # Text stream with an underlying binary buffer: write encoded bytes there.
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        byt = s.encode(enc, 'ignore')
        out.buffer.write(byt)
def bytes_to_intlist(bs):
    """Convert a byte string to a list of integer byte values.

    NOTE(review): the Python 3 fast path's return appears elided from this excerpt.
    """
    if isinstance(bs[0], int):  # Python 3
    return [ord(c) for c in bs]
def intlist_to_bytes(xs):
    """Pack a list of integer byte values (0-255) back into a byte string."""
    return compat_struct_pack('%dB' % len(xs), *xs)
# Cross-platform file locking
# On Windows this uses LockFileEx/UnlockFileEx via ctypes; elsewhere fcntl
# advisory locks; a raising fallback covers platforms with neither.
# NOTE(review): several structural lines (try/except/else frames, list
# closers) appear elided from this excerpt — confirm against upstream.
if sys.platform == 'win32':
    import ctypes.wintypes

    # OVERLAPPED structure required by LockFileEx/UnlockFileEx.
    class OVERLAPPED(ctypes.Structure):
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Lock the whole possible file range.
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Keep the pointer alive on the file object for the matching unlock.
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # 0x2 requests an exclusive lock; 0x0 a shared one.
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

    # Some platforms, such as Jython, is missing fcntl
    def _lock_file(f, exclusive):
        fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        fcntl.flock(f, fcntl.LOCK_UN)

    UNSUPPORTED_MSG = 'file locking is not supported on this platform'

    # Fallback: fail loudly rather than silently skipping the lock.
    def _lock_file(f, exclusive):
        raise IOError(UNSUPPORTED_MSG)

    def _unlock_file(f):
        raise IOError(UNSUPPORTED_MSG)
class locked_file(object):
    """File wrapper holding an advisory lock for the life of a with-block.

    Writers ('w'/'a') take an exclusive lock, readers ('r') a shared one.
    """
    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        # NOTE(review): self.mode is read in __enter__ but its assignment
        # is not visible in this excerpt — confirm against upstream.

    def __enter__(self):
        exclusive = self.mode != 'r'
        _lock_file(self.f, exclusive)

    def __exit__(self, etype, value, traceback):
        _unlock_file(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
def get_filesystem_encoding():
    """Return the filesystem encoding, falling back to 'utf-8' when unknown."""
    enc = sys.getfilesystemencoding()
    if enc is None:
        enc = 'utf-8'
    return enc
def shell_quote(args):
    """Quote a list of arguments for display as a single shell command line.

    NOTE(review): the accumulator initialisation and loop header appear
    elided from this excerpt — confirm against upstream.
    """
    encoding = get_filesystem_encoding()
    if isinstance(a, bytes):
        # We may get a filename encoded with 'encodeFilename'
        a = a.decode(encoding)
    quoted_args.append(pipes.quote(a))
    return ' '.join(quoted_args)
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    # Serialise the payload as JSON and stash it in the URL fragment.
    payload = compat_urllib_parse_urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return '%s#%s' % (url, payload)
def unsmuggle_url(smug_url, default=None):
    """Reverse of smuggle_url(): split off the smuggled fragment payload.

    NOTE(review): the final return of (url, data) is not visible in this excerpt.
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
    data = json.loads(jsond)
def format_bytes(bytes):
    """Format a byte count as a human-readable string, e.g. '1.00MiB'.

    NOTE(review): guards for None/zero input appear elided from this excerpt.
    """
    if type(bytes) is str:
        bytes = float(bytes)
    # Pick the largest power of 1024 that fits.
    exponent = int(math.log(bytes, 1024.0))
    suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
def lookup_unit_table(unit_table, s):
    """Parse '<number> <unit>' from *s* using *unit_table* (unit -> multiplier)
    and return the integer value.

    NOTE(review): the re.search wrapper around the pattern appears elided.
    """
    units_re = '|'.join(re.escape(u) for u in unit_table)
    r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
    # A comma is treated as a decimal separator.
    num_str = m.group('num').replace(',', '.')
    mult = unit_table[m.group('unit')]
    return int(float(num_str) * mult)
def parse_filesize(s):
    """Parse a human-readable file size (e.g. '5.6mb') into a byte count.

    NOTE(review): the unit table is elided from this excerpt, and the last
    three lines below appear to belong to a separate, elided definition
    (a count parser) — confirm against upstream.
    """
    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    return lookup_unit_table(_UNIT_TABLE, s)
    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)
    return lookup_unit_table(_UNIT_TABLE, s)
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """
    # 1-based: 'January' -> 1 ... 'December' -> 12.
    return ENGLISH_MONTH_NAMES.index(name) + 1
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
    # Match against the first three letters of each month name ('Jan' -> 1).
    return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML"""
    # Negative lookahead keeps existing entities and character references intact.
    r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
def setproctitle(title):
    """Set the process title shown by tools like ps (Linux, via prctl)."""
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):

    libc = ctypes.cdll.LoadLibrary('libc.so.6')
    title_bytes = title.encode('utf-8')
    buf = ctypes.create_string_buffer(len(title_bytes))
    buf.value = title_bytes
    # 15 is PR_SET_NAME (see prctl(2)).
    libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
def remove_start(s, start):
    """Strip *start* from the beginning of *s* when present; tolerates None."""
    if s is None or not s.startswith(start):
        return s
    return s[len(start):]
def remove_end(s, end):
    """Strip *end* from the end of *s* when present; tolerates None."""
    if s is None or not s.endswith(end):
        return s
    return s[:-len(end)]
def remove_quotes(s):
    """Strip one matching pair of surrounding quotes from *s*, if present.

    NOTE(review): the return statements appear elided from this excerpt.
    """
    if s is None or len(s) < 2:
    for quote in ('"', "'", ):
        if s[0] == quote and s[-1] == quote:
def url_basename(url):
    """Return the last path component of *url*, ignoring query and fragment."""
    url_path = compat_urlparse.urlparse(url).path
    return url_path.strip('/').rpartition('/')[2]
class HEADRequest(compat_urllib_request.Request):
    # Request subclass whose HTTP method is overridden via get_method().
    def get_method(self):
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert *v* to int (optionally reading attribute *get_attr* first and
    rescaling by invscale/scale), returning *default* on failure.

    NOTE(review): the get_attr guard and None check appear elided here.
    """
    v = getattr(v, get_attr, None)
    return int(v) * invscale // scale
def str_or_none(v, default=None):
    """Coerce *v* to a text string, or return *default* when v is None."""
    if v is None:
        return default
    return compat_str(v)
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    # Drop thousands separators and plus signs before converting.
    int_str = re.sub(r'[,\.\+]', '', int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert *v* to float rescaled by invscale/scale, or *default* on failure."""
    return float(v) * invscale / scale
def parse_duration(s):
    """Parse a duration like '1:23:45', '3h 2min 1.5s' or '10.5 minutes'
    into a number of seconds.

    NOTE(review): the verbose-regex frame and several guards are elided
    from this excerpt — confirm against upstream.
    """
    if not isinstance(s, compat_basestring):

    days, hours, mins, secs, ms = [None] * 5
    # Colon-separated form: [[[DD:]HH:]MM:]SS[.ms]
    m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?$', s)
    days, hours, mins, secs, ms = m.groups()
        (?P<days>[0-9]+)\s*d(?:ays?)?\s*
        (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
        (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
        (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
    days, hours, mins, secs, ms = m.groups()
    # Plain '<n> hours' / '<n> minutes' form.
    m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)$', s)
    hours, mins = m.groups()

    # Accumulate whichever components were matched.
    duration += float(secs)
    duration += float(mins) * 60
    duration += float(hours) * 60 * 60
    duration += float(days) * 24 * 60 * 60
    duration += float(ms)
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert *ext* before the real extension ('a.mp4' -> 'a.temp.mp4');
    append it instead when the real extension does not match
    *expected_real_ext*.
    """
    name, real_ext = os.path.splitext(filename)
    '{0}.{1}{2}'.format(name, ext, real_ext)
    if not expected_real_ext or real_ext[1:] == expected_real_ext
    else '{0}.{1}'.format(filename, ext))
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the filename's extension with *ext*; when *expected_real_ext*
    is given and does not match, append instead of replacing.
    """
    name, real_ext = os.path.splitext(filename)
    return '{0}.{1}'.format(
        name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    # NOTE(review): the try/except frame and returns appear elided here.
    # The mutable default [] is safe since args is never mutated.
    subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    out, _ = subprocess.Popen(
        [encodeArgument(exe)] + args,
        stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from an executable's --version output.

    NOTE(review): the return statements appear elided from this excerpt.
    """
    assert isinstance(output, compat_str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    m = re.search(version_re, output)
class PagedList(object):
    # Abstract base for lazily-fetched paginated result lists.
    # NOTE(review): the __len__ def line appears elided from this excerpt.
        # This is only useful for tests
        return len(self.getslice())
class OnDemandPagedList(PagedList):
    """Paged list that fetches pages on demand via *pagefunc*, optionally
    caching fetched pages."""
    def __init__(self, pagefunc, pagesize, use_cache=False):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._use_cache = use_cache

    def getslice(self, start=0, end=None):
        # Walk page by page, trimming the first and last page to the slice.
        # NOTE(review): the res accumulator init, break statements and the
        # startv/endv assignments appear elided from this excerpt.
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:

            page_results = self._cache.get(pagenum)
            if page_results is None:
                page_results = list(self._pagefunc(pagenum))
            self._cache[pagenum] = page_results

            start % self._pagesize
            if firstid <= start < nextfirstid

            ((end - 1) % self._pagesize) + 1
            if (end is not None and firstid <= end <= nextfirstid)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
class InAdvancePagedList(PagedList):
    """Paged list where the total page count is known in advance."""
    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        # NOTE(review): the result accumulation/yield lines appear elided.
        start_page = start // self._pagesize
        self._pagecount if end is None else (end // self._pagesize + 1))
        skip_elems = start - start_page * self._pagesize
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            # Trim leading elements on the first page ...
            page = page[skip_elems:]
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                # ... and trailing elements once the requested count is reached.
                page = page[:only_more]
def uppercase_escape(s):
    """Decode \\UXXXXXXXX escape sequences in *s* to their characters.

    NOTE(review): the re.sub wrapper appears elided from this excerpt.
    """
    unicode_escape = codecs.getdecoder('unicode_escape')
    r'\\U[0-9a-fA-F]{8}',
    lambda m: unicode_escape(m.group(0))[0],
def lowercase_escape(s):
    """Decode \\uXXXX escape sequences in *s* to their characters.

    NOTE(review): the re.sub wrapper appears elided from this excerpt.
    """
    unicode_escape = codecs.getdecoder('unicode_escape')
    r'\\u[0-9a-fA-F]{4}',
    lambda m: unicode_escape(m.group(0))[0],
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # Reserved and common unreserved punctuation must survive quoting.
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    if sys.version_info < (3, 0) and isinstance(s, compat_str):
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, safe_chars)
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    # NOTE(review): the trailing .geturl() call appears elided here.
    url_parsed = compat_urllib_parse_urlparse(url)
    return url_parsed._replace(
        # IDNA-encode the host; percent-escape the remaining components.
        netloc=url_parsed.netloc.encode('idna').decode('ascii'),
        path=escape_rfc3986(url_parsed.path),
        params=escape_rfc3986(url_parsed.params),
        query=escape_rfc3986(url_parsed.query),
        fragment=escape_rfc3986(url_parsed.fragment)
def read_batch_urls(batch_fd):
    """Read URLs from a batch file object, skipping BOMs and comment lines.

    NOTE(review): the inner fixup() def line and its returns appear elided.
    """
    if not isinstance(url, compat_str):
        url = url.decode('utf-8', 'replace')
    # Strip a UTF-8 BOM that survived decoding as individual characters.
    BOM_UTF8 = '\xef\xbb\xbf'
    if url.startswith(BOM_UTF8):
        url = url[len(BOM_UTF8):]
    if url.startswith(('#', ';', ']')):

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as ASCII bytes."""
    encoded = compat_urllib_parse_urlencode(*args, **kargs)
    return encoded.encode('ascii')
def update_url_query(url, query):
    """Return *url* with *query* parameters merged into its query string.

    NOTE(review): the line merging *query* into qs is not visible in this
    excerpt — confirm against upstream.
    """
    parsed_url = compat_urlparse.urlparse(url)
    qs = compat_parse_qs(parsed_url.query)
    return compat_urlparse.urlunparse(parsed_url._replace(
        query=compat_urllib_parse_urlencode(qs, True)))
def update_Request(req, url=None, data=None, headers={}, query={}):
    """Build a new Request based on *req*, optionally overriding url, data,
    headers and query; preserves the HEAD method and timeout.

    NOTE(review): the new_req construction line and final return appear
    elided from this excerpt.
    """
    req_headers = req.headers.copy()
    req_headers.update(headers)
    req_data = data or req.data
    req_url = update_url_query(url or req.get_full_url(), query)
    # Preserve the request class so HEAD stays HEAD.
    req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request
        req_url, data=req_data, headers=req_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Look up one key, or the first usable key of a list, in dict *d*.

    NOTE(review): the in-loop return and the fall-through return default
    appear elided from this excerpt.
    """
    if isinstance(key_or_keys, (list, tuple)):
        for key in key_or_keys:
            # Skip missing, None and (optionally) falsy values.
            if key not in d or d[key] is None or skip_false_values and not d[key]:
    return d.get(key_or_keys, default)
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return *string* as compat_str, decoding byte strings with *encoding*.

    Note: the default encoding is evaluated once, at definition time.
    """
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
def parse_age_limit(s):
    """Parse an age limit such as '18+' into an int, falling back to the
    US_RATINGS mapping; returns None for unknown values."""
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    return int(m.group('age')) if m else US_RATINGS.get(s)
def strip_jsonp(code):
    """Strip a JSONP wrapper (callback name, parens, trailing comments),
    leaving the bare JSON payload.

    NOTE(review): the re.sub( call opener appears elided from this excerpt.
    """
    r'(?s)^[a-zA-Z0-9_.$]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
def js_to_json(code):
    """Convert a JavaScript object/value literal into valid JSON text.

    NOTE(review): the inner fix_kv() def line and several structural lines
    are elided from this excerpt — confirm against upstream.
    """
    if v in ('true', 'false', 'null'):
    elif v.startswith('/*') or v == ',':

    if v[0] in ("'", '"'):
        # Normalise string escapes inside single/double-quoted strings.
        v = re.sub(r'(?s)\\.|"', lambda m: {
        }.get(m.group(0), m.group(0)), v[1:-1])

        (r'^0[xX][0-9a-fA-F]+', 16),

    for regex, base in INTEGER_TABLE:
        im = re.match(regex, v)
        i = int(im.group(0), base)
        # A trailing colon means the number was an object key.
        return '"%d":' % i if v.endswith(':') else '%d' % i

    return re.sub(r'''(?sx)
        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
        /\*.*?\*/|,(?=\s*[\]}])|
        [a-zA-Z_][.a-zA-Z_0-9]*|
        (?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:\s*:)?|
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    # NOTE(review): the inner q() def line appears elided from this excerpt.
        return quality_ids.index(qid)
# Default output filename template: "<title>-<id>.<ext>".
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    # NOTE(review): the None/short-string fast paths appear elided here.
    return s[:length - len(ELLIPSES)] + ELLIPSES
def version_tuple(v):
    """Split a version string like '2016.06.03-1' into a tuple of ints."""
    parts = re.split(r'[-.]', v)
    return tuple(map(int, parts))
def is_outdated_version(version, limit, assume_new=True):
    """Compare two version strings; on parse failure, fall back to
    *assume_new* (i.e. report 'not outdated' when assume_new is True).

    NOTE(review): the try/except frame appears elided from this excerpt.
    """
    return not assume_new
    return version_tuple(version) < version_tuple(limit)
    return not assume_new
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter
    # Updatable when running from a zip bundle or a frozen (py2exe) build.
    if hasattr(sys, 'frozen'):
        return True
    return isinstance(globals().get('__loader__'), zipimporter)
def args_to_str(args):
    # Get a short string representation for a subprocess command
    quoted = [compat_shlex_quote(a) for a in args]
    return ' '.join(quoted)
def error_to_compat_str(err):
    """Stringify an exception portably across Python 2/3.

    NOTE(review): the initial err_str assignment appears elided here.
    """
    # On python 2 error byte string must be decoded with proper
    # encoding rather than ascii
    if sys.version_info[0] < 3:
        err_str = err_str.decode(preferredencoding())
def mimetype2ext(mt):
    """Map a MIME type to a file extension.

    NOTE(review): the lookup table is largely elided from this excerpt.
    """
    _, _, res = mt.rpartition('/')
        'smptett+xml': 'tt',
        'x-mp4-fragmented': 'mp4',
def urlhandle_detect_ext(url_handle):
    """Guess a file extension for a URL response, preferring the
    Content-Disposition filename over the Content-Type."""
    getheader = url_handle.headers.get

    cd = getheader('Content-Disposition')
    m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
    e = determine_ext(m.group('filename'), default_ext=None)

    return mimetype2ext(getheader('Content-Type'))
def encode_data_uri(data, mime_type):
    """Build an RFC 2397 data: URI carrying *data* base64-encoded."""
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    # NOTE(review): the early return for the no-limit case appears elided.
    if age_limit is None:  # No limit set
    if content_limit is None:
        return False  # Content available for everyone
    return age_limit < content_limit
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
    # BOM -> encoding pairs tried in order before falling back to UTF-8.
    # NOTE(review): the BOMS list opener and in-loop break appear elided.
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            s = first_bytes[len(bom):].decode(enc, 'replace')
    s = first_bytes.decode('utf-8', 'replace')

    # HTML-ish when the (decoded) content starts with a tag.
    return re.match(r'^\s*<', s)
def determine_protocol(info_dict):
    """Determine the download protocol for an info dict, preferring an
    explicit 'protocol' key, then URL scheme heuristics."""
    protocol = info_dict.get('protocol')
    if protocol is not None:

    url = info_dict['url']
    if url.startswith('rtmp'):
    elif url.startswith('mms'):
    elif url.startswith('rtsp'):

    ext = determine_ext(url)

    return compat_urllib_parse_urlparse(url).scheme
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    rows = [header_row] + data
    # Width of each column is its longest stringified cell.
    widths = [max(len(compat_str(cell)) for cell in column)
              for column in zip(*rows)]
    # Left-pad every column except the last one.
    fmt = ' '.join('%-' + compat_str(w + 1) + 's' for w in widths[:-1]) + '%s'
    return '\n'.join(fmt % tuple(row) for row in rows)
def _match_one(filter_part, dct):
    """Evaluate a single filter expression (e.g. 'duration > 60' or
    '!is_live') against dict *dct*.

    NOTE(review): the operator table entries and several regex/guard lines
    are elided from this excerpt — confirm against upstream.
    """
    COMPARISON_OPERATORS = {
    operator_rex = re.compile(r'''(?x)\s*
        \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
            (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
            (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    op = COMPARISON_OPERATORS[m.group('op')]
    if m.group('strval') is not None:
        if m.group('op') not in ('=', '!='):
                'Operator %s does not support string values!' % m.group('op'))
        comparison_value = m.group('strval')
        # Plain integer first; fall back to human file sizes ('500k', '2MiB').
        comparison_value = int(m.group('intval'))
        comparison_value = parse_filesize(m.group('intval'))
        if comparison_value is None:
            comparison_value = parse_filesize(m.group('intval') + 'B')
        if comparison_value is None:
                'Invalid integer value %r in filter part %r' % (
                    m.group('intval'), filter_part))
    actual_value = dct.get(m.group('key'))
    if actual_value is None:
        # Missing keys pass only when the '?' none-inclusive marker was used.
        return m.group('none_inclusive')
    return op(actual_value, comparison_value)

        '': lambda v: v is not None,
        '!': lambda v: v is None,
    operator_rex = re.compile(r'''(?x)\s*
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    op = UNARY_OPERATORS[m.group('op')]
    actual_value = dct.get(m.group('key'))
    return op(actual_value)

    raise ValueError('Invalid filter part %r' % filter_part)
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
    # NOTE(review): the all(...) wrapper line appears elided here.
        _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
def match_filter_func(filter_str):
    """Build a --match-filter callback: returns None when the video passes,
    or a human-readable skip message otherwise."""
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            video_title = info_dict.get('title', info_dict.get('id', 'video'))
            return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP/TTML time expression ('12.3s' or 'HH:MM:SS.mmm')
    into a number of seconds."""
    mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
        return float(mobj.group('time_offset'))

    mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
        return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
def srt_subtitles_timecode(seconds):
    """Format a second count as an SRT timecode, HH:MM:SS,mmm."""
    hours = seconds / 3600
    minutes = (seconds % 3600) / 60
    secs = seconds % 60
    millis = (seconds % 1) * 1000
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, millis)
def dfxp2srt(dfxp_data):
    """Convert DFXP/TTML subtitle XML into SRT text.

    NOTE(review): several structural lines (parser state init, out list,
    default-duration handling, the final join) are elided from this
    excerpt — confirm against upstream.
    """
    # Support both TTML namespaces plus un-namespaced documents.
    _x = functools.partial(xpath_with_ns, ns_map={
        'ttml': 'http://www.w3.org/ns/ttml',
        'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
        'ttaf1_0604': 'http://www.w3.org/2006/04/ttaf1',

    class TTMLPElementParser(object):
        # Streaming parser target that flattens a <p> element to text.
        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):

        def data(self, data):

            return self.out.strip()

    def parse_node(node):
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))

    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall(_x('.//ttaf1_0604:p')) or dfxp.findall('.//p')
        raise ValueError('Invalid dfxp/TTML subtitle')

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
def cli_option(params, command_option, param):
    """Build [command_option, value] for a CLI when params[param] is set,
    otherwise an empty list."""
    value = params.get(param)
    if value is None:
        return []
    return [command_option, value]
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Build a boolean CLI option, either as one 'opt<sep>value' token or as
    ['opt', 'value'].

    NOTE(review): the `if separator:` guard appears elided from this excerpt.
    """
    param = params.get(param)
    assert isinstance(param, bool)
        return [command_option + separator + (true_value if param else false_value)]
    return [command_option, true_value if param else false_value]
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Build [command_option] when params[param] equals expected_value,
    otherwise an empty list."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
def cli_configuration_args(params, param, default=[]):
    """Return the configured extra-args list for *param*, or *default*.

    NOTE(review): the early-return for a missing value appears elided here.
    """
    ex_args = params.get(param)
    assert isinstance(ex_args, list)
class ISO639Utils(object):
    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    # NOTE(review): the _lang_map table and @classmethod decorators are
    # elided from this excerpt.
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        return cls._lang_map.get(code[:2])

    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        for short_name, long_name in cls._lang_map.items():
            if long_name == code:
class ISO3166Utils(object):
    # From http://data.okfn.org/data/core/country-list
    # Maps ISO 3166-1 alpha-2 country codes to full English names.
    # NOTE(review): many entries and the dict delimiters are elided from
    # this excerpt.
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AS': 'American Samoa',
        'AG': 'Antigua and Barbuda',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BV': 'Bouvet Island',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BF': 'Burkina Faso',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CI': 'Côte d\'Ivoire',
        'CZ': 'Czech Republic',
        'DO': 'Dominican Republic',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GW': 'Guinea-Bissau',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'IR': 'Iran, Islamic Republic of',
        'IM': 'Isle of Man',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'LA': 'Lao People\'s Democratic Republic',
        'LI': 'Liechtenstein',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MH': 'Marshall Islands',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'PS': 'Palestine, State of',
        'PG': 'Papua New Guinea',
        'PH': 'Philippines',
        'PR': 'Puerto Rico',
        'RU': 'Russian Federation',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SL': 'Sierra Leone',
        'SX': 'Sint Maarten (Dutch part)',
        'SB': 'Solomon Islands',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'SJ': 'Svalbard and Jan Mayen',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TZ': 'Tanzania, United Republic of',
        'TL': 'Timor-Leste',
        'TT': 'Trinidad and Tobago',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',

    def short2full(cls, code):
        """Convert an ISO 3166-2 country code to the corresponding full name"""
        return cls._country_map.get(code.upper())
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler that honours a per-request 'Ytdl-request-proxy' header
    and routes socks* proxies via a 'Ytdl-socks-proxy' header instead."""
    def __init__(self, proxies=None):
        # Set default handlers
        for type in ('http', 'https'):
            setattr(self, '%s_open' % type,
                    # Bind loop variables as defaults to avoid late binding.
                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
                        meth(r, proxy, type))
        return compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # youtube-dl's http/https handlers do wrapping the socket with socks
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
def ohdave_rsa_encrypt(data, exponent, modulus):
    """
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    """
    # Interpret the little-endian bytes as one big integer ...
    plain_int = int(binascii.hexlify(data[::-1]), 16)
    # ... then apply modular exponentiation and render as lowercase hex.
    cipher_int = pow(plain_int, exponent, modulus)
    return '%x' % cipher_int
def encode_base_n(num, n, table=None):
    """Encode non-negative integer *num* in base *n* using *table* as the
    digit alphabet (defaults to a 62-character 0-9a-zA-Z table).

    NOTE(review): the table default, validation frame and loop appear
    elided from this excerpt.
    """
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
        table = FULL_TABLE[:n]

        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

        # Prepend digits from least to most significant.
        ret = table[num % n] + ret
2814 def decode_packed_codes(code):
2816 r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)",
2818 obfucasted_code, base, count, symbols = mobj.groups()
2821 symbols = symbols.split('|')
2826 base_n_count = encode_base_n(count, base)
2827 symbol_table[base_n_count] = symbols[count] or base_n_count
2830 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],