4 from __future__ import unicode_literals
34 import xml.etree.ElementTree
38 compat_HTMLParseError,
43 compat_ctypes_WINFUNCTYPE,
44 compat_etree_fromstring,
47 compat_html_entities_html5,
59 compat_urllib_parse_urlencode,
60 compat_urllib_parse_urlparse,
61 compat_urllib_parse_unquote_plus,
62 compat_urllib_request,
def register_socks_protocols():
    """Teach urlparse about the SOCKS URL schemes.

    Python < 2.6.5 mishandles URLs whose scheme is missing from
    urlparse.uses_netloc (https://bugs.python.org/issue7904), so make
    sure every SOCKS variant is listed there exactly once.
    """
    known_schemes = compat_urlparse.uses_netloc
    for candidate in ('socks', 'socks4', 'socks4a', 'socks5'):
        if candidate not in known_schemes:
            known_schemes.append(candidate)
82 # This is not clearly defined otherwise
83 compiled_regex_type = type(re.compile(''))
def random_user_agent():
    """Return a plausible Chrome-on-Windows User-Agent string."""
    # Template; '%s' receives a Chrome version string.
    _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
    # NOTE(review): _CHROME_VERSIONS (a long list of version strings) is
    # elided from this excerpt; random.choice picks one per call.
    return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)
1670 'User-Agent': random_user_agent(),
1671 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
1672 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
1673 'Accept-Encoding': 'gzip, deflate',
1674 'Accept-Language': 'en-us,en;q=0.5',
1679 'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
1683 NO_DEFAULT = object()
1685 ENGLISH_MONTH_NAMES = [
1686 'January', 'February', 'March', 'April', 'May', 'June',
1687 'July', 'August', 'September', 'October', 'November', 'December']
1690 'en': ENGLISH_MONTH_NAMES,
1692 'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
1693 'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
1696 KNOWN_EXTENSIONS = (
1697 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
1698 'flv', 'f4v', 'f4a', 'f4b',
1699 'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
1700 'mkv', 'mka', 'mk3d',
1703 'asf', 'wmv', 'wma',
1709 'f4f', 'f4m', 'm3u8', 'smil')
1711 # needed for sanitizing filenames in restricted mode
1712 ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
1713 itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
1714 'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
1737 '%Y/%m/%d %H:%M:%S',
1739 '%Y-%m-%d %H:%M:%S',
1740 '%Y-%m-%d %H:%M:%S.%f',
1743 '%Y-%m-%dT%H:%M:%SZ',
1744 '%Y-%m-%dT%H:%M:%S.%fZ',
1745 '%Y-%m-%dT%H:%M:%S.%f0Z',
1746 '%Y-%m-%dT%H:%M:%S',
1747 '%Y-%m-%dT%H:%M:%S.%f',
1749 '%b %d %Y at %H:%M',
1750 '%b %d %Y at %H:%M:%S',
1751 '%B %d %Y at %H:%M',
1752 '%B %d %Y at %H:%M:%S',
1755 DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
1756 DATE_FORMATS_DAY_FIRST.extend([
1762 '%d/%m/%Y %H:%M:%S',
1765 DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
1766 DATE_FORMATS_MONTH_FIRST.extend([
1771 '%m/%d/%Y %H:%M:%S',
1774 PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
1775 JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    # NOTE(review): the try/except fallback (to 'UTF-8') and the return
    # statement are elided from this excerpt.
    pref = locale.getpreferredencoding()
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """
    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()
        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object
        path_basename = lambda f: os.path.basename(fn).decode(encoding)
        # the same for os.path.dirname
        path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
    # NOTE(review): the 'else:' introducing this Python 3 branch is elided.
        path_basename = os.path.basename
        path_dirname = os.path.dirname
    # NamedTemporaryFile kwargs: create the temp file next to the target so
    # the final os.rename stays on one filesystem (the dict opening and the
    # 'suffix'/'delete' entries are elided from this excerpt).
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        'encoding': 'utf-8',
    tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
    # NOTE(review): the try/json.dump/close handling is elided.
    if sys.platform == 'win32':
        # Need to remove existing file on Windows, else os.rename raises
        # WindowsError or FileExistsError.
        os.rename(tf.name, fn)
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val] """
        # Only plain attribute names are safe to splice into the predicate.
        assert re.match(r'^[a-zA-Z_-]+$', key)
        if val is None:
            predicate = '[@%s]' % key
        else:
            predicate = "[@%s='%s']" % (key, val)
        return node.find(xpath + predicate)
    def find_xpath_attr(node, xpath, key, val=None):
        # Python 2.6 fallback: ElementTree there lacks attribute predicates,
        # so scan matching nodes manually. NOTE(review): the 'continue' and
        # the return statements of this loop are elided from this excerpt.
        for f in node.findall(compat_xpath(xpath)):
            if key not in f.attrib:
            if val is None or f.attrib.get(key) == val:
1862 # On python2.6 the xml.etree.ElementTree.Element methods don't support
1863 # the namespace parameter
def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' steps of an xpath using ns_map ({prefix: uri})."""
    components = [c.split(':') for c in path.split('/')]
    # NOTE(review): the 'replaced' list initialisation and the if/else that
    # distinguishes prefixed from unprefixed steps are elided.
    for c in components:
        # Unprefixed step: keep as-is.
            replaced.append(c[0])
        # Prefixed step: rewrite to ElementTree's '{uri}tag' form.
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Find an element via xpath; fall back to `default`, or raise if fatal."""
    def _find_xpath(xpath):
        return node.find(compat_xpath(xpath))

    # NOTE(review): the branch handling a sequence of candidate xpaths and
    # the n-is-None / non-fatal checks are elided from this excerpt.
    if isinstance(xpath, (str, compat_str)):
        n = _find_xpath(xpath)
    if default is not NO_DEFAULT:
        # (return default — elided)
    name = xpath if name is None else name
    raise ExtractorError('Could not find XML element %s' % name)
def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Like xpath_element, but return the matched element's text."""
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    # NOTE(review): the early returns for n/default and the n.text handling
    # are elided from this excerpt.
    if n is None or n == default:
    if default is not NO_DEFAULT:
    name = xpath if name is None else name
    raise ExtractorError('Could not find XML element\'s text %s' % name)
def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    """Return attribute `key` of the element at xpath, honouring default/fatal."""
    n = find_xpath_attr(node, xpath, key)
    # NOTE(review): the n-is-None check and 'return default' are elided.
    if default is not NO_DEFAULT:
    name = '%s[@%s]' % (xpath, key) if name is None else name
    raise ExtractorError('Could not find XML attribute %s' % name)
    return n.attrib[key]
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # Delegate to the generic attribute matcher.
    content = get_element_by_attribute('id', id, html)
    return content
def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    matches = get_elements_by_class(class_name, html)
    if matches:
        return matches[0]
    return None
def get_element_by_attribute(attribute, value, html, escape_value=True):
    """Return the content of the first tag whose attribute matches, or None."""
    matches = get_elements_by_attribute(attribute, value, html, escape_value)
    return next(iter(matches), None)
def get_elements_by_class(class_name, html):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    # Match the class name as a whole word anywhere inside the quoted
    # class attribute value; the pattern itself must not be re-escaped.
    class_pattern = r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name)
    return get_elements_by_attribute('class', class_pattern, html, escape_value=False)
def get_elements_by_attribute(attribute, value, html, escape_value=True):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    value = re.escape(value) if escape_value else value
    # NOTE(review): the retlist initialisation and several lines of the
    # verbose regex (the tag/attribute/content groups) are elided.
    for m in re.finditer(r'''(?xs)
        (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
        (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
        ''' % (re.escape(attribute), value), html):
        res = m.group('content')
        # Strip surrounding quotes if the matched value was quoted.
        if res.startswith('"') or res.startswith("'"):
        retlist.append(unescapeHTML(res))
class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""
    # NOTE(review): the __init__ header and the `self.attrs` default
    # initialisation are elided from this excerpt.
        compat_HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        # Record the element's attributes as a plain dict.
        self.attrs = dict(attrs)
def extract_attributes(html_element):
    """Given a string for an HTML element such as
    a="foo" B="bar" c="&98;az" d=boz
    empty= noval entity="&"
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }
    NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
    but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
    """
    parser = HTMLAttributeParser()
    # NOTE(review): the try:/parser.close() wrapping and the return of
    # parser.attrs are elided from this excerpt.
    parser.feed(html_element)
    # Older Python may throw HTMLParseError in case of malformed HTML
    except compat_HTMLParseError:
def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        # (return html — elided)

    # Collapse literal newlines; turn <br> and paragraph breaks into '\n'.
    html = html.replace('\n', ' ')
    html = re.sub(r'(?u)\s*<\s*br\s*/?\s*>\s*', '\n', html)
    html = re.sub(r'(?u)<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Strip any remaining tags.
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    # NOTE(review): the final whitespace strip / return is elided.
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    # NOTE(review): the stdout ('-') special-case check and the opening
    # 'try:' are elided from this excerpt.
    if sys.platform == 'win32':
        # Put stdout into binary mode so media bytes are not mangled.
        msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
    stream = open(encodeFilename(filename), open_mode)
    return (stream, filename)
    except (IOError, OSError) as err:
        # Permission errors cannot be fixed by renaming; re-raise (elided).
        if err.errno in (errno.EACCES,):

        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            # (raise — elided: sanitizing changed nothing, give up)
            # An exception here should be caught in the caller
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    # NOTE(review): the `timestamp = None` default and the final
    # `return timestamp` are elided from this excerpt.
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept
    if possible.
    """
    def replace_insane(char):
        # Transliterate accented characters in restricted mode.
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        # Control characters and '?' are always dropped (return elided).
        if char == '?' or ord(char) < 32 or ord(char) == 127:
        # NOTE(review): the '"' and ':' elif branches are partially elided.
            return '' if restricted else '\''
            return '_-' if restricted else ' -'
        elif char in '\\/|*<>':
            # (return '_' — elided)
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            # (return '_' — elided)
        if restricted and ord(char) > 127:
            # (return '_' — elided)
        # (return char — elided)

    # Keep timestamps like 12:34:56 readable by replacing ':' with '_'.
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(map(replace_insane, s))
    # NOTE(review): the `if not is_id:` guard around the cleanup below is elided.
    while '__' in result:
        result = result.replace('__', '_')
    result = result.strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
        # (result = result[2:] — elided)
    if result.startswith('-'):
        result = '_' + result[len('-'):]
    result = result.lstrip('.')
    # NOTE(review): the empty-result fallback and return are elided.
def sanitize_path(s):
    """Sanitizes and normalizes path on Windows"""
    # Non-Windows platforms need no sanitization (return s — elided).
    if sys.platform != 'win32':
    drive_or_unc, _ = os.path.splitdrive(s)
    if sys.version_info < (2, 7) and not drive_or_unc:
        drive_or_unc, _ = os.path.splitunc(s)
    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    # Replace characters Windows forbids in each component, keeping '.'/'..'
    # untouched. NOTE(review): the list opening/drive handling lines are elided.
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    return os.path.join(*sanitized_path)
def sanitize_url(url):
    """Normalise obviously-broken URLs (missing scheme, common typos)."""
    # Prepend protocol-less URLs with `http:` scheme in order to mitigate
    # the number of unwanted failures due to missing protocol
    if url.startswith('//'):
        return 'http:%s' % url
    # Fix some common typos seen so far
    # NOTE(review): the COMMON_TYPOS tuple opening and the final
    # `return url` are elided from this excerpt.
        # https://github.com/ytdl-org/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    for mistake, fixup in COMMON_TYPOS:
        if re.match(mistake, url):
            return re.sub(mistake, fixup, url)
def sanitized_Request(url, *args, **kwargs):
    """Build a urllib Request after normalising the URL via sanitize_url."""
    cleaned_url = sanitize_url(url)
    return compat_urllib_request.Request(cleaned_url, *args, **kwargs)
2153 """Expand shell variables and ~"""
2154 return os.path.expandvars(compat_expanduser(s))
2157 def orderedSet(iterable):
2158 """ Remove all duplicates from the input iterable """
def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character."""
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon. For example,
    # '&Eacute;ric' should be decoded as 'Éric'.
    if entity_with_semicolon in compat_html_entities_html5:
        return compat_html_entities_html5[entity_with_semicolon]

    # Numeric character reference: decimal '#NNN' or hex '#xNNN'.
    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            # (base = 16 — elided)
            numstr = '0%s' % numstr
        # (else: base = 10 — elided)
        # NOTE(review): the try:/except fallback for out-of-range codepoints
        # is elided from this excerpt.
        # See https://github.com/ytdl-org/youtube-dl/issues/7518
        return compat_chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity
def unescapeHTML(s):
    # NOTE(review): the `if s is None: return None` guard and the opening
    # `return re.sub(` line are elided from this excerpt.
    assert type(s) == compat_str
    # Each '&entity;' sequence is decoded by _htmlentity_transform.
        r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
def get_subprocess_encoding():
    """Encoding used when exchanging data with subprocesses on this platform."""
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    # (else: — elided)
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        # NOTE(review): the 'utf-8' fallback and return are elided.
def encodeFilename(s, for_subprocess=False):
    """Encode a filename for the OS, returning it unchanged where Unicode
    filesystem APIs exist.

    @param s The name of the file
    """
    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        # (return s — elided)

    # Pass '' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # (return s — elided)

    # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
    if sys.platform.startswith('java'):
        # (return s — elided)

    return s.encode(get_subprocess_encoding(), 'ignore')
def decodeFilename(b, for_subprocess=False):
    """Inverse of encodeFilename: decode an OS-level filename to text."""
    # On Python 3 (or if already text) the value is returned unchanged;
    # both early returns are elided from this excerpt.
    if sys.version_info >= (3, 0):
    if not isinstance(b, bytes):
    return b.decode(get_subprocess_encoding(), 'ignore')
def encodeArgument(s):
    """Encode a command-line argument the same way a filename is encoded
    for subprocess use."""
    if isinstance(s, compat_str):
        arg = s
    else:
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        arg = s.decode('ascii')
    return encodeFilename(arg, True)
def decodeArgument(b):
    """Inverse of encodeArgument: decode a subprocess argument to text."""
    return decodeFilename(b, for_subprocess=True)
def decodeOption(optval):
    """Decode a command-line option value to text if it arrived as bytes."""
    # NOTE(review): the None guard and the final `return optval` are elided.
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
def formatSeconds(secs):
    """Format a duration in seconds as H:MM:SS or M:SS text."""
    # NOTE(review): the `if secs > 3600:` / `elif secs > 60:` branching and
    # the plain-seconds fallback are elided from this excerpt.
    return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    return '%d:%02d' % (secs // 60, secs % 60)
def make_HTTPS_handler(params, **kwargs):
    """Build a YoutubeDLHTTPSHandler honouring the 'nocheckcertificate' option."""
    opts_no_check_certificate = params.get('nocheckcertificate', False)
    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if opts_no_check_certificate:
            # check_hostname must be disabled before verify_mode can be CERT_NONE.
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        # NOTE(review): the try:/except TypeError wrapper around this return
        # is elided from this excerpt.
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
        # (create_default_context present but HTTPSHandler has no context=)

    if sys.version_info < (3, 2):
        return YoutubeDLHTTPSHandler(params, **kwargs)
    else:  # Python < 3.4
        context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
def bug_reports_message():
    """Standard message suffix asking users to file a verbose bug report."""
    if ytdl_is_updateable():
        update_cmd = 'type youtube-dl -U to update'
    # (else: — elided)
        update_cmd = 'see https://yt-dl.org/update on how to update'
    msg = '; please report this issue on https://yt-dl.org/bug .'
    msg += ' Make sure you are using the latest version; %s.' % update_cmd
    msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
    # (return msg — elided)
class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors; all errors below derive from it."""
class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """
        # Network errors and timeouts are treated as expected, not as bugs
        # (the `expected = True` assignment is elided from this excerpt).
        if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
        if video_id is not None:
            msg = video_id + ': ' + msg
        # NOTE(review): the `if cause:` / `if not expected:` guards around
        # the next two lines are elided.
            msg += ' (caused by %r)' % cause
            msg += bug_reports_message()
        super(ExtractorError, self).__init__(msg)
        # NOTE(review): the traceback/expected/cause attribute assignments
        # are elided from this excerpt.
        self.exc_info = sys.exc_info()  # preserve original exception
        self.video_id = video_id

    def format_traceback(self):
        """Return the formatted original traceback (None if none was given)."""
        if self.traceback is None:
            # (return None — elided)
        return ''.join(traceback.format_tb(self.traceback))
class UnsupportedError(ExtractorError):
    """Raised when no extractor supports the given URL."""
    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        # An unsupported URL is an expected condition, not a bug.
        super(UnsupportedError, self).__init__(message, expected=True)
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """
    def __init__(self, msg, countries=None):
        super(GeoRestrictedError, self).__init__(msg, expected=True)
        # Optional collection of country codes from which the video is
        # available — presumably ISO 3166-1 codes; confirm against callers.
        self.countries = countries
class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info
class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        super(PostProcessingError, self).__init__(msg)
        # NOTE(review): a `self.msg = msg` assignment appears to be elided
        # from this excerpt.
class MaxDownloadsReached(YoutubeDLError):
    """ --max-downloads limit has been reached. """
class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        super(ContentTooShortError, self).__init__(
            'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected)
        )
        # Both byte counts are kept for callers that want to retry/resume.
        self.downloaded = downloaded
        self.expected = expected
class XAttrMetadataError(YoutubeDLError):
    """Raised when writing extended file attributes fails; classifies the
    failure into a machine-readable `reason`."""
    def __init__(self, code=None, msg='Unknown error'):
        super(XAttrMetadataError, self).__init__(msg)
        # NOTE(review): `self.code = code` / `self.msg = msg` assignments
        # are elided from this excerpt.

        # Parsing code and msg
        if (self.code in (errno.ENOSPC, errno.EDQUOT)
                or 'No space left' in self.msg or 'Disk quota excedded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        # (else: — elided)
            self.reason = 'NOT_SUPPORTED'
class XAttrUnavailableError(YoutubeDLError):
    """Raised when extended-attribute support is unavailable on this system."""
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """Construct an HTTP(S) connection, honouring the 'source_address' param."""
    # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
    # expected HTTP responses to meet HTTP/1.0 or later (see also
    # https://github.com/ytdl-org/youtube-dl/issues/6727)
    if sys.version_info < (3, 0):
        kwargs['strict'] = True
    hc = http_class(*args, **compat_kwargs(kwargs))
    source_address = ydl_handler._params.get('source_address')

    if source_address is not None:
        # This is to workaround _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            host, port = address
            # NOTE(review): the `err = None` initialisation is elided.
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            # Keep only addresses of the same family as source_address.
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                # (raise socket.error( — opening elided)
                "No remote IP%s addresses available for connect, can't use '%s' as source address"
                % (ip_version, source_address[0]))
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                # NOTE(review): `sock = None` and the `try:` are elided.
                sock = socket.socket(af, socktype, proto)
                if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                    sock.settimeout(timeout)
                sock.bind(source_address)
                # (sock.connect(sa) — elided)
                err = None  # Explicitly break reference cycle
                # (return sock — elided)
                except socket.error as _:
                    # (err = _ / cleanup — partially elided)
                    if sock is not None:
            # NOTE(review): the `if err is not None: raise err` path is elided.
            raise socket.error('getaddrinfo returns an empty list')
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
        # (else: Python 2.6 fallback — elided)
            def _hc_connect(self, *args, **kwargs):
                sock = _create_connection(
                    (self.host, self.port), self.timeout, sa)
                # NOTE(review): the `if is_https:` guard is elided.
                self.sock = ssl.wrap_socket(
                    sock, self.key_file, self.cert_file,
                    ssl_version=ssl.PROTOCOL_TLSv1)
                # (else: self.sock = sock — elided)
            hc.connect = functools.partial(_hc_connect, hc)
    # NOTE(review): the final `return hc` is elided from this excerpt.
def handle_youtubedl_headers(headers):
    """Strip the internal 'Youtubedl-no-compression' marker header.

    When the marker is present, return a copy of *headers* without the
    marker itself and without any 'Accept-Encoding' header (matched
    case-insensitively); otherwise return the mapping unchanged.
    """
    if 'Youtubedl-no-compression' not in headers:
        return headers
    filtered = {}
    for name, value in headers.items():
        if name == 'Youtubedl-no-compression' or name.lower() == 'accept-encoding':
            continue
        filtered[name] = value
    return filtered
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        conn_class = compat_http_client.HTTPConnection

        # Route through a SOCKS proxy if the request carries the marker header.
        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        # NOTE(review): the `if socks_proxy:` guard is elided.
        conn_class = make_socks_conn_class(conn_class, socks_proxy)
        del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
        # (req argument / closing paren elided)

    # NOTE(review): the deflate() staticmethod header and its try/except are
    # elided; raw-deflate is tried first, then zlib-wrapped deflate.
        return zlib.decompress(data, -zlib.MAX_WBITS)
        return zlib.decompress(data)

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]
        # (return req — elided)

    def http_response(self, req, resp):
        # NOTE(review): `old_resp = resp` and the gzip `try:` are elided.
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk add the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    # (try: — elided)
                    gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                    uncompressed = io.BytesIO(gz.read())
                    # NOTE(review): the except/break/else around this retry
                    # loop is elided; if no truncation works, re-raise.
                    raise original_ioerror
            resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Transparently decompress deflate-encoded bodies too.
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            # NOTE(review): the `if location:` guard is elided.
            # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
            if sys.version_info >= (3, 0):
                location = location.encode('iso-8859-1').decode('utf-8')
            # (else: — elided)
                location = location.decode('utf-8')
            location_escaped = escape_url(location)
            if location != location_escaped:
                del resp.headers['Location']
                if sys.version_info < (3, 0):
                    location_escaped = location_escaped.encode('utf-8')
                resp.headers['Location'] = location_escaped
        # (return resp — elided)

    https_request = http_request
    https_response = http_response
def make_socks_conn_class(base_class, socks_proxy):
    """Derive a connection class from base_class that tunnels via socks_proxy."""
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    if url_components.scheme.lower() == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif url_components.scheme.lower() in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif url_components.scheme.lower() == 'socks4a':
        socks_type = ProxyType.SOCKS4A

    def unquote_if_non_empty(s):
        # NOTE(review): the empty/None guard returning s unchanged is elided.
        return compat_urllib_parse_unquote_plus(s)

    # Arguments for sockssocket.setproxy; 1080 is the conventional SOCKS
    # port. NOTE(review): the tuple opening and some entries are elided.
        url_components.hostname, url_components.port or 1080,
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),

    class SocksConnection(base_class):
        # NOTE(review): the `def connect(self):` header is elided.
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if type(self.timeout) in (int, float):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                # (else: — elided)
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPSHandler supporting a custom connection class and SOCKS proxies."""
    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        # NOTE(review): the `kwargs = {}` initialisation is elided.
        conn_class = self._https_conn_class

        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        # NOTE(review): the `if socks_proxy:` guard is elided.
        conn_class = make_socks_conn_class(conn_class, socks_proxy)
        del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, True),
        # (req, **kwargs closing — elided)
class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar):
    """MozillaCookieJar that round-trips #HttpOnly_ lines and session cookies."""
    # Prefix Mozilla's cookies.txt uses for HttpOnly cookies; must be
    # stripped before the base class can parse the line.
    _HTTPONLY_PREFIX = '#HttpOnly_'

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        # Store session cookies with `expires` set to 0 instead of an empty
        # string. NOTE(review): the cookie loop and the `cookie.expires = 0`
        # assignment are elided from this excerpt.
        if cookie.expires is None:
        compat_cookiejar.MozillaCookieJar.save(self, filename, ignore_discard, ignore_expires)

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file."""
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            # (else: raise — the ValueError below belongs to that branch)
                raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)

        # NOTE(review): the StringIO buffer creation, the per-line loop and
        # the buffer rewind are elided from this excerpt.
        with open(filename) as f:
            if line.startswith(self._HTTPONLY_PREFIX):
                line = line[len(self._HTTPONLY_PREFIX):]
            cf.write(compat_str(line))
        self._really_load(cf, filename, ignore_discard, ignore_expires)
        # Session cookies are denoted by either `expires` field set to
        # an empty string or 0. MozillaCookieJar only recognizes the former
        # (see [1]). So we need force the latter to be recognized as session
        # cookies on our own.
        # Session cookies may be important for cookies-based authentication,
        # e.g. usually, when user does not check 'Remember me' check box while
        # logging in on a site, some important cookies are stored as session
        # cookies so that not recognizing them will result in failed login.
        # 1. https://bugs.python.org/issue17164
        # NOTE(review): the cookie loop header is elided.
        # Treat `expires=0` cookies as session cookies
        if cookie.expires == 0:
            cookie.expires = None
            cookie.discard = True
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """HTTPCookieProcessor that mirrors its HTTP handling onto HTTPS traffic."""
    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/ytdl-org/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
2798 class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
2799 if sys.version_info[0] < 3:
2800 def redirect_request(self, req, fp, code, msg, headers, newurl):
2801 # On python 2 urlh.geturl() may sometimes return redirect URL
2802 # as byte string instead of unicode. This workaround allows
2803 # to force it always return unicode.
2804 return compat_urllib_request.HTTPRedirectHandler.redirect_request(self, req, fp, code, msg, headers, compat_str(newurl))
2807 def extract_timezone(date_str):
2809 r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
2812 timezone = datetime.timedelta()
2814 date_str = date_str[:-len(m.group('tz'))]
2815 if not m.group('sign'):
2816 timezone = datetime.timedelta()
2818 sign = 1 if m.group('sign') == '+' else -1
2819 timezone = datetime.timedelta(
2820 hours=sign * int(m.group('hours')),
2821 minutes=sign * int(m.group('minutes')))
2822 return timezone, date_str
2825 def parse_iso8601(date_str, delimiter='T', timezone=None):
2826 """ Return a UNIX timestamp from the given date """
2828 if date_str is None:
2831 date_str = re.sub(r'\.[0-9]+', '', date_str)
2833 if timezone is None:
2834 timezone, date_str = extract_timezone(date_str)
2837 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
2838 dt = datetime.datetime.strptime(date_str, date_format) - timezone
2839 return calendar.timegm(dt.timetuple())
def date_formats(day_first=True):
    """Return the list of date-format strings to try, day-first or month-first."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
2848 def unified_strdate(date_str, day_first=True):
2849 """Return a string with the date in the format YYYYMMDD"""
2851 if date_str is None:
2855 date_str = date_str.replace(',', ' ')
2856 # Remove AM/PM + timezone
2857 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
2858 _, date_str = extract_timezone(date_str)
2860 for expression in date_formats(day_first):
2862 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
2865 if upload_date is None:
2866 timetuple = email.utils.parsedate_tz(date_str)
2869 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
2872 if upload_date is not None:
2873 return compat_str(upload_date)
2876 def unified_timestamp(date_str, day_first=True):
2877 if date_str is None:
2880 date_str = re.sub(r'[,|]', '', date_str)
2882 pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
2883 timezone, date_str = extract_timezone(date_str)
2885 # Remove AM/PM + timezone
2886 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
2888 # Remove unrecognized timezones from ISO 8601 alike timestamps
2889 m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
2891 date_str = date_str[:-len(m.group('tz'))]
2893 # Python only supports microseconds, so remove nanoseconds
2894 m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
2896 date_str = m.group(1)
2898 for expression in date_formats(day_first):
2900 dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
2901 return calendar.timegm(dt.timetuple())
2904 timetuple = email.utils.parsedate_tz(date_str)
2906 return calendar.timegm(timetuple) + pm_delta * 3600
def determine_ext(url, default_ext='unknown_video'):
    """Guess a file extension from a URL.

    Returns `default_ext` when no plausible extension can be found.
    """
    if url is None or '.' not in url:
        return default_ext
    candidate = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate
    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    trimmed = candidate.rstrip('/')
    if trimmed in KNOWN_EXTENSIONS:
        return trimmed
    return default_ext
def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
    """Return the subtitle filename: swap the media extension of `filename`
    for '<sub_lang>.<sub_format>' (e.g. video.mp4 -> video.en.vtt)."""
    return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)
2926 def date_from_str(date_str):
2928 Return a datetime object from a string in the format YYYYMMDD or
2929 (now|today)[+-][0-9](day|week|month|year)(s)?"""
2930 today = datetime.date.today()
2931 if date_str in ('now', 'today'):
2933 if date_str == 'yesterday':
2934 return today - datetime.timedelta(days=1)
2935 match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
2936 if match is not None:
2937 sign = match.group('sign')
2938 time = int(match.group('time'))
2941 unit = match.group('unit')
2942 # A bad approximation?
2946 elif unit == 'year':
2950 delta = datetime.timedelta(**{unit: time})
2951 return today + delta
2952 return datetime.datetime.strptime(date_str, '%Y%m%d').date()
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format.

    Input that does not look like YYYYMMDD is returned unchanged.
    """
    mobj = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if mobj is None:
        return date_str
    return '-'.join(mobj.groups())
2965 class DateRange(object):
2966 """Represents a time interval between two dates"""
2968 def __init__(self, start=None, end=None):
2969 """start and end must be strings in the format accepted by date"""
2970 if start is not None:
2971 self.start = date_from_str(start)
2973 self.start = datetime.datetime.min.date()
2975 self.end = date_from_str(end)
2977 self.end = datetime.datetime.max.date()
2978 if self.start > self.end:
2979 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
2983 """Returns a range that only contains the given day"""
2984 return cls(day, day)
2986 def __contains__(self, date):
2987 """Check if the date is in the range"""
2988 if not isinstance(date, datetime.date):
2989 date = date_from_str(date)
2990 return self.start <= date <= self.end
2993 return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
2996 def platform_name():
2997 """ Returns the platform name as a compat_str """
2998 res = platform.platform()
2999 if isinstance(res, bytes):
3000 res = res.decode(preferredencoding())
3002 assert isinstance(res, compat_str)
3006 def _windows_write_string(s, out):
3007 """ Returns True if the string was written using special methods,
3008 False if it has yet to be written out."""
3009 # Adapted from http://stackoverflow.com/a/3259271/35070
3012 import ctypes.wintypes
3020 fileno = out.fileno()
3021 except AttributeError:
3022 # If the output stream doesn't have a fileno, it's virtual
3024 except io.UnsupportedOperation:
3025 # Some strange Windows pseudo files?
3027 if fileno not in WIN_OUTPUT_IDS:
3030 GetStdHandle = compat_ctypes_WINFUNCTYPE(
3031 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
3032 ('GetStdHandle', ctypes.windll.kernel32))
3033 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
3035 WriteConsoleW = compat_ctypes_WINFUNCTYPE(
3036 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
3037 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
3038 ctypes.wintypes.LPVOID)(('WriteConsoleW', ctypes.windll.kernel32))
3039 written = ctypes.wintypes.DWORD(0)
3041 GetFileType = compat_ctypes_WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(('GetFileType', ctypes.windll.kernel32))
3042 FILE_TYPE_CHAR = 0x0002
3043 FILE_TYPE_REMOTE = 0x8000
3044 GetConsoleMode = compat_ctypes_WINFUNCTYPE(
3045 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
3046 ctypes.POINTER(ctypes.wintypes.DWORD))(
3047 ('GetConsoleMode', ctypes.windll.kernel32))
3048 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
3050 def not_a_console(handle):
3051 if handle == INVALID_HANDLE_VALUE or handle is None:
3053 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
3054 or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
3056 if not_a_console(h):
3059 def next_nonbmp_pos(s):
3061 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
3062 except StopIteration:
3066 count = min(next_nonbmp_pos(s), 1024)
3068 ret = WriteConsoleW(
3069 h, s, count if count else 2, ctypes.byref(written), None)
3071 raise OSError('Failed to write string')
3072 if not count: # We just wrote a non-BMP character
3073 assert written.value == 2
3076 assert written.value > 0
3077 s = s[written.value:]
3081 def write_string(s, out=None, encoding=None):
3084 assert type(s) == compat_str
3086 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
3087 if _windows_write_string(s, out):
3090 if ('b' in getattr(out, 'mode', '')
3091 or sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
3092 byt = s.encode(encoding or preferredencoding(), 'ignore')
3094 elif hasattr(out, 'buffer'):
3095 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
3096 byt = s.encode(enc, 'ignore')
3097 out.buffer.write(byt)
def bytes_to_intlist(bs):
    """Convert a bytes-like (or Python 2 str) value into a list of ints."""
    if not bs:
        return []
    # Python 3: indexing bytes yields ints already
    if isinstance(bs[0], int):
        return list(bs)
    # Python 2: indexing a str yields 1-char strings
    return [ord(ch) for ch in bs]
3112 def intlist_to_bytes(xs):
3115 return compat_struct_pack('%dB' % len(xs), *xs)
3118 # Cross-platform file locking
3119 if sys.platform == 'win32':
3120 import ctypes.wintypes
3123 class OVERLAPPED(ctypes.Structure):
3125 ('Internal', ctypes.wintypes.LPVOID),
3126 ('InternalHigh', ctypes.wintypes.LPVOID),
3127 ('Offset', ctypes.wintypes.DWORD),
3128 ('OffsetHigh', ctypes.wintypes.DWORD),
3129 ('hEvent', ctypes.wintypes.HANDLE),
3132 kernel32 = ctypes.windll.kernel32
3133 LockFileEx = kernel32.LockFileEx
3134 LockFileEx.argtypes = [
3135 ctypes.wintypes.HANDLE, # hFile
3136 ctypes.wintypes.DWORD, # dwFlags
3137 ctypes.wintypes.DWORD, # dwReserved
3138 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
3139 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
3140 ctypes.POINTER(OVERLAPPED) # Overlapped
3142 LockFileEx.restype = ctypes.wintypes.BOOL
3143 UnlockFileEx = kernel32.UnlockFileEx
3144 UnlockFileEx.argtypes = [
3145 ctypes.wintypes.HANDLE, # hFile
3146 ctypes.wintypes.DWORD, # dwReserved
3147 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
3148 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
3149 ctypes.POINTER(OVERLAPPED) # Overlapped
3151 UnlockFileEx.restype = ctypes.wintypes.BOOL
3152 whole_low = 0xffffffff
3153 whole_high = 0x7fffffff
3155 def _lock_file(f, exclusive):
3156 overlapped = OVERLAPPED()
3157 overlapped.Offset = 0
3158 overlapped.OffsetHigh = 0
3159 overlapped.hEvent = 0
3160 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
3161 handle = msvcrt.get_osfhandle(f.fileno())
3162 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
3163 whole_low, whole_high, f._lock_file_overlapped_p):
3164 raise OSError('Locking file failed: %r' % ctypes.FormatError())
3166 def _unlock_file(f):
3167 assert f._lock_file_overlapped_p
3168 handle = msvcrt.get_osfhandle(f.fileno())
3169 if not UnlockFileEx(handle, 0,
3170 whole_low, whole_high, f._lock_file_overlapped_p):
3171 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
3174 # Some platforms, such as Jython, is missing fcntl
3178 def _lock_file(f, exclusive):
3179 fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
3181 def _unlock_file(f):
3182 fcntl.flock(f, fcntl.LOCK_UN)
3184 UNSUPPORTED_MSG = 'file locking is not supported on this platform'
3186 def _lock_file(f, exclusive):
3187 raise IOError(UNSUPPORTED_MSG)
3189 def _unlock_file(f):
3190 raise IOError(UNSUPPORTED_MSG)
3193 class locked_file(object):
3194 def __init__(self, filename, mode, encoding=None):
3195 assert mode in ['r', 'a', 'w']
3196 self.f = io.open(filename, mode, encoding=encoding)
3199 def __enter__(self):
3200 exclusive = self.mode != 'r'
3202 _lock_file(self.f, exclusive)
3208 def __exit__(self, etype, value, traceback):
3210 _unlock_file(self.f)
3217 def write(self, *args):
3218 return self.f.write(*args)
3220 def read(self, *args):
3221 return self.f.read(*args)
def get_filesystem_encoding():
    """Return the filesystem encoding, falling back to UTF-8 when unknown."""
    enc = sys.getfilesystemencoding()
    if enc is None:
        return 'utf-8'
    return enc
3229 def shell_quote(args):
3231 encoding = get_filesystem_encoding()
3233 if isinstance(a, bytes):
3234 # We may get a filename encoded with 'encodeFilename'
3235 a = a.decode(encoding)
3236 quoted_args.append(compat_shlex_quote(a))
3237 return ' '.join(quoted_args)
3240 def smuggle_url(url, data):
3241 """ Pass additional data in a URL for internal use. """
3243 url, idata = unsmuggle_url(url, {})
3245 sdata = compat_urllib_parse_urlencode(
3246 {'__youtubedl_smuggle': json.dumps(data)})
3247 return url + '#' + sdata
3250 def unsmuggle_url(smug_url, default=None):
3251 if '#__youtubedl_smuggle' not in smug_url:
3252 return smug_url, default
3253 url, _, sdata = smug_url.rpartition('#')
3254 jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
3255 data = json.loads(jsond)
def format_bytes(bytes):
    """Format a byte count as a human-readable string (e.g. '1.00KiB').

    Accepts ints, floats or numeric strings; returns 'N/A' for None.
    """
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    suffixes = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        # Clamp so absurdly large values cannot index past the suffix table
        # (the unclamped int(log(...)) raised IndexError for >= 1024**9).
        exponent = min(int(math.log(bytes, 1024.0)), len(suffixes) - 1)
    suffix = suffixes[exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
def lookup_unit_table(unit_table, s):
    """Parse a leading '<number> <unit>' from `s` using `unit_table`.

    Returns the value in base units as an int, or None when `s` does not
    start with a recognized quantity.  A comma is accepted as a decimal
    separator.
    """
    units_re = '|'.join(re.escape(u) for u in unit_table)
    mobj = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
    if mobj is None:
        return None
    number = float(mobj.group('num').replace(',', '.'))
    return int(number * unit_table[mobj.group('unit')])
3284 def parse_filesize(s):
3288 # The lower-case forms are of course incorrect and unofficial,
3289 # but we support those too
3306 'megabytes': 1000 ** 2,
3307 'mebibytes': 1024 ** 2,
3313 'gigabytes': 1000 ** 3,
3314 'gibibytes': 1024 ** 3,
3320 'terabytes': 1000 ** 4,
3321 'tebibytes': 1024 ** 4,
3327 'petabytes': 1000 ** 5,
3328 'pebibytes': 1024 ** 5,
3334 'exabytes': 1000 ** 6,
3335 'exbibytes': 1024 ** 6,
3341 'zettabytes': 1000 ** 7,
3342 'zebibytes': 1024 ** 7,
3348 'yottabytes': 1000 ** 8,
3349 'yobibytes': 1024 ** 8,
3352 return lookup_unit_table(_UNIT_TABLE, s)
3361 if re.match(r'^[\d,.]+$', s):
3362 return str_to_int(s)
3373 return lookup_unit_table(_UNIT_TABLE, s)
def parse_resolution(s):
    """Extract width/height from a resolution string like '1920x1080',
    '720p' or '4k'.  Returns a (possibly empty) dict with 'width' and/or
    'height' keys."""
    if s is None:
        return {}

    dims = re.search(r'\b(?P<w>\d+)\s*[xX×]\s*(?P<h>\d+)\b', s)
    if dims:
        return {
            'width': int(dims.group('w')),
            'height': int(dims.group('h')),
        }

    scan = re.search(r'\b(\d+)[pPiI]\b', s)
    if scan:
        return {'height': int(scan.group(1))}

    kres = re.search(r'\b([48])[kK]\b', s)
    if kres:
        return {'height': int(kres.group(1)) * 540}

    return {}
3398 def parse_bitrate(s):
3399 if not isinstance(s, compat_str):
3401 mobj = re.search(r'\b(\d+)\s*kbps', s)
3403 return int(mobj.group(1))
3406 def month_by_name(name, lang='en'):
3407 """ Return the number of a month by (locale-independently) English name """
3409 month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
3412 return month_names.index(name) + 1
3417 def month_by_abbreviation(abbrev):
3418 """ Return the number of a month by (locale-independently) English
3422 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
def fix_xml_ampersands(xml_str):
    """Escape bare '&' characters in XML as '&amp;', leaving existing
    entities (named and numeric) untouched."""
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
        '&amp;', xml_str)
3435 def setproctitle(title):
3436 assert isinstance(title, compat_str)
3438 # ctypes in Jython is not complete
3439 # http://bugs.jython.org/issue2148
3440 if sys.platform.startswith('java'):
3444 libc = ctypes.cdll.LoadLibrary('libc.so.6')
3448 # LoadLibrary in Windows Python 2.7.13 only expects
3449 # a bytestring, but since unicode_literals turns
3450 # every string into a unicode string, it fails.
3452 title_bytes = title.encode('utf-8')
3453 buf = ctypes.create_string_buffer(len(title_bytes))
3454 buf.value = title_bytes
3456 libc.prctl(15, buf, 0, 0, 0)
3457 except AttributeError:
3458 return # Strange libc, just skip this
def remove_start(s, start):
    """Strip `start` from the beginning of `s` if present; None passes through."""
    if s is None or not s.startswith(start):
        return s
    return s[len(start):]
def remove_end(s, end):
    """Strip `end` from the end of `s` if present; None passes through."""
    if s is None or not s.endswith(end):
        return s
    return s[:-len(end)]
def remove_quotes(s):
    """Drop one matching pair of surrounding single or double quotes, if any."""
    if s is None or len(s) < 2:
        return s
    for quote in ('"', "'"):
        if s[0] == quote and s[-1] == quote:
            return s[1:-1]
    return s
def url_basename(url):
    """Return the last path component of `url` (ignoring query/fragment and
    trailing slashes)."""
    parsed_path = compat_urlparse.urlparse(url).path
    return parsed_path.strip('/').rpartition('/')[2]
3484 return re.match(r'https?://[^?#&]+/', url).group()
3487 def urljoin(base, path):
3488 if isinstance(path, bytes):
3489 path = path.decode('utf-8')
3490 if not isinstance(path, compat_str) or not path:
3492 if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
3494 if isinstance(base, bytes):
3495 base = base.decode('utf-8')
3496 if not isinstance(base, compat_str) or not re.match(
3497 r'^(?:https?:)?//', base):
3499 return compat_urlparse.urljoin(base, path)
3502 class HEADRequest(compat_urllib_request.Request):
3503 def get_method(self):
3507 class PUTRequest(compat_urllib_request.Request):
3508 def get_method(self):
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Coerce `v` to an int (scaled by invscale/scale), or return `default`.

    When `get_attr` is given, the value is first read from that attribute
    of `v`.  Empty strings, None and unconvertible values yield `default`.
    """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v == '':
        v = None
    if v is None:
        return default
    try:
        return int(v) * invscale // scale
    except (ValueError, TypeError):
        return default
def str_or_none(v, default=None):
    """Convert `v` to compat_str, or return `default` when `v` is None."""
    return default if v is None else compat_str(v)
3530 def str_to_int(int_str):
3531 """ A more relaxed version of int_or_none """
3532 if isinstance(int_str, compat_integer_types):
3534 elif isinstance(int_str, compat_str):
3535 int_str = re.sub(r'[,\.\+]', '', int_str)
3536 return int_or_none(int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Coerce `v` to a float (scaled by invscale/scale), or return `default`."""
    if v is None:
        return default
    try:
        result = float(v)
    except (ValueError, TypeError):
        return default
    return result * invscale / scale
def bool_or_none(v, default=None):
    """Return `v` only when it is a real bool; anything else (including
    ints) yields `default`."""
    if isinstance(v, bool):
        return v
    return default
def strip_or_none(v, default=None):
    """Return `v` stripped of surrounding whitespace when it is a string;
    anything else yields `default`."""
    return v.strip() if isinstance(v, compat_str) else default
3556 def url_or_none(url):
3557 if not url or not isinstance(url, compat_str):
3560 return url if re.match(r'^(?:[a-zA-Z][\da-zA-Z.+-]*:)?//', url) else None
3563 def parse_duration(s):
3564 if not isinstance(s, compat_basestring):
3569 days, hours, mins, secs, ms = [None] * 5
3570 m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?Z?$', s)
3572 days, hours, mins, secs, ms = m.groups()
3577 [0-9]+\s*y(?:ears?)?\s*
3580 [0-9]+\s*m(?:onths?)?\s*
3583 [0-9]+\s*w(?:eeks?)?\s*
3586 (?P<days>[0-9]+)\s*d(?:ays?)?\s*
3590 (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
3593 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
3596 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
3599 days, hours, mins, secs, ms = m.groups()
3601 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
3603 hours, mins = m.groups()
3609 duration += float(secs)
3611 duration += float(mins) * 60
3613 duration += float(hours) * 60 * 60
3615 duration += float(days) * 24 * 60 * 60
3617 duration += float(ms)
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert `ext` before the filename's real extension:
    'a.mp4', 'temp' -> 'a.temp.mp4'.

    If `expected_real_ext` is given and the actual extension differs,
    `ext` is appended after the whole name instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return '{0}.{1}'.format(filename, ext)
    return '{0}.{1}{2}'.format(name, ext, real_ext)
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the filename's extension with `ext`.

    If `expected_real_ext` is given and the actual extension differs,
    `ext` is appended to the full filename instead of replacing anything.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        base = filename
    else:
        base = name
    return '{0}.{1}'.format(base, ext)
3636 def check_executable(exe, args=[]):
3637 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
3638 args can be a list of arguments for a short output (like -version) """
3640 subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
3646 def get_exe_version(exe, args=['--version'],
3647 version_re=None, unrecognized='present'):
3648 """ Returns the version of the specified executable,
3649 or False if the executable is not present """
3651 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
3652 # SIGTTOU if youtube-dl is run in the background.
3653 # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
3654 out, _ = subprocess.Popen(
3655 [encodeArgument(exe)] + args,
3656 stdin=subprocess.PIPE,
3657 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
3660 if isinstance(out, bytes): # Python 2.x
3661 out = out.decode('ascii', 'ignore')
3662 return detect_exe_version(out, version_re, unrecognized)
3665 def detect_exe_version(output, version_re=None, unrecognized='present'):
3666 assert isinstance(output, compat_str)
3667 if version_re is None:
3668 version_re = r'version\s+([-0-9._a-zA-Z]+)'
3669 m = re.search(version_re, output)
3676 class PagedList(object):
3678 # This is only useful for tests
3679 return len(self.getslice())
3682 class OnDemandPagedList(PagedList):
3683 def __init__(self, pagefunc, pagesize, use_cache=True):
3684 self._pagefunc = pagefunc
3685 self._pagesize = pagesize
3686 self._use_cache = use_cache
3690 def getslice(self, start=0, end=None):
3692 for pagenum in itertools.count(start // self._pagesize):
3693 firstid = pagenum * self._pagesize
3694 nextfirstid = pagenum * self._pagesize + self._pagesize
3695 if start >= nextfirstid:
3700 page_results = self._cache.get(pagenum)
3701 if page_results is None:
3702 page_results = list(self._pagefunc(pagenum))
3704 self._cache[pagenum] = page_results
3707 start % self._pagesize
3708 if firstid <= start < nextfirstid
3712 ((end - 1) % self._pagesize) + 1
3713 if (end is not None and firstid <= end <= nextfirstid)
3716 if startv != 0 or endv is not None:
3717 page_results = page_results[startv:endv]
3718 res.extend(page_results)
3720 # A little optimization - if current page is not "full", ie. does
3721 # not contain page_size videos then we can assume that this page
3722 # is the last one - there are no more ids on further pages -
3723 # i.e. no need to query again.
3724 if len(page_results) + startv < self._pagesize:
3727 # If we got the whole page, but the next page is not interesting,
3728 # break out early as well
3729 if end == nextfirstid:
3734 class InAdvancePagedList(PagedList):
3735 def __init__(self, pagefunc, pagecount, pagesize):
3736 self._pagefunc = pagefunc
3737 self._pagecount = pagecount
3738 self._pagesize = pagesize
3740 def getslice(self, start=0, end=None):
3742 start_page = start // self._pagesize
3744 self._pagecount if end is None else (end // self._pagesize + 1))
3745 skip_elems = start - start_page * self._pagesize
3746 only_more = None if end is None else end - start
3747 for pagenum in range(start_page, end_page):
3748 page = list(self._pagefunc(pagenum))
3750 page = page[skip_elems:]
3752 if only_more is not None:
3753 if len(page) < only_more:
3754 only_more -= len(page)
3756 page = page[:only_more]
def uppercase_escape(s):
    """Decode literal '\\UXXXXXXXX' escape sequences in `s` into characters."""
    decode = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda mobj: decode(mobj.group(0))[0],
        s)
def lowercase_escape(s):
    """Decode literal '\\uXXXX' escape sequences in `s` into characters."""
    decode = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\u[0-9a-fA-F]{4}',
        lambda mobj: decode(mobj.group(0))[0],
        s)
3779 def escape_rfc3986(s):
3780 """Escape non-ASCII characters as suggested by RFC 3986"""
3781 if sys.version_info < (3, 0) and isinstance(s, compat_str):
3782 s = s.encode('utf-8')
3783 return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
3786 def escape_url(url):
3787 """Escape URL as suggested by RFC 3986"""
3788 url_parsed = compat_urllib_parse_urlparse(url)
3789 return url_parsed._replace(
3790 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
3791 path=escape_rfc3986(url_parsed.path),
3792 params=escape_rfc3986(url_parsed.params),
3793 query=escape_rfc3986(url_parsed.query),
3794 fragment=escape_rfc3986(url_parsed.fragment)
3798 def read_batch_urls(batch_fd):
3800 if not isinstance(url, compat_str):
3801 url = url.decode('utf-8', 'replace')
3802 BOM_UTF8 = '\xef\xbb\xbf'
3803 if url.startswith(BOM_UTF8):
3804 url = url[len(BOM_UTF8):]
3806 if url.startswith(('#', ';', ']')):
3810 with contextlib.closing(batch_fd) as fd:
3811 return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode the given query and return it as ASCII bytes, ready to be
    used as the body of a POST request."""
    return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii')
3818 def update_url_query(url, query):
3821 parsed_url = compat_urlparse.urlparse(url)
3822 qs = compat_parse_qs(parsed_url.query)
3824 return compat_urlparse.urlunparse(parsed_url._replace(
3825 query=compat_urllib_parse_urlencode(qs, True)))
3828 def update_Request(req, url=None, data=None, headers={}, query={}):
3829 req_headers = req.headers.copy()
3830 req_headers.update(headers)
3831 req_data = data or req.data
3832 req_url = update_url_query(url or req.get_full_url(), query)
3833 req_get_method = req.get_method()
3834 if req_get_method == 'HEAD':
3835 req_type = HEADRequest
3836 elif req_get_method == 'PUT':
3837 req_type = PUTRequest
3839 req_type = compat_urllib_request.Request
3841 req_url, data=req_data, headers=req_headers,
3842 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
3843 if hasattr(req, 'timeout'):
3844 new_req.timeout = req.timeout
3848 def _multipart_encode_impl(data, boundary):
3849 content_type = 'multipart/form-data; boundary=%s' % boundary
3852 for k, v in data.items():
3853 out += b'--' + boundary.encode('ascii') + b'\r\n'
3854 if isinstance(k, compat_str):
3855 k = k.encode('utf-8')
3856 if isinstance(v, compat_str):
3857 v = v.encode('utf-8')
3858 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
3859 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
3860 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
3861 if boundary.encode('ascii') in content:
3862 raise ValueError('Boundary overlaps with data')
3865 out += b'--' + boundary.encode('ascii') + b'--\r\n'
3867 return out, content_type
3870 def multipart_encode(data, boundary=None):
3872 Encode a dict to RFC 7578-compliant form-data
3875 A dict where keys and values can be either Unicode or bytes-like
3878 If specified a Unicode object, it's used as the boundary. Otherwise
3879 a random boundary is generated.
3881 Reference: https://tools.ietf.org/html/rfc7578
3883 has_specified_boundary = boundary is not None
3886 if boundary is None:
3887 boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
3890 out, content_type = _multipart_encode_impl(data, boundary)
3893 if has_specified_boundary:
3897 return out, content_type
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Fetch a value from `d` for one key, or the first usable value for a
    sequence of keys.

    With a sequence of keys, None values are always skipped and falsy values
    are skipped too unless `skip_false_values` is False.  With a single key
    the value is returned as-is (no skipping).
    """
    if not isinstance(key_or_keys, (list, tuple)):
        return d.get(key_or_keys, default)
    for key in key_or_keys:
        if key not in d or d[key] is None:
            continue
        if skip_false_values and not d[key]:
            continue
        return d[key]
    return default
def try_get(src, getter, expected_type=None):
    """Apply one or more getter callables to `src` and return the first
    result that does not raise and (when given) matches `expected_type`.

    Lookup errors (AttributeError/KeyError/TypeError/IndexError) are
    swallowed; returns None when nothing matches.
    """
    getters = getter if isinstance(getter, (list, tuple)) else [getter]
    for get in getters:
        try:
            v = get(src)
        except (AttributeError, KeyError, TypeError, IndexError):
            continue
        if expected_type is None or isinstance(v, expected_type):
            return v
3923 def merge_dicts(*dicts):
3925 for a_dict in dicts:
3926 for k, v in a_dict.items():
3930 or (isinstance(v, compat_str) and v
3931 and isinstance(merged[k], compat_str)
3932 and not merged[k])):
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Decode `string` to compat_str (unicode) using `encoding`; unicode
    input passes through unchanged.

    NOTE: the default for `encoding` is evaluated once, at import time.
    """
    return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)
3950 TV_PARENTAL_GUIDELINES = {
3960 def parse_age_limit(s):
3962 return s if 0 <= s <= 21 else None
3963 if not isinstance(s, compat_basestring):
3965 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
3967 return int(m.group('age'))
3969 return US_RATINGS[s]
3970 m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
3972 return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
3976 def strip_jsonp(code):
3979 (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
3980 (?:\s*&&\s*(?P=func_name))?
3981 \s*\(\s*(?P<callback_data>.*)\);?
3982 \s*?(?://[^\n]*)*$''',
3983 r'\g<callback_data>', code)
3986 def js_to_json(code):
3987 COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*'
3988 SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE)
3990 (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16),
3991 (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8),
3996 if v in ('true', 'false', 'null'):
3998 elif v.startswith('/*') or v.startswith('//') or v == ',':
4001 if v[0] in ("'", '"'):
4002 v = re.sub(r'(?s)\\.|"', lambda m: {
4007 }.get(m.group(0), m.group(0)), v[1:-1])
4009 for regex, base in INTEGER_TABLE:
4010 im = re.match(regex, v)
4012 i = int(im.group(1), base)
4013 return '"%d":' % i if v.endswith(':') else '%d' % i
4017 return re.sub(r'''(?sx)
4018 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
4019 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
4020 {comment}|,(?={skip}[\]}}])|
4021 (?:(?<![0-9])[eE]|[a-df-zA-DF-Z_])[.a-zA-Z_0-9]*|
4022 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
4024 '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
def qualities(quality_ids):
    """Return a callable mapping a quality id to its position in
    `quality_ids`; ids not in the list map to -1 so known qualities always
    rank higher."""
    def rank(qid):
        if qid in quality_ids:
            return quality_ids.index(qid)
        return -1
    return rank
4037 DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
def limit_length(s, length):
    """Truncate `s` to at most `length` characters, ending in an ellipsis;
    None passes through."""
    ELLIPSES = '...'
    if s is None:
        return None
    if len(s) <= length:
        return s
    return s[:length - len(ELLIPSES)] + ELLIPSES
def version_tuple(v):
    """Split a dotted/dashed version string into a tuple of ints."""
    parts = re.split(r'[-.]', v)
    return tuple(map(int, parts))
4054 def is_outdated_version(version, limit, assume_new=True):
4056 return not assume_new
4058 return version_tuple(version) < version_tuple(limit)
4060 return not assume_new
4063 def ytdl_is_updateable():
4064 """ Returns if youtube-dl can be updated with -U """
4065 from zipimport import zipimporter
4067 return isinstance(globals().get('__loader__'), zipimporter) or hasattr(sys, 'frozen')
4070 def args_to_str(args):
4071 # Get a short string representation for a subprocess command
4072 return ' '.join(compat_shlex_quote(a) for a in args)
4075 def error_to_compat_str(err):
4077 # On python 2 error byte string must be decoded with proper
4078 # encoding rather than ascii
4079 if sys.version_info[0] < 3:
4080 err_str = err_str.decode(preferredencoding())
4084 def mimetype2ext(mt):
4090 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
4091 # it's the most popular one
4092 'audio/mpeg': 'mp3',
4097 _, _, res = mt.rpartition('/')
4098 res = res.split(';')[0].strip().lower()
4102 'smptett+xml': 'tt',
4106 'x-mp4-fragmented': 'mp4',
4107 'x-ms-sami': 'sami',
4110 'x-mpegurl': 'm3u8',
4111 'vnd.apple.mpegurl': 'm3u8',
4115 'vnd.ms-sstr+xml': 'ism',
def parse_codecs(codecs_str):
    # Parse an RFC 6381 "codecs" attribute value into a dict with 'vcodec'
    # and 'acodec' entries ('none' when a kind is absent).
    # http://tools.ietf.org/html/rfc6381
    # NOTE(review): the empty-input guard and the return statements are not
    # visible in this excerpt.  The lambda parameter below shadows the
    # builtin `str`.
    splited_codecs = list(filter(None, map(
        lambda str: str.strip(), codecs_str.strip().strip(',').split(','))))
    vcodec, acodec = None, None
    for full_codec in splited_codecs:
        # The first dotted component identifies the codec family.
        codec = full_codec.split('.')[0]
        if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v', 'hvc1', 'av01', 'theora'):
        elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
            write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr)
    if not vcodec and not acodec:
        # Nothing recognised: with exactly two entries assume video,audio order.
        if len(splited_codecs) == 2:
                'vcodec': splited_codecs[0],
                'acodec': splited_codecs[1],
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
def urlhandle_detect_ext(url_handle):
    # Guess a file extension for a urllib response: prefer the filename in
    # Content-Disposition, otherwise map the Content-Type MIME type.
    getheader = url_handle.headers.get

    cd = getheader('Content-Disposition')
    # NOTE(review): the `if cd:`/`if m:` guards and the early return of `e`
    # are not visible in this excerpt.
    m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
    e = determine_ext(m.group('filename'), default_ext=None)

    return mimetype2ext(getheader('Content-Type'))
def encode_data_uri(data, mime_type):
    """Build an RFC 2397 data: URI embedding *data* (bytes) as base64."""
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """

    if age_limit is None:  # No limit set
    # NOTE(review): the `return False` body of the guard above is not
    # visible in this excerpt.
    if content_limit is None:
        return False  # Content available for everyone
    # Blocked only when the viewer's limit is below the content's rating.
    return age_limit < content_limit
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
    # Byte-order marks and the encodings they imply, tried first so the
    # sniff decodes with the right codec.
    # NOTE(review): the `BOMS = [` opener/closer, the `break`, and the loop's
    # else-branch line are not visible in this excerpt.
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            s = first_bytes[len(bom):].decode(enc, 'replace')
        s = first_bytes.decode('utf-8', 'replace')

    # HTML iff the decoded prefix starts with a tag (leading whitespace ok).
    return re.match(r'^\s*<', s)
def determine_protocol(info_dict):
    # Derive the download protocol of an info dict: the explicit 'protocol'
    # field wins; otherwise sniff the URL (rtmp/mms/rtsp prefixes and, via
    # extension, m3u8/f4m), falling back to the raw urlparse scheme.
    protocol = info_dict.get('protocol')
    if protocol is not None:
    # NOTE(review): the return statements of the branches below are not
    # visible in this excerpt.

    url = info_dict['url']
    if url.startswith('rtmp'):
    elif url.startswith('mms'):
    elif url.startswith('rtsp'):

    ext = determine_ext(url)

    return compat_urllib_parse_urlparse(url).scheme
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    rows = [header_row] + data
    # Width of each column = widest cell in that column.
    widths = [max(len(compat_str(cell)) for cell in column) for column in zip(*rows)]
    # Left-align every column but the last to its width plus one space.
    template = ' '.join('%-' + compat_str(width + 1) + 's' for width in widths[:-1]) + '%s'
    return '\n'.join(template % tuple(row) for row in rows)
def _match_one(filter_part, dct):
    # Evaluate one clause of a --match-filter expression against dict `dct`:
    # either a comparison ("key op value", with an optional '?' making a
    # missing key pass) or a unary existence test ("key" / "!key").
    # NOTE(review): the bodies of COMPARISON_OPERATORS/UNARY_OPERATORS dicts,
    # several `if m:`/`try:` wrappers and the raise-opening lines are not
    # visible in this excerpt.
    COMPARISON_OPERATORS = {
    operator_rex = re.compile(r'''(?x)\s*
        \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
            (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
            (?P<quote>["\'])(?P<quotedstrval>(?:\\.|(?!(?P=quote)|\\).)+?)(?P=quote)|
            (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
        op = COMPARISON_OPERATORS[m.group('op')]
        actual_value = dct.get(m.group('key'))
        if (m.group('quotedstrval') is not None
            or m.group('strval') is not None
            # If the original field is a string and matching comparisonvalue is
            # a number we should respect the origin of the original field
            # and process comparison value as a string (see
            # https://github.com/ytdl-org/youtube-dl/issues/11082).
            or actual_value is not None and m.group('intval') is not None
            and isinstance(actual_value, compat_str)):
            # String comparisons only support equality operators.
            if m.group('op') not in ('=', '!='):
                    'Operator %s does not support string values!' % m.group('op'))
            comparison_value = m.group('quotedstrval') or m.group('strval') or m.group('intval')
            quote = m.group('quote')
            if quote is not None:
                # Unescape quotes inside a quoted value.
                comparison_value = comparison_value.replace(r'\%s' % quote, quote)
                comparison_value = int(m.group('intval'))
                # Non-plain integers (e.g. "500k", "1.5MiB") are parsed as sizes.
                comparison_value = parse_filesize(m.group('intval'))
                if comparison_value is None:
                    comparison_value = parse_filesize(m.group('intval') + 'B')
                if comparison_value is None:
                        'Invalid integer value %r in filter part %r' % (
                            m.group('intval'), filter_part))
        if actual_value is None:
            # Missing key: pass only when the clause carried the '?' marker.
            return m.group('none_inclusive')
        return op(actual_value, comparison_value)

        # Unary operators: '' = "is set", '!' = "is not set" (booleans are
        # tested for identity with True/False respectively).
        '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
        '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
    operator_rex = re.compile(r'''(?x)\s*
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
        op = UNARY_OPERATORS[m.group('op')]
        actual_value = dct.get(m.group('key'))
        return op(actual_value)

    raise ValueError('Invalid filter part %r' % filter_part)
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
    # Conjunction of '&'-separated clauses, each evaluated by _match_one().
    # NOTE(review): the enclosing `return all(` line is not visible in this
    # excerpt.
        _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
def match_filter_func(filter_str):
    # Build a --match-filter callback: it returns None when the video passes
    # the filter, or a human-readable skip message otherwise.
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
        # NOTE(review): the passing branch's `return None` and the final
        # `return _match_func` are not visible in this excerpt.
            video_title = info_dict.get('title', info_dict.get('id', 'video'))
            return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
def parse_dfxp_time_expr(time_expr):
    # Parse a TTML/DFXP time expression into seconds (float): either a plain
    # offset ("12.3" / "12.3s") or a clock time ("HH:MM:SS.mmm"/"HH:MM:SS:fff").
    # NOTE(review): the empty-input guard and the `if mobj:` lines are not
    # visible in this excerpt.

    mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
        return float(mobj.group('time_offset'))

    mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
        # A trailing ":fff" fraction is treated like ".fff".
        return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
def srt_subtitles_timecode(seconds):
    """Format a timestamp in seconds as an SRT timecode (HH:MM:SS,mmm)."""
    hrs = seconds / 3600
    mins = (seconds % 3600) / 60
    secs = seconds % 60
    msecs = (seconds % 1) * 1000
    # %d truncates the float components, so no explicit int() needed.
    return '%02d:%02d:%02d,%03d' % (hrs, mins, secs, msecs)
def dfxp2srt(dfxp_data):
    '''
    Convert DFXP/TTML subtitles to SRT.

    @param dfxp_data A bytes-like object containing DFXP data
    @returns A unicode object containing converted SRT data
    '''
    # Old TTAF namespaces are rewritten to the modern TTML ones before
    # parsing so a single set of XPath expressions works for all inputs.
    # NOTE(review): several closing brackets, guard lines (`if ...:`/`else:`)
    # and variable initialisations of this function are not visible in this
    # excerpt.
    LEGACY_NAMESPACES = (
        (b'http://www.w3.org/ns/ttml', [
            b'http://www.w3.org/2004/11/ttaf1',
            b'http://www.w3.org/2006/04/ttaf1',
            b'http://www.w3.org/2006/10/ttaf1',
        (b'http://www.w3.org/ns/ttml#styling', [
            b'http://www.w3.org/ns/ttml#style',

    # Subset of tts:* styling attributes that are translated to SRT markup.
    SUPPORTED_STYLING = [

    _x = functools.partial(xpath_with_ns, ns_map={
        'xml': 'http://www.w3.org/XML/1998/namespace',
        'ttml': 'http://www.w3.org/ns/ttml',
        'tts': 'http://www.w3.org/ns/ttml#styling',

    # SAX-style target that flattens one <p> element into SRT-flavoured text
    # (<font>/<b>/<i>/<u> markup), tracking open tags and inherited styles.
    class TTMLPElementParser(object):
        _unclosed_elements = []
        _applied_styles = []

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), 'br'):
                unclosed_elements = []
                element_style_id = attrib.get('style')
                    style.update(default_style)
                if element_style_id:
                    style.update(styles.get(element_style_id, {}))
                # Inline tts:* attributes override referenced styles.
                for prop in SUPPORTED_STYLING:
                    prop_val = attrib.get(_x('tts:' + prop))
                        style[prop] = prop_val
                    for k, v in sorted(style.items()):
                        # Skip properties already applied by an ancestor.
                        if self._applied_styles and self._applied_styles[-1].get(k) == v:
                            font += ' color="%s"' % v
                        elif k == 'fontSize':
                            font += ' size="%s"' % v
                        elif k == 'fontFamily':
                            font += ' face="%s"' % v
                        elif k == 'fontWeight' and v == 'bold':
                            unclosed_elements.append('b')
                        elif k == 'fontStyle' and v == 'italic':
                            unclosed_elements.append('i')
                        elif k == 'textDecoration' and v == 'underline':
                            unclosed_elements.append('u')
                        self._out += '<font' + font + '>'
                        unclosed_elements.append('font')
                    if self._applied_styles:
                        applied_style.update(self._applied_styles[-1])
                    applied_style.update(style)
                    self._applied_styles.append(applied_style)
                self._unclosed_elements.append(unclosed_elements)

            # end(): close any tags opened for this element, in reverse order.
            if tag not in (_x('ttml:br'), 'br'):
                unclosed_elements = self._unclosed_elements.pop()
                for element in reversed(unclosed_elements):
                    self._out += '</%s>' % element
                if unclosed_elements and self._applied_styles:
                    self._applied_styles.pop()

        def data(self, data):
            # close() — NOTE(review): its def line is not visible here.
            return self._out.strip()

    # Serialise one <p> node and run it through TTMLPElementParser.
    def parse_node(node):
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    # Normalise legacy namespaces (byte replacement, pre-parse).
    for k, v in LEGACY_NAMESPACES:
            dfxp_data = dfxp_data.replace(ns, k)

    dfxp = compat_etree_fromstring(dfxp_data)
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
        raise ValueError('Invalid dfxp/TTML subtitle')

    # Collect named styles, resolving single-level style inheritance.
    for style in dfxp.findall(_x('.//ttml:style')):
        style_id = style.get('id') or style.get(_x('xml:id'))
        parent_style_id = style.get('style')
            if parent_style_id not in styles:
            styles[style_id] = styles[parent_style_id].copy()
        for prop in SUPPORTED_STYLING:
            prop_val = style.get(_x('tts:' + prop))
                styles.setdefault(style_id, {})[prop] = prop_val

    # A style on <body> or <div> becomes the document default style.
    for p in ('body', 'div'):
        ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
        style = styles.get(ele.get('style'))
            default_style.update(style)

    # Emit numbered SRT cues; end falls back to begin + dur when absent.
    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
def cli_option(params, command_option, param):
    # Map a params value to CLI args, e.g.
    # cli_option({'proxy': 'x'}, '--proxy', 'proxy') -> ['--proxy', 'x'].
    param = params.get(param)
    # NOTE(review): the truthiness guard around this coercion is not visible
    # in this excerpt.
        param = compat_str(param)
    return [command_option, param] if param is not None else []
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    # Map a boolean params value to CLI args: either two items
    # [option, value] or, with a separator, one "option<sep>value" item.
    param = params.get(param)
    # NOTE(review): the None-guard before this assert and the `if separator:`
    # line are not visible in this excerpt.
    assert isinstance(param, bool)
        return [command_option + separator + (true_value if param else false_value)]
    return [command_option, true_value if param else false_value]
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Emit a flag-style CLI option when the param equals the expected value."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
def cli_configuration_args(params, param, default=[]):
    # Fetch a list-valued option (e.g. extra postprocessor args) from params.
    # NOTE(review): `default=[]` is a shared mutable default; it appears to
    # be returned as-is, so callers must not mutate the result.
    ex_args = params.get(param)
    # NOTE(review): the `return default` guard and the final return are not
    # visible in this excerpt.
    assert isinstance(ex_args, list)
class ISO639Utils(object):
    # Two-way helper between ISO 639-1 (two-letter) and ISO 639-2/T
    # (three-letter) language codes.
    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    # NOTE(review): the `_lang_map = {` opener/closer and most entries, plus
    # the @classmethod decorators, are not visible in this excerpt.
        'iw': 'heb',  # Replaced by he in 1989 revision
        'in': 'ind',  # Replaced by id in 1989 revision
        'ji': 'yid',  # Replaced by yi in 1989 revision

    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        # Only the first two characters participate in the lookup.
        return cls._lang_map.get(code[:2])

    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        # Reverse lookup over the forward map; linear scan is fine for a
        # fixed-size table.
        for short_name, long_name in cls._lang_map.items():
            if long_name == code:
4733 class ISO3166Utils(object):
4734 # From http://data.okfn.org/data/core/country-list
4736 'AF': 'Afghanistan',
4737 'AX': 'Åland Islands',
4740 'AS': 'American Samoa',
4745 'AG': 'Antigua and Barbuda',
4762 'BO': 'Bolivia, Plurinational State of',
4763 'BQ': 'Bonaire, Sint Eustatius and Saba',
4764 'BA': 'Bosnia and Herzegovina',
4766 'BV': 'Bouvet Island',
4768 'IO': 'British Indian Ocean Territory',
4769 'BN': 'Brunei Darussalam',
4771 'BF': 'Burkina Faso',
4777 'KY': 'Cayman Islands',
4778 'CF': 'Central African Republic',
4782 'CX': 'Christmas Island',
4783 'CC': 'Cocos (Keeling) Islands',
4787 'CD': 'Congo, the Democratic Republic of the',
4788 'CK': 'Cook Islands',
4790 'CI': 'Côte d\'Ivoire',
4795 'CZ': 'Czech Republic',
4799 'DO': 'Dominican Republic',
4802 'SV': 'El Salvador',
4803 'GQ': 'Equatorial Guinea',
4807 'FK': 'Falkland Islands (Malvinas)',
4808 'FO': 'Faroe Islands',
4812 'GF': 'French Guiana',
4813 'PF': 'French Polynesia',
4814 'TF': 'French Southern Territories',
4829 'GW': 'Guinea-Bissau',
4832 'HM': 'Heard Island and McDonald Islands',
4833 'VA': 'Holy See (Vatican City State)',
4840 'IR': 'Iran, Islamic Republic of',
4843 'IM': 'Isle of Man',
4853 'KP': 'Korea, Democratic People\'s Republic of',
4854 'KR': 'Korea, Republic of',
4857 'LA': 'Lao People\'s Democratic Republic',
4863 'LI': 'Liechtenstein',
4867 'MK': 'Macedonia, the Former Yugoslav Republic of',
4874 'MH': 'Marshall Islands',
4880 'FM': 'Micronesia, Federated States of',
4881 'MD': 'Moldova, Republic of',
4892 'NL': 'Netherlands',
4893 'NC': 'New Caledonia',
4894 'NZ': 'New Zealand',
4899 'NF': 'Norfolk Island',
4900 'MP': 'Northern Mariana Islands',
4905 'PS': 'Palestine, State of',
4907 'PG': 'Papua New Guinea',
4910 'PH': 'Philippines',
4914 'PR': 'Puerto Rico',
4918 'RU': 'Russian Federation',
4920 'BL': 'Saint Barthélemy',
4921 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
4922 'KN': 'Saint Kitts and Nevis',
4923 'LC': 'Saint Lucia',
4924 'MF': 'Saint Martin (French part)',
4925 'PM': 'Saint Pierre and Miquelon',
4926 'VC': 'Saint Vincent and the Grenadines',
4929 'ST': 'Sao Tome and Principe',
4930 'SA': 'Saudi Arabia',
4934 'SL': 'Sierra Leone',
4936 'SX': 'Sint Maarten (Dutch part)',
4939 'SB': 'Solomon Islands',
4941 'ZA': 'South Africa',
4942 'GS': 'South Georgia and the South Sandwich Islands',
4943 'SS': 'South Sudan',
4948 'SJ': 'Svalbard and Jan Mayen',
4951 'CH': 'Switzerland',
4952 'SY': 'Syrian Arab Republic',
4953 'TW': 'Taiwan, Province of China',
4955 'TZ': 'Tanzania, United Republic of',
4957 'TL': 'Timor-Leste',
4961 'TT': 'Trinidad and Tobago',
4964 'TM': 'Turkmenistan',
4965 'TC': 'Turks and Caicos Islands',
4969 'AE': 'United Arab Emirates',
4970 'GB': 'United Kingdom',
4971 'US': 'United States',
4972 'UM': 'United States Minor Outlying Islands',
4976 'VE': 'Venezuela, Bolivarian Republic of',
4978 'VG': 'Virgin Islands, British',
4979 'VI': 'Virgin Islands, U.S.',
4980 'WF': 'Wallis and Futuna',
4981 'EH': 'Western Sahara',
    def short2full(cls, code):
        """Convert an ISO 3166-2 country code to the corresponding full name"""
        # Case-insensitive: codes are stored upper-case in _country_map.
        # NOTE(review): the @classmethod decorator is not visible in this
        # excerpt.
        return cls._country_map.get(code.upper())
4993 class GeoUtils(object):
4994 # Major IPv4 address blocks per country
4996 'AD': '46.172.224.0/19',
4997 'AE': '94.200.0.0/13',
4998 'AF': '149.54.0.0/17',
4999 'AG': '209.59.64.0/18',
5000 'AI': '204.14.248.0/21',
5001 'AL': '46.99.0.0/16',
5002 'AM': '46.70.0.0/15',
5003 'AO': '105.168.0.0/13',
5004 'AP': '182.50.184.0/21',
5005 'AQ': '23.154.160.0/24',
5006 'AR': '181.0.0.0/12',
5007 'AS': '202.70.112.0/20',
5008 'AT': '77.116.0.0/14',
5009 'AU': '1.128.0.0/11',
5010 'AW': '181.41.0.0/18',
5011 'AX': '185.217.4.0/22',
5012 'AZ': '5.197.0.0/16',
5013 'BA': '31.176.128.0/17',
5014 'BB': '65.48.128.0/17',
5015 'BD': '114.130.0.0/16',
5017 'BF': '102.178.0.0/15',
5018 'BG': '95.42.0.0/15',
5019 'BH': '37.131.0.0/17',
5020 'BI': '154.117.192.0/18',
5021 'BJ': '137.255.0.0/16',
5022 'BL': '185.212.72.0/23',
5023 'BM': '196.12.64.0/18',
5024 'BN': '156.31.0.0/16',
5025 'BO': '161.56.0.0/16',
5026 'BQ': '161.0.80.0/20',
5027 'BR': '191.128.0.0/12',
5028 'BS': '24.51.64.0/18',
5029 'BT': '119.2.96.0/19',
5030 'BW': '168.167.0.0/16',
5031 'BY': '178.120.0.0/13',
5032 'BZ': '179.42.192.0/18',
5033 'CA': '99.224.0.0/11',
5034 'CD': '41.243.0.0/16',
5035 'CF': '197.242.176.0/21',
5036 'CG': '160.113.0.0/16',
5037 'CH': '85.0.0.0/13',
5038 'CI': '102.136.0.0/14',
5039 'CK': '202.65.32.0/19',
5040 'CL': '152.172.0.0/14',
5041 'CM': '102.244.0.0/14',
5042 'CN': '36.128.0.0/10',
5043 'CO': '181.240.0.0/12',
5044 'CR': '201.192.0.0/12',
5045 'CU': '152.206.0.0/15',
5046 'CV': '165.90.96.0/19',
5047 'CW': '190.88.128.0/17',
5048 'CY': '31.153.0.0/16',
5049 'CZ': '88.100.0.0/14',
5051 'DJ': '197.241.0.0/17',
5052 'DK': '87.48.0.0/12',
5053 'DM': '192.243.48.0/20',
5054 'DO': '152.166.0.0/15',
5055 'DZ': '41.96.0.0/12',
5056 'EC': '186.68.0.0/15',
5057 'EE': '90.190.0.0/15',
5058 'EG': '156.160.0.0/11',
5059 'ER': '196.200.96.0/20',
5060 'ES': '88.0.0.0/11',
5061 'ET': '196.188.0.0/14',
5062 'EU': '2.16.0.0/13',
5063 'FI': '91.152.0.0/13',
5064 'FJ': '144.120.0.0/16',
5065 'FK': '80.73.208.0/21',
5066 'FM': '119.252.112.0/20',
5067 'FO': '88.85.32.0/19',
5069 'GA': '41.158.0.0/15',
5071 'GD': '74.122.88.0/21',
5072 'GE': '31.146.0.0/16',
5073 'GF': '161.22.64.0/18',
5074 'GG': '62.68.160.0/19',
5075 'GH': '154.160.0.0/12',
5076 'GI': '95.164.0.0/16',
5077 'GL': '88.83.0.0/19',
5078 'GM': '160.182.0.0/15',
5079 'GN': '197.149.192.0/18',
5080 'GP': '104.250.0.0/19',
5081 'GQ': '105.235.224.0/20',
5082 'GR': '94.64.0.0/13',
5083 'GT': '168.234.0.0/16',
5084 'GU': '168.123.0.0/16',
5085 'GW': '197.214.80.0/20',
5086 'GY': '181.41.64.0/18',
5087 'HK': '113.252.0.0/14',
5088 'HN': '181.210.0.0/16',
5089 'HR': '93.136.0.0/13',
5090 'HT': '148.102.128.0/17',
5091 'HU': '84.0.0.0/14',
5092 'ID': '39.192.0.0/10',
5093 'IE': '87.32.0.0/12',
5094 'IL': '79.176.0.0/13',
5095 'IM': '5.62.80.0/20',
5096 'IN': '117.192.0.0/10',
5097 'IO': '203.83.48.0/21',
5098 'IQ': '37.236.0.0/14',
5099 'IR': '2.176.0.0/12',
5100 'IS': '82.221.0.0/16',
5101 'IT': '79.0.0.0/10',
5102 'JE': '87.244.64.0/18',
5103 'JM': '72.27.0.0/17',
5104 'JO': '176.29.0.0/16',
5105 'JP': '133.0.0.0/8',
5106 'KE': '105.48.0.0/12',
5107 'KG': '158.181.128.0/17',
5108 'KH': '36.37.128.0/17',
5109 'KI': '103.25.140.0/22',
5110 'KM': '197.255.224.0/20',
5111 'KN': '198.167.192.0/19',
5112 'KP': '175.45.176.0/22',
5113 'KR': '175.192.0.0/10',
5114 'KW': '37.36.0.0/14',
5115 'KY': '64.96.0.0/15',
5116 'KZ': '2.72.0.0/13',
5117 'LA': '115.84.64.0/18',
5118 'LB': '178.135.0.0/16',
5119 'LC': '24.92.144.0/20',
5120 'LI': '82.117.0.0/19',
5121 'LK': '112.134.0.0/15',
5122 'LR': '102.183.0.0/16',
5123 'LS': '129.232.0.0/17',
5124 'LT': '78.56.0.0/13',
5125 'LU': '188.42.0.0/16',
5126 'LV': '46.109.0.0/16',
5127 'LY': '41.252.0.0/14',
5128 'MA': '105.128.0.0/11',
5129 'MC': '88.209.64.0/18',
5130 'MD': '37.246.0.0/16',
5131 'ME': '178.175.0.0/17',
5132 'MF': '74.112.232.0/21',
5133 'MG': '154.126.0.0/17',
5134 'MH': '117.103.88.0/21',
5135 'MK': '77.28.0.0/15',
5136 'ML': '154.118.128.0/18',
5137 'MM': '37.111.0.0/17',
5138 'MN': '49.0.128.0/17',
5139 'MO': '60.246.0.0/16',
5140 'MP': '202.88.64.0/20',
5141 'MQ': '109.203.224.0/19',
5142 'MR': '41.188.64.0/18',
5143 'MS': '208.90.112.0/22',
5144 'MT': '46.11.0.0/16',
5145 'MU': '105.16.0.0/12',
5146 'MV': '27.114.128.0/18',
5147 'MW': '102.70.0.0/15',
5148 'MX': '187.192.0.0/11',
5149 'MY': '175.136.0.0/13',
5150 'MZ': '197.218.0.0/15',
5151 'NA': '41.182.0.0/16',
5152 'NC': '101.101.0.0/18',
5153 'NE': '197.214.0.0/18',
5154 'NF': '203.17.240.0/22',
5155 'NG': '105.112.0.0/12',
5156 'NI': '186.76.0.0/15',
5157 'NL': '145.96.0.0/11',
5158 'NO': '84.208.0.0/13',
5159 'NP': '36.252.0.0/15',
5160 'NR': '203.98.224.0/19',
5161 'NU': '49.156.48.0/22',
5162 'NZ': '49.224.0.0/14',
5163 'OM': '5.36.0.0/15',
5164 'PA': '186.72.0.0/15',
5165 'PE': '186.160.0.0/14',
5166 'PF': '123.50.64.0/18',
5167 'PG': '124.240.192.0/19',
5168 'PH': '49.144.0.0/13',
5169 'PK': '39.32.0.0/11',
5170 'PL': '83.0.0.0/11',
5171 'PM': '70.36.0.0/20',
5172 'PR': '66.50.0.0/16',
5173 'PS': '188.161.0.0/16',
5174 'PT': '85.240.0.0/13',
5175 'PW': '202.124.224.0/20',
5176 'PY': '181.120.0.0/14',
5177 'QA': '37.210.0.0/15',
5178 'RE': '102.35.0.0/16',
5179 'RO': '79.112.0.0/13',
5180 'RS': '93.86.0.0/15',
5181 'RU': '5.136.0.0/13',
5182 'RW': '41.186.0.0/16',
5183 'SA': '188.48.0.0/13',
5184 'SB': '202.1.160.0/19',
5185 'SC': '154.192.0.0/11',
5186 'SD': '102.120.0.0/13',
5187 'SE': '78.64.0.0/12',
5188 'SG': '8.128.0.0/10',
5189 'SI': '188.196.0.0/14',
5190 'SK': '78.98.0.0/15',
5191 'SL': '102.143.0.0/17',
5192 'SM': '89.186.32.0/19',
5193 'SN': '41.82.0.0/15',
5194 'SO': '154.115.192.0/18',
5195 'SR': '186.179.128.0/17',
5196 'SS': '105.235.208.0/21',
5197 'ST': '197.159.160.0/19',
5198 'SV': '168.243.0.0/16',
5199 'SX': '190.102.0.0/20',
5201 'SZ': '41.84.224.0/19',
5202 'TC': '65.255.48.0/20',
5203 'TD': '154.68.128.0/19',
5204 'TG': '196.168.0.0/14',
5205 'TH': '171.96.0.0/13',
5206 'TJ': '85.9.128.0/18',
5207 'TK': '27.96.24.0/21',
5208 'TL': '180.189.160.0/20',
5209 'TM': '95.85.96.0/19',
5210 'TN': '197.0.0.0/11',
5211 'TO': '175.176.144.0/21',
5212 'TR': '78.160.0.0/11',
5213 'TT': '186.44.0.0/15',
5214 'TV': '202.2.96.0/19',
5215 'TW': '120.96.0.0/11',
5216 'TZ': '156.156.0.0/14',
5217 'UA': '37.52.0.0/14',
5218 'UG': '102.80.0.0/13',
5220 'UY': '167.56.0.0/13',
5221 'UZ': '84.54.64.0/18',
5222 'VA': '212.77.0.0/19',
5223 'VC': '207.191.240.0/21',
5224 'VE': '186.88.0.0/13',
5225 'VG': '66.81.192.0/20',
5226 'VI': '146.226.0.0/16',
5227 'VN': '14.160.0.0/11',
5228 'VU': '202.80.32.0/20',
5229 'WF': '117.20.32.0/21',
5230 'WS': '202.4.32.0/19',
5231 'YE': '134.35.0.0/16',
5232 'YT': '41.242.116.0/22',
5233 'ZA': '41.0.0.0/11',
5234 'ZM': '102.144.0.0/13',
5235 'ZW': '102.177.192.0/18',
    def random_ipv4(cls, code_or_block):
        # Return a random IPv4 address (string) inside either the major block
        # of a two-letter country code or an explicit "a.b.c.d/prefix" CIDR.
        # NOTE(review): the @classmethod decorator, the unknown-code guard and
        # the `else:` branch line are not visible in this excerpt.
        if len(code_or_block) == 2:
            block = cls._country_ip_map.get(code_or_block.upper())
            block = code_or_block
        addr, preflen = block.split('/')
        # Randomise the host part uniformly across the block.
        addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
        addr_max = addr_min | (0xffffffff >> int(preflen))
        return compat_str(socket.inet_ntoa(
            compat_struct_pack('!L', random.randint(addr_min, addr_max))))
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    # ProxyHandler variant that honours a per-request proxy override passed
    # in the 'Ytdl-request-proxy' header.

    def __init__(self, proxies=None):
        # Set default handlers
        for type in ('http', 'https'):
            setattr(self, '%s_open' % type,
                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
                        meth(r, proxy, type))
        compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
        # NOTE(review): the reassignment of `proxy` to the per-request value
        # is not visible in this excerpt.
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # youtube-dl's http/https handlers wrap the socket with SOCKS
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
5278 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
5279 # released into Public Domain
5280 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
def long_to_bytes(n, blocksize=0):
    """long_to_bytes(n:long, blocksize:int) : string
    Convert a long integer to a byte string.

    If optional blocksize is given and greater than zero, pad the front of the
    byte string with binary zeros so that the length is a multiple of
    blocksize.
    """
    # after much testing, this algorithm was deemed to be the fastest
    # NOTE(review): the accumulator initialisation and the 32-bit chunking
    # loop header (`while n > 0:` etc.) are not visible in this excerpt.
        s = compat_struct_pack('>I', n & 0xffffffff) + s
    # strip off leading zeros
    for i in range(len(s)):
        if s[i] != b'\000'[0]:
    # only happens when n == 0
    # add back some pad bytes. this could be done more efficiently w.r.t. the
    # de-padding being done above, but sigh...
    if blocksize > 0 and len(s) % blocksize:
        s = (blocksize - len(s) % blocksize) * b'\000' + s
def bytes_to_long(s):
    """bytes_to_long(string) : long
    Convert a byte string to a long integer.

    This is (essentially) the inverse of long_to_bytes().
    """
    # NOTE(review): the accumulator initialisation, the length computation
    # and the padding guard are not visible in this excerpt.
    # Front-pad to a multiple of 4 so 32-bit chunks can be consumed.
        extra = (4 - length % 4)
        s = b'\000' * extra + s
        length = length + extra
    for i in range(0, length, 4):
        acc = (acc << 32) + compat_struct_unpack('>I', s[i:i + 4])[0]
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''
    # The payload is interpreted little-endian (reversed before hexlify),
    # matching OHDave's JavaScript implementation.
    little_endian = data[::-1]
    payload = int(binascii.hexlify(little_endian), 16)
    return '%x' % pow(payload, exponent, modulus)
def pkcs1pad(data, length):
    """
    Padding input data with PKCS#1 scheme (EME-PKCS1-v1_5, RFC 8017 §7.2.1)

    @param {int[]} data input data (byte values)
    @param {int} length target length
    @returns {int[]} padded data

    Raises ValueError when the data does not fit (the scheme needs 11 bytes
    of overhead: 0x00 0x02, at least 8 padding bytes, and a 0x00 delimiter).
    """
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    # The padding string PS must consist of NONZERO octets (1..255): the
    # first zero byte after the 0x00 0x02 prefix marks the end of PS, so a
    # zero inside PS would truncate the message on decryption.
    pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]
    return [0, 2] + pseudo_random + [0] + data
def encode_base_n(num, n, table=None):
    # Encode a non-negative integer in base `n`, using `table` as the digit
    # alphabet (defaults to the first n characters of FULL_TABLE).
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    # NOTE(review): the `if not table:` guard, the bounds-check condition,
    # the zero shortcut and the division loop are not visible in this
    # excerpt.
        table = FULL_TABLE[:n]

        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

        ret = table[num % n] + ret
def decode_packed_codes(code):
    # Decode the common JavaScript "p,a,c,k,e,d" packer output: rebuild the
    # symbol table (base-n tokens -> words) and substitute it back into the
    # obfuscated payload.
    mobj = re.search(PACKED_CODES_RE, code)
    obfucasted_code, base, count, symbols = mobj.groups()
    # NOTE(review): the int() coercions of base/count and the countdown loop
    # header are not visible in this excerpt.
    symbols = symbols.split('|')

        base_n_count = encode_base_n(count, base)
        # Empty dictionary entries map a token to itself.
        symbol_table[base_n_count] = symbols[count] or base_n_count

    # Replace every word-like token through the symbol table.
        r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
def caesar(s, alphabet, shift):
    # Caesar-shift every character of `s` that occurs in `alphabet` by
    # `shift` positions (with wrap-around); other characters pass through.
    # NOTE(review): the `l = len(alphabet)` line, the surrounding
    # ''.join(...) wrapper and the `def rot47(s):` line are not visible in
    # this excerpt.
        alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c

    # rot47: caesar shift of 47 across the printable ASCII range.
    return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
def parse_m3u8_attributes(attrib):
    # Parse an M3U8 attribute list (KEY=value pairs, values optionally
    # double-quoted) into a dict.
    # NOTE(review): the dict initialisation, the quote-stripping assignment
    # and the return are not visible in this excerpt.
    for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
        if val.startswith('"'):
def urshift(val, n):
    """Unsigned 32-bit right shift, like JavaScript's ``>>>`` operator."""
    if val < 0:
        # Reinterpret the negative value as its unsigned 32-bit equivalent.
        val += 0x100000000
    return val >> n
5423 # Based on png2str() written by @gdkchan and improved by @yokrysty
5424 # Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
def decode_png(png_data):
    # Decode a PNG image into (width, height, pixels) where pixels is a list
    # of rows of raw byte values, applying the per-scanline filters.
    # Reference: https://www.w3.org/TR/PNG/
    # NOTE(review): the chunk-reading loop header, several initialisations
    # (chunks, idat, stride, pixels, current_row) and some guard/else lines
    # are not visible in this excerpt.
    header = png_data[8:]

    # Validate the PNG signature and that the first chunk is IHDR.
    if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
        raise IOError('Not a valid PNG file.')

    int_map = {1: '>B', 2: '>H', 4: '>I'}
    unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]

        # Each chunk: 4-byte length, 4-byte type, payload, 4-byte CRC.
        length = unpack_integer(header[:4])
        chunk_type = header[:4]
        chunk_data = header[:length]
        header = header[length:]
        header = header[4:]  # Skip CRC

    ihdr = chunks[0]['data']
    width = unpack_integer(ihdr[:4])
    height = unpack_integer(ihdr[4:8])

    # Concatenate all IDAT payloads before inflating.
    for chunk in chunks:
        if chunk['type'] == b'IDAT':
            idat += chunk['data']

        raise IOError('Unable to read PNG data.')

    decompressed_data = bytearray(zlib.decompress(idat))

    def _get_pixel(idx):

    for y in range(height):
        # Each scanline is prefixed by one filter-type byte.
        basePos = y * (1 + stride)
        filter_type = decompressed_data[basePos]

        pixels.append(current_row)
        for x in range(stride):
            color = decompressed_data[1 + basePos + x]
            basex = y * stride + x
            left = _get_pixel(basex - 3)
            up = _get_pixel(basex - stride)

            # Reverse the scanline filter (PNG spec section 9).
            if filter_type == 1:  # Sub
                color = (color + left) & 0xff
            elif filter_type == 2:  # Up
                color = (color + up) & 0xff
            elif filter_type == 3:  # Average
                color = (color + ((left + up) >> 1)) & 0xff
            elif filter_type == 4:  # Paeth
                c = _get_pixel(basex - stride - 3)
                if pa <= pb and pa <= pc:
                    color = (color + a) & 0xff
                    color = (color + b) & 0xff
                    color = (color + c) & 0xff

            current_row.append(color)

    return width, height, pixels
def write_xattr(path, key, value):
    # Set extended attribute `key` = `value` (bytes) on `path`, trying in
    # order: the pyxattr/xattr Python modules, NTFS Alternate Data Streams on
    # Windows, then the setfattr/xattr command line tools.
    # This mess below finds the best xattr tool for the job
    # NOTE(review): the `try: import xattr` wrapper and several try/else
    # lines of this function are not visible in this excerpt.
        # try the pyxattr module...
        if hasattr(xattr, 'set'):  # pyxattr
            # Unicode arguments are not supported in python-pyxattr until
            # version 0.5.0
            # See https://github.com/ytdl-org/youtube-dl/issues/5498
            pyxattr_required_version = '0.5.0'
            if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
                # TODO: fallback to CLI tools
                raise XAttrUnavailableError(
                    'python-pyxattr is detected but is too old. '
                    'youtube-dl requires %s or above while your version is %s. '
                    'Falling back to other xattr implementations' % (
                        pyxattr_required_version, xattr.__version__))

            setxattr = xattr.set
            # xattr module (not pyxattr) exposes setxattr instead.
            setxattr = xattr.setxattr

            setxattr(path, key, value)
        except EnvironmentError as e:
            raise XAttrMetadataError(e.errno, e.strerror)

        if compat_os_name == 'nt':
            # Write xattrs to NTFS Alternate Data Streams:
            # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
            assert ':' not in key
            assert os.path.exists(path)

            ads_fn = path + ':' + key
            with open(ads_fn, 'wb') as f:
            except EnvironmentError as e:
                raise XAttrMetadataError(e.errno, e.strerror)

            # Fall back to command line tools.
            user_has_setfattr = check_executable('setfattr', ['--version'])
            user_has_xattr = check_executable('xattr', ['-h'])

            if user_has_setfattr or user_has_xattr:
                value = value.decode('utf-8')
                if user_has_setfattr:
                    executable = 'setfattr'
                    opts = ['-n', key, '-v', value]
                elif user_has_xattr:
                    executable = 'xattr'
                    opts = ['-w', key, value]

                cmd = ([encodeFilename(executable, True)]
                       + [encodeArgument(o) for o in opts]
                       + [encodeFilename(path, True)])

                    p = subprocess.Popen(
                        cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
                except EnvironmentError as e:
                    raise XAttrMetadataError(e.errno, e.strerror)
                stdout, stderr = p.communicate()
                stderr = stderr.decode('utf-8', 'replace')
                if p.returncode != 0:
                    raise XAttrMetadataError(p.returncode, stderr)

                # On Unix, and can't find pyxattr, setfattr, or xattr.
                if sys.platform.startswith('linux'):
                    raise XAttrUnavailableError(
                        "Couldn't find a tool to set the xattrs. "
                        "Install either the python 'pyxattr' or 'xattr' "
                        "modules, or the GNU 'attr' package "
                        "(which contains the 'setfattr' tool).")
                raise XAttrUnavailableError(
                    "Couldn't find a tool to set the xattrs. "
                    "Install either the python 'xattr' module, "
                    "or the 'xattr' binary.")
def random_birthday(year_field, month_field, day_field):
    # Generate a plausible random date of birth (1950-1995) and return it as
    # a dict keyed by the given form-field names; values are strings.
    start_date = datetime.date(1950, 1, 1)
    end_date = datetime.date(1995, 12, 31)
    offset = random.randint(0, (end_date - start_date).days)
    random_date = start_date + datetime.timedelta(offset)
    # NOTE(review): the `return {` opener and closing brace of this dict are
    # not visible in this excerpt.
        year_field: str(random_date.year),
        month_field: str(random_date.month),
        day_field: str(random_date.day),