4 from __future__ import unicode_literals
33 import xml.etree.ElementTree
40 compat_etree_fromstring,
42 compat_html_entities_html5,
48 compat_socket_create_connection,
54 compat_urllib_parse_urlencode,
55 compat_urllib_parse_urlparse,
56 compat_urllib_parse_unquote_plus,
57 compat_urllib_request,
def register_socks_protocols():
    """Teach urlparse that SOCKS schemes carry a netloc.

    In Python < 2.6.5, urlsplit() suffers from bug
    https://bugs.python.org/issue7904: URLs whose protocol is not listed in
    urlparse.uses_netloc are not handled correctly.
    """
    known_schemes = compat_urlparse.uses_netloc
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in known_schemes:
            known_schemes.append(scheme)
# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))

# NOTE(review): the following entries belong to the std_headers dict of
# default HTTP request headers; its opening `std_headers = {` line and
# closing brace are not visible in this view.
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/47.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
# Month names used when parsing free-form date strings.
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

# NOTE(review): entries of a month-name table keyed by language code; the
# dict's opening line and the 'fr' key line are not visible in this view.
    'en': ENGLISH_MONTH_NAMES,
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],

# NOTE(review): members of the KNOWN_EXTENSIONS tuple of recognised media
# file extensions; its opening line and several entries are not visible.
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'f4f', 'f4m', 'm3u8', 'smil')
# needed for sanitizing filenames in restricted mode
# Maps each accented character to an ASCII transliteration (single letters
# come from the chained strings, multi-letter replacements from the lists).
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy')))
# NOTE(review): strptime patterns belonging to the DATE_FORMATS tuple; its
# opening line and several sibling patterns are not visible in this view.
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%b %d %Y at %H:%M:%S',

# Day-first (European) and month-first (US) variants of the format list;
# the extension items are not visible in this view.
DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    pref = locale.getpreferredencoding()
    # NOTE(review): the sanity-check/fallback and the final return of the
    # original implementation are not visible in this view.
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """
    # NOTE(review): several lines of this function (the else for the
    # Python 3 path, tempfile argument dict construction, error cleanup)
    # are not visible in this view; comments annotate visible code only.

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()
        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object
        path_basename = lambda f: os.path.basename(fn).decode(encoding)
        # the same for os.path.dirname
        path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
        path_basename = os.path.basename
        path_dirname = os.path.dirname

        # Temp file is created next to the target so the final os.rename
        # stays on one filesystem (atomic where the OS supports it).
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):

    tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))

    if sys.platform == 'win32':
        # Need to remove existing file on Windows, else os.rename raises
        # WindowsError or FileExistsError.
        os.rename(tf.name, fn)
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val] """
        assert re.match(r'^[a-zA-Z_-]+$', key)
        expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
        return node.find(expr)
    # Python 2.6 fallback: ElementTree cannot evaluate attribute predicates,
    # so matching elements are scanned manually.
    # NOTE(review): the introducing `else:` and the loop's continue/return
    # lines are not visible in this view.
    def find_xpath_attr(node, xpath, key, val=None):
        for f in node.findall(compat_xpath(xpath)):
            if key not in f.attrib:
            if val is None or f.attrib.get(key) == val:
# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter

def xpath_with_ns(path, ns_map):
    # Expand 'ns:tag' steps in *path* into '{uri}tag' form using ns_map.
    components = [c.split(':') for c in path.split('/')]
    # NOTE(review): the loop header and the branch for components without a
    # namespace prefix are not visible in this view.
            replaced.append(c[0])
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    # Find the first element matching *xpath*; honour *default*/*fatal* on
    # failure. NOTE(review): interior lines (the multi-xpath branch, the
    # None check, the return statements) are not visible in this view.
    def _find_xpath(xpath):
        return node.find(compat_xpath(xpath))

    if isinstance(xpath, (str, compat_str)):
        n = _find_xpath(xpath)

        if default is not NO_DEFAULT:
        name = xpath if name is None else name
        raise ExtractorError('Could not find XML element %s' % name)
def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    # Like xpath_element but returns the element's text content.
    # NOTE(review): interior lines (text access, returns) are not visible.
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        if default is not NO_DEFAULT:
        name = xpath if name is None else name
        raise ExtractorError('Could not find XML element\'s text %s' % name)
def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    # Return the value of attribute *key* on the element matching *xpath*.
    # NOTE(review): interior lines (the None check, returns) are not visible.
    n = find_xpath_attr(node, xpath, key)
    if default is not NO_DEFAULT:
    name = '%s[@%s]' % (xpath, key) if name is None else name
    raise ExtractorError('Could not find XML attribute %s' % name)
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document."""
    # Delegate to the generic attribute matcher with attribute name 'id'.
    return get_element_by_attribute('id', id, html)
def get_element_by_class(class_name, html):
    """Return the content of the first tag whose class attribute contains *class_name*."""
    # Match class_name as a whole word anywhere inside the class attribute
    # value; escaping is disabled because the pattern itself is a regex.
    class_pattern = r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name)
    return get_element_by_attribute('class', class_pattern, html, escape_value=False)
def get_element_by_attribute(attribute, value, html, escape_value=True):
    """Return the content of the tag with the specified attribute in the passed HTML document"""

    value = re.escape(value) if escape_value else value

    # NOTE(review): several lines of the tag-matching regex and the
    # surrounding None check are not visible in this view.
    m = re.search(r'''(?xs)
        (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
        (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
    ''' % (re.escape(attribute), value), html)

    res = m.group('content')

    # Strip surrounding quotes captured together with the value.
    if res.startswith('"') or res.startswith("'"):

    return unescapeHTML(res)
class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""
    # NOTE(review): __init__'s def line and the attrs initialisation are
    # not visible in this view.
        compat_HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        # Record the attributes of the start tag as a plain dict.
        self.attrs = dict(attrs)
def extract_attributes(html_element):
    """Given a string for an HTML element such as
      a="foo" B="bar" c="&98;az" d=boz
      empty= noval entity="&"
    Decode and return a dictionary of attributes.
      'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
      'empty': '', 'noval': None, 'entity': '&',
      'sq': '"', 'dq': '\''
    NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
    but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
    """
    parser = HTMLAttributeParser()
    parser.feed(html_element)
    # NOTE(review): the parser close/error handling and the return of
    # parser.attrs are not visible in this view.
def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.

    # Collapse literal newlines, then turn <br> tags and paragraph
    # boundaries into newlines.
    html = html.replace('\n', ' ')
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Strip any remaining tags.
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    # NOTE(review): the final strip() and return are not visible here.
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    # NOTE(review): the try wrapper and the '-' (stdout) special case are
    # only partly visible in this view.
    if sys.platform == 'win32':
        # stdout must be switched to binary mode on Windows.
        msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
    stream = open(encodeFilename(filename), open_mode)
    return (stream, filename)
    except (IOError, OSError) as err:
        if err.errno in (errno.EACCES,):
        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
        # An exception here should be caught in the caller
        stream = open(encodeFilename(alt_filename), open_mode)
        return (stream, alt_filename)
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp.

    Returns None when *timestr* cannot be parsed. The visible span lacked
    the None initialisation and the return statement; both are restored.
    """
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        # Map one character to its filename-safe replacement.
        # NOTE(review): several branches/returns of this helper are not
        # visible in this view.
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        if char == '?' or ord(char) < 32 or ord(char) == 127:
            return '' if restricted else '\''
            return '_-' if restricted else ' -'
        elif char in '\\/|*<>':
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
        if restricted and ord(char) > 127:

    # Keep timestamps like 12:34:56 readable by pre-converting ':' to '_'.
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(map(replace_insane, s))
    # Collapse runs of underscores introduced by the replacements.
    while '__' in result:
        result = result.replace('__', '_')
    result = result.strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
    if result.startswith('-'):
        result = '_' + result[len('-'):]
    result = result.lstrip('.')
    # NOTE(review): the empty-result fallback and return are not visible.
def sanitize_path(s):
    """Sanitizes and normalizes path on Windows"""
    # NOTE(review): the non-Windows early return body and the drive guard
    # before the insert are not visible in this view.
    if sys.platform != 'win32':
    drive_or_unc, _ = os.path.splitdrive(s)
    if sys.version_info < (2, 7) and not drive_or_unc:
        drive_or_unc, _ = os.path.splitunc(s)
    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    # Replace characters Windows forbids in path components, keeping '.'
    # and '..' traversal components intact.
        path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    sanitized_path.insert(0, drive_or_unc + os.path.sep)
    return os.path.join(*sanitized_path)
# Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
# unwanted failures due to missing protocol
def sanitize_url(url):
    """Give protocol-relative URLs ('//host/...') an explicit http: scheme."""
    if url.startswith('//'):
        return 'http:' + url
    return url
def sanitized_Request(url, *args, **kwargs):
    """Build a urllib Request after normalizing scheme-less URLs."""
    safe_url = sanitize_url(url)
    return compat_urllib_request.Request(safe_url, *args, **kwargs)
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable, keeping first-seen order.

    The loop body and return were missing from the visible span; restored.
    """
    res = []
    for el in iterable:
        if el not in res:
            res.append(el)
    return res
def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character."""
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon. For example,
    # 'Éric' should be decoded as 'Éric'.
    if entity_with_semicolon in compat_html_entities_html5:
        return compat_html_entities_html5[entity_with_semicolon]

    # Numeric character reference: decimal '#123' or hexadecimal '#x7B'.
    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    # NOTE(review): the `if mobj is not None:` guard, base selection and
    # the overflow try/except are not visible in this view.
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            # Normalise 'x7B' to Python's '0x7B' literal form.
            numstr = '0%s' % numstr
        # See https://github.com/rg3/youtube-dl/issues/7518
        return compat_chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity
    # NOTE(review): fragment of unescapeHTML — its def line, the None
    # early-return and the re.sub call's opening line are not visible in
    # this view. Each '&entity;' is replaced via _htmlentity_transform.
    assert type(s) == compat_str

        r'&([^;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
def get_subprocess_encoding():
    # Pick the byte encoding used when talking to subprocesses.
    # NOTE(review): the elif/else structure and the final return are not
    # visible in this view.
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
        encoding = sys.getfilesystemencoding()
def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    """
    # NOTE(review): the return statements of the early branches are not
    # visible in this view.
    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):

    # Pass '' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:

    # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
    if sys.platform.startswith('java'):

    return s.encode(get_subprocess_encoding(), 'ignore')
def decodeFilename(b, for_subprocess=False):
    # Inverse of encodeFilename. NOTE(review): the returns of the early
    # branches are not visible in this view.
    if sys.version_info >= (3, 0):

    if not isinstance(b, bytes):

    return b.decode(get_subprocess_encoding(), 'ignore')
def encodeArgument(s):
    """Encode a single command-line argument for subprocess use."""
    if not isinstance(s, compat_str):
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        s = s.decode('ascii')
    return encodeFilename(s, True)
def decodeArgument(b):
    """Decode a subprocess argument back to text (inverse of encodeArgument)."""
    return decodeFilename(b, for_subprocess=True)
def decodeOption(optval):
    # Normalise a command-line option value to text. NOTE(review): the
    # None early-return and the final return are not visible in this view.
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
def formatSeconds(secs):
    """Format a duration in seconds as H:MM:SS, M:SS or plain seconds.

    The visible span returned H:MM:SS unconditionally with an unreachable
    second return; the threshold branches are restored.
    """
    if secs > 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs > 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
def make_HTTPS_handler(params, **kwargs):
    """Build a YoutubeDLHTTPSHandler honouring the 'nocheckcertificate' option."""
    opts_no_check_certificate = params.get('nocheckcertificate', False)
    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if opts_no_check_certificate:
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        # NOTE(review): the try/except around handler creation is not
        # visible in this view.
        return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
        # (create_default_context present but HTTPSHandler has no context=)

    if sys.version_info < (3, 2):
        return YoutubeDLHTTPSHandler(params, **kwargs)
    # Fallback for old Pythons: build an SSLContext by hand.
    context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
    context.verify_mode = (ssl.CERT_NONE
                           if opts_no_check_certificate
                           else ssl.CERT_REQUIRED)
    context.set_default_verify_paths()
    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
def bug_reports_message():
    # Build the standard "please report this" suffix for error messages.
    # NOTE(review): the `else:` before the second update_cmd assignment and
    # the final return are not visible in this view.
    if ytdl_is_updateable():
        update_cmd = 'type youtube-dl -U to update'
        update_cmd = 'see https://yt-dl.org/update on how to update'
    msg = '; please report this issue on https://yt-dl.org/bug .'
    msg += ' Make sure you are using the latest version; %s.' % update_cmd
    msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """
        # Network-level failures are always treated as "expected".
        if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):

        if video_id is not None:
            msg = video_id + ': ' + msg
        # NOTE(review): the guards before the cause/bug-message suffixes
        # and several attribute assignments are not visible in this view.
            msg += ' (caused by %r)' % cause
            msg += bug_reports_message()
        super(ExtractorError, self).__init__(msg)

        self.exc_info = sys.exc_info()  # preserve original exception
        self.video_id = video_id

    def format_traceback(self):
        # Render the stored traceback; NOTE(review): the None-case return
        # is not visible in this view.
        if self.traceback is None:
        return ''.join(traceback.format_tb(self.traceback))
class UnsupportedError(ExtractorError):
    """Raised when no extractor handles the given URL (always 'expected')."""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super(UnsupportedError, self).__init__(message, expected=True)
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""


class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info


class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """


class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        # NOTE(review): the body of __init__ is not visible in this view.


class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """


class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.

    The docstring delimiters were broken in the visible span; restored.
    """

    def __init__(self, downloaded, expected):
        # Both values are byte counts.
        self.downloaded = downloaded
        self.expected = expected
class XAttrMetadataError(Exception):
    """Raised when writing extended file attributes fails.

    The failure is classified into self.reason so callers can react
    appropriately. Fixes the 'Disk quota excedded' typo in the visible
    span: the real OS error string is "Disk quota exceeded" (EDQUOT), so
    the misspelled check could never match; the missing attribute
    assignments are also restored.
    """

    def __init__(self, code=None, msg='Unknown error'):
        super(XAttrMetadataError, self).__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        if (self.code in (errno.ENOSPC, errno.EDQUOT) or
                'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'
797 class XAttrUnavailableError(Exception):
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
    # expected HTTP responses to meet HTTP/1.0 or later (see also
    # https://github.com/rg3/youtube-dl/issues/6727)
    if sys.version_info < (3, 0):
        kwargs[b'strict'] = True
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')
    if source_address is not None:
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
        # NOTE(review): the Python 2.6 fallback's else branch, the
        # is_https guard inside _hc_connect and the final `return hc` are
        # not visible in this view.
            def _hc_connect(self, *args, **kwargs):
                sock = compat_socket_create_connection(
                    (self.host, self.port), self.timeout, sa)
                    self.sock = ssl.wrap_socket(
                        sock, self.key_file, self.cert_file,
                        ssl_version=ssl.PROTOCOL_TLSv1)
            hc.connect = functools.partial(_hc_connect, hc)
def handle_youtubedl_headers(headers):
    """Strip the internal Youtubedl-no-compression control header.

    When the marker is present, Accept-Encoding is dropped as well so the
    server does not compress the response. The input mapping is returned
    untouched when the marker is absent.
    """
    if 'Youtubedl-no-compression' not in headers:
        return headers
    cleaned = {}
    for key, value in headers.items():
        if key.lower() != 'accept-encoding':
            cleaned[key] = value
    del cleaned['Youtubedl-no-compression']
    return cleaned
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:
      http://techknack.net/python-urllib2-handlers/
    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        conn_class = compat_http_client.HTTPConnection

        # NOTE(review): the `if socks_proxy:` guard is not visible here.
        socks_proxy = req.headers.get('Ytdl-socks-proxy')
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        # NOTE(review): the trailing do_open arguments are not visible.
        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),

    # NOTE(review): the deflate helper's def line and try/except wrapper
    # are not visible in this view; it first tries raw-deflate, then
    # zlib-wrapped deflate.
            return zlib.decompress(data, -zlib.MAX_WBITS)
            return zlib.decompress(data)

    def addinfourl_wrapper(stream, headers, url, code):
        # Compatibility shim around addinfourl across Python versions.
        # NOTE(review): the assignment of ret.code and the return are not
        # visible in this view.
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
        # NOTE(review): the add_header call is not visible in this view.

        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]
        # NOTE(review): the final `return req` is not visible in this view.

    def http_response(self, req, resp):
        # NOTE(review): `old_resp = resp`, the try: keyword, per-iteration
        # try/except/break and the final `return resp` are not visible in
        # this view.
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk add the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/rg3/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            # NOTE(review): the `if location:` guard and the else: before
            # the Python 2 decode are not visible in this view.
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                    location = location.decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    if sys.version_info < (3, 0):
                        location_escaped = location_escaped.encode('utf-8')
                    resp.headers['Location'] = location_escaped

    https_request = http_request
    https_response = http_response
def make_socks_conn_class(base_class, socks_proxy):
    """Derive a connection class that tunnels through the given SOCKS proxy."""
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    if url_components.scheme.lower() == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif url_components.scheme.lower() in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif url_components.scheme.lower() == 'socks4a':
        socks_type = ProxyType.SOCKS4A

    def unquote_if_non_empty(s):
        # NOTE(review): the empty-string early return is not visible here.
        return compat_urllib_parse_unquote_plus(s)

    # Arguments for sockssocket.setproxy; the constructor line of this
    # tuple and the socks_type entry are not visible in this view.
        url_components.hostname, url_components.port or 1080,
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),

    class SocksConnection(base_class):
        # NOTE(review): the `def connect(self):` line is not visible here.
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if type(self.timeout) in (int, float):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS handler supporting a custom connection class and SOCKS proxies."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        # NOTE(review): the kwargs initialisation, the socks_proxy guard
        # and the trailing do_open arguments are not visible in this view.
        conn_class = self._https_conn_class

        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, True),
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """Cookie processor that routes HTTPS through the same hooks as HTTP."""

    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/rg3/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
def extract_timezone(date_str):
    """Split a trailing timezone designator off *date_str*.

    Returns (timezone, date_str) where timezone is a datetime.timedelta
    offset (zero for 'Z', no designator, or a missing sign) and date_str
    has the designator removed. The no-match/else branches were missing
    from the visible span and are restored.
    """
    m = re.search(
        r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
        date_str)
    if not m:
        timezone = datetime.timedelta()
    else:
        date_str = date_str[:-len(m.group('tz'))]
        if not m.group('sign'):
            # Bare 'Z' designator: UTC.
            timezone = datetime.timedelta()
        else:
            sign = 1 if m.group('sign') == '+' else -1
            timezone = datetime.timedelta(
                hours=sign * int(m.group('hours')),
                minutes=sign * int(m.group('minutes')))
    return timezone, date_str
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """
    # The None guard and the try/except around strptime were missing in
    # the visible span; without them an unparsable date raised instead of
    # yielding None. Both are restored.

    if date_str is None:
        return None

    # Drop fractional seconds; strptime's pattern below has no %f slot.
    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    try:
        date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
        dt = datetime.datetime.strptime(date_str, date_format) - timezone
        return calendar.timegm(dt.timetuple())
    except ValueError:
        pass
def date_formats(day_first=True):
    """Return the strptime patterns to try, ordered by day/month preference."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""
    # NOTE(review): the upload_date initialisation, the try/except
    # wrappers around strptime and the timetuple guard are not visible in
    # this view.

    if date_str is None:

    # Replace commas
    date_str = date_str.replace(',', ' ')
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
    _, date_str = extract_timezone(date_str)

    for expression in date_formats(day_first):
        upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
    if upload_date is None:
        # RFC 2822-style dates as a last resort.
        timetuple = email.utils.parsedate_tz(date_str)
            upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    if upload_date is not None:
        return compat_str(upload_date)
def unified_timestamp(date_str, day_first=True):
    # Parse a free-form date string into a UNIX timestamp.
    # NOTE(review): the try/except wrappers around strptime and the
    # timetuple guard are not visible in this view.
    if date_str is None:

    date_str = date_str.replace(',', ' ')

    # 12-hour clock: add 12 hours when a PM marker is present.
    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    for expression in date_formats(day_first):
        dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
        return calendar.timegm(dt.timetuple())

    timetuple = email.utils.parsedate_tz(date_str)
        return calendar.timegm(timetuple) + pm_delta * 3600
def determine_ext(url, default_ext='unknown_video'):
    """Guess a media file extension from *url*, falling back to *default_ext*.

    The None guard and the return statements were missing from the visible
    span; restored.
    """
    if url is None:
        return default_ext
    guess = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess
    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    elif guess.rstrip('/') in KNOWN_EXTENSIONS:
        return guess.rstrip('/')
    else:
        return default_ext
def subtitles_filename(filename, sub_lang, sub_format):
    """Derive the subtitle file name from the media *filename*."""
    base = filename.rsplit('.', 1)[0]
    return '.'.join((base, sub_lang, sub_format))
def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    # Raw string prevents the invalid-escape warning the non-raw pattern
    # triggers on '\d' in Python 3; the missing branches (sign handling,
    # unit conversion) are restored.
    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        if sign == '-':
            time = -time
        unit = match.group('unit')
        # A bad approximation?
        if unit == 'month':
            unit = 'day'
            time *= 30
        elif unit == 'year':
            unit = 'day'
            time *= 365
        unit += 's'
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, '%Y%m%d').date()
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format.

    Strings not matching the expected format are returned unchanged; the
    visible span fell through and returned None in that case.
    """
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is not None:
        return '-'.join(match.groups())
    return date_str
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        # NOTE(review): the else branches (open-ended range defaults) and
        # the `day` classmethod's decorator/def lines are not visible in
        # this view.
        if start is not None:
            self.start = date_from_str(start)
            self.start = datetime.datetime.min.date()
            self.end = date_from_str(end)
            self.end = datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

        # NOTE(review): the `def __str__(self):` line is not visible here.
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
def platform_name():
    """ Returns the platform name as a compat_str """
    # The final return was missing from the visible span; restored.
    res = platform.platform()
    if isinstance(res, bytes):
        # Python 2 may hand back a byte string in the locale encoding.
        res = res.decode(preferredencoding())

    assert isinstance(res, compat_str)
    return res
1250 def _windows_write_string(s, out):
1251 """ Returns True if the string was written using special methods,
1252 False if it has yet to be written out."""
1253 # Adapted from http://stackoverflow.com/a/3259271/35070
1256 import ctypes.wintypes
1264 fileno = out.fileno()
1265 except AttributeError:
1266 # If the output stream doesn't have a fileno, it's virtual
1268 except io.UnsupportedOperation:
1269 # Some strange Windows pseudo files?
1271 if fileno not in WIN_OUTPUT_IDS:
1274 GetStdHandle = ctypes.WINFUNCTYPE(
1275 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
1276 (b'GetStdHandle', ctypes.windll.kernel32))
1277 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
1279 WriteConsoleW = ctypes.WINFUNCTYPE(
1280 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
1281 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
1282 ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
1283 written = ctypes.wintypes.DWORD(0)
1285 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
1286 FILE_TYPE_CHAR = 0x0002
1287 FILE_TYPE_REMOTE = 0x8000
1288 GetConsoleMode = ctypes.WINFUNCTYPE(
1289 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
1290 ctypes.POINTER(ctypes.wintypes.DWORD))(
1291 (b'GetConsoleMode', ctypes.windll.kernel32))
1292 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
1294 def not_a_console(handle):
1295 if handle == INVALID_HANDLE_VALUE or handle is None:
1297 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
1298 GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
1300 if not_a_console(h):
1303 def next_nonbmp_pos(s):
1305 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
1306 except StopIteration:
1310 count = min(next_nonbmp_pos(s), 1024)
1312 ret = WriteConsoleW(
1313 h, s, count if count else 2, ctypes.byref(written), None)
1315 raise OSError('Failed to write string')
1316 if not count: # We just wrote a non-BMP character
1317 assert written.value == 2
1320 assert written.value > 0
1321 s = s[written.value:]
1325 def write_string(s, out=None, encoding=None):
1328 assert type(s) == compat_str
1330 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
1331 if _windows_write_string(s, out):
1334 if ('b' in getattr(out, 'mode', '') or
1335 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
1336 byt = s.encode(encoding or preferredencoding(), 'ignore')
1338 elif hasattr(out, 'buffer'):
1339 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
1340 byt = s.encode(enc, 'ignore')
1341 out.buffer.write(byt)
1347 def bytes_to_intlist(bs):
1350 if isinstance(bs[0], int): # Python 3
1353 return [ord(c) for c in bs]
1356 def intlist_to_bytes(xs):
1359 return compat_struct_pack('%dB' % len(xs), *xs)
1362 # Cross-platform file locking
1363 if sys.platform == 'win32':
1364 import ctypes.wintypes
1367 class OVERLAPPED(ctypes.Structure):
1369 ('Internal', ctypes.wintypes.LPVOID),
1370 ('InternalHigh', ctypes.wintypes.LPVOID),
1371 ('Offset', ctypes.wintypes.DWORD),
1372 ('OffsetHigh', ctypes.wintypes.DWORD),
1373 ('hEvent', ctypes.wintypes.HANDLE),
1376 kernel32 = ctypes.windll.kernel32
1377 LockFileEx = kernel32.LockFileEx
1378 LockFileEx.argtypes = [
1379 ctypes.wintypes.HANDLE, # hFile
1380 ctypes.wintypes.DWORD, # dwFlags
1381 ctypes.wintypes.DWORD, # dwReserved
1382 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1383 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1384 ctypes.POINTER(OVERLAPPED) # Overlapped
1386 LockFileEx.restype = ctypes.wintypes.BOOL
1387 UnlockFileEx = kernel32.UnlockFileEx
1388 UnlockFileEx.argtypes = [
1389 ctypes.wintypes.HANDLE, # hFile
1390 ctypes.wintypes.DWORD, # dwReserved
1391 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1392 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1393 ctypes.POINTER(OVERLAPPED) # Overlapped
1395 UnlockFileEx.restype = ctypes.wintypes.BOOL
1396 whole_low = 0xffffffff
1397 whole_high = 0x7fffffff
1399 def _lock_file(f, exclusive):
1400 overlapped = OVERLAPPED()
1401 overlapped.Offset = 0
1402 overlapped.OffsetHigh = 0
1403 overlapped.hEvent = 0
1404 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
1405 handle = msvcrt.get_osfhandle(f.fileno())
1406 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
1407 whole_low, whole_high, f._lock_file_overlapped_p):
1408 raise OSError('Locking file failed: %r' % ctypes.FormatError())
1410 def _unlock_file(f):
1411 assert f._lock_file_overlapped_p
1412 handle = msvcrt.get_osfhandle(f.fileno())
1413 if not UnlockFileEx(handle, 0,
1414 whole_low, whole_high, f._lock_file_overlapped_p):
1415 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1418 # Some platforms, such as Jython, is missing fcntl
1422 def _lock_file(f, exclusive):
1423 fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
1425 def _unlock_file(f):
1426 fcntl.flock(f, fcntl.LOCK_UN)
1428 UNSUPPORTED_MSG = 'file locking is not supported on this platform'
1430 def _lock_file(f, exclusive):
1431 raise IOError(UNSUPPORTED_MSG)
1433 def _unlock_file(f):
1434 raise IOError(UNSUPPORTED_MSG)
1437 class locked_file(object):
1438 def __init__(self, filename, mode, encoding=None):
1439 assert mode in ['r', 'a', 'w']
1440 self.f = io.open(filename, mode, encoding=encoding)
1443 def __enter__(self):
1444 exclusive = self.mode != 'r'
1446 _lock_file(self.f, exclusive)
1452 def __exit__(self, etype, value, traceback):
1454 _unlock_file(self.f)
1461 def write(self, *args):
1462 return self.f.write(*args)
1464 def read(self, *args):
1465 return self.f.read(*args)
def get_filesystem_encoding():
    """Return the file system encoding, falling back to 'utf-8' when unknown."""
    enc = sys.getfilesystemencoding()
    return 'utf-8' if enc is None else enc
1473 def shell_quote(args):
1475 encoding = get_filesystem_encoding()
1477 if isinstance(a, bytes):
1478 # We may get a filename encoded with 'encodeFilename'
1479 a = a.decode(encoding)
1480 quoted_args.append(pipes.quote(a))
1481 return ' '.join(quoted_args)
1484 def smuggle_url(url, data):
1485 """ Pass additional data in a URL for internal use. """
1487 url, idata = unsmuggle_url(url, {})
1489 sdata = compat_urllib_parse_urlencode(
1490 {'__youtubedl_smuggle': json.dumps(data)})
1491 return url + '#' + sdata
1494 def unsmuggle_url(smug_url, default=None):
1495 if '#__youtubedl_smuggle' not in smug_url:
1496 return smug_url, default
1497 url, _, sdata = smug_url.rpartition('#')
1498 jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
1499 data = json.loads(jsond)
1503 def format_bytes(bytes):
1506 if type(bytes) is str:
1507 bytes = float(bytes)
1511 exponent = int(math.log(bytes, 1024.0))
1512 suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
1513 converted = float(bytes) / float(1024 ** exponent)
1514 return '%.2f%s' % (converted, suffix)
1517 def lookup_unit_table(unit_table, s):
1518 units_re = '|'.join(re.escape(u) for u in unit_table)
1520 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
1523 num_str = m.group('num').replace(',', '.')
1524 mult = unit_table[m.group('unit')]
1525 return int(float(num_str) * mult)
1528 def parse_filesize(s):
1532 # The lower-case forms are of course incorrect and unofficial,
1533 # but we support those too
1550 'megabytes': 1000 ** 2,
1551 'mebibytes': 1024 ** 2,
1557 'gigabytes': 1000 ** 3,
1558 'gibibytes': 1024 ** 3,
1564 'terabytes': 1000 ** 4,
1565 'tebibytes': 1024 ** 4,
1571 'petabytes': 1000 ** 5,
1572 'pebibytes': 1024 ** 5,
1578 'exabytes': 1000 ** 6,
1579 'exbibytes': 1024 ** 6,
1585 'zettabytes': 1000 ** 7,
1586 'zebibytes': 1024 ** 7,
1592 'yottabytes': 1000 ** 8,
1593 'yobibytes': 1024 ** 8,
1596 return lookup_unit_table(_UNIT_TABLE, s)
1605 if re.match(r'^[\d,.]+$', s):
1606 return str_to_int(s)
1617 return lookup_unit_table(_UNIT_TABLE, s)
1620 def month_by_name(name, lang='en'):
1621 """ Return the number of a month by (locale-independently) English name """
1623 month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
1626 return month_names.index(name) + 1
1631 def month_by_abbreviation(abbrev):
1632 """ Return the number of a month by (locale-independently) English
1636 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
1641 def fix_xml_ampersands(xml_str):
1642 """Replace all the '&' by '&' in XML"""
1644 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
1649 def setproctitle(title):
1650 assert isinstance(title, compat_str)
1652 # ctypes in Jython is not complete
1653 # http://bugs.jython.org/issue2148
1654 if sys.platform.startswith('java'):
1658 libc = ctypes.cdll.LoadLibrary('libc.so.6')
1661 title_bytes = title.encode('utf-8')
1662 buf = ctypes.create_string_buffer(len(title_bytes))
1663 buf.value = title_bytes
1665 libc.prctl(15, buf, 0, 0, 0)
1666 except AttributeError:
1667 return # Strange libc, just skip this
def remove_start(s, start):
    """Strip a leading *start* from *s*; None and non-matching strings pass through unchanged."""
    if s is None or not s.startswith(start):
        return s
    return s[len(start):]
def remove_end(s, end):
    """Strip a trailing *end* from *s*; None and non-matching strings pass through unchanged."""
    if s is None or not s.endswith(end):
        return s
    return s[:-len(end)]
1678 def remove_quotes(s):
1679 if s is None or len(s) < 2:
1681 for quote in ('"', "'", ):
1682 if s[0] == quote and s[-1] == quote:
def url_basename(url):
    """Return the last path component of *url* ('' when the path is empty)."""
    parsed_path = compat_urlparse.urlparse(url).path
    segments = parsed_path.strip('/').split('/')
    return segments[-1]
1692 class HEADRequest(compat_urllib_request.Request):
1693 def get_method(self):
1697 class PUTRequest(compat_urllib_request.Request):
1698 def get_method(self):
1702 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
1705 v = getattr(v, get_attr, None)
1711 return int(v) * invscale // scale
def str_or_none(v, default=None):
    """Coerce *v* to compat_str, or return *default* when *v* is None."""
    if v is None:
        return default
    return compat_str(v)
1720 def str_to_int(int_str):
1721 """ A more relaxed version of int_or_none """
1724 int_str = re.sub(r'[,\.\+]', '', int_str)
1728 def float_or_none(v, scale=1, invscale=1, default=None):
1732 return float(v) * invscale / scale
def strip_or_none(v):
    """None-safe str.strip()."""
    if v is None:
        return None
    return v.strip()
1741 def parse_duration(s):
1742 if not isinstance(s, compat_basestring):
1747 days, hours, mins, secs, ms = [None] * 5
1748 m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?$', s)
1750 days, hours, mins, secs, ms = m.groups()
1755 (?P<days>[0-9]+)\s*d(?:ays?)?\s*
1758 (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
1761 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
1764 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
1767 days, hours, mins, secs, ms = m.groups()
1769 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)$', s)
1771 hours, mins = m.groups()
1777 duration += float(secs)
1779 duration += float(mins) * 60
1781 duration += float(hours) * 60 * 60
1783 duration += float(days) * 24 * 60 * 60
1785 duration += float(ms)
1789 def prepend_extension(filename, ext, expected_real_ext=None):
1790 name, real_ext = os.path.splitext(filename)
1792 '{0}.{1}{2}'.format(name, ext, real_ext)
1793 if not expected_real_ext or real_ext[1:] == expected_real_ext
1794 else '{0}.{1}'.format(filename, ext))
1797 def replace_extension(filename, ext, expected_real_ext=None):
1798 name, real_ext = os.path.splitext(filename)
1799 return '{0}.{1}'.format(
1800 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
1804 def check_executable(exe, args=[]):
1805 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1806 args can be a list of arguments for a short output (like -version) """
1808 subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
1814 def get_exe_version(exe, args=['--version'],
1815 version_re=None, unrecognized='present'):
1816 """ Returns the version of the specified executable,
1817 or False if the executable is not present """
1819 out, _ = subprocess.Popen(
1820 [encodeArgument(exe)] + args,
1821 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
1824 if isinstance(out, bytes): # Python 2.x
1825 out = out.decode('ascii', 'ignore')
1826 return detect_exe_version(out, version_re, unrecognized)
1829 def detect_exe_version(output, version_re=None, unrecognized='present'):
1830 assert isinstance(output, compat_str)
1831 if version_re is None:
1832 version_re = r'version\s+([-0-9._a-zA-Z]+)'
1833 m = re.search(version_re, output)
1840 class PagedList(object):
1842 # This is only useful for tests
1843 return len(self.getslice())
1846 class OnDemandPagedList(PagedList):
1847 def __init__(self, pagefunc, pagesize, use_cache=False):
1848 self._pagefunc = pagefunc
1849 self._pagesize = pagesize
1850 self._use_cache = use_cache
1854 def getslice(self, start=0, end=None):
1856 for pagenum in itertools.count(start // self._pagesize):
1857 firstid = pagenum * self._pagesize
1858 nextfirstid = pagenum * self._pagesize + self._pagesize
1859 if start >= nextfirstid:
1864 page_results = self._cache.get(pagenum)
1865 if page_results is None:
1866 page_results = list(self._pagefunc(pagenum))
1868 self._cache[pagenum] = page_results
1871 start % self._pagesize
1872 if firstid <= start < nextfirstid
1876 ((end - 1) % self._pagesize) + 1
1877 if (end is not None and firstid <= end <= nextfirstid)
1880 if startv != 0 or endv is not None:
1881 page_results = page_results[startv:endv]
1882 res.extend(page_results)
1884 # A little optimization - if current page is not "full", ie. does
1885 # not contain page_size videos then we can assume that this page
1886 # is the last one - there are no more ids on further pages -
1887 # i.e. no need to query again.
1888 if len(page_results) + startv < self._pagesize:
1891 # If we got the whole page, but the next page is not interesting,
1892 # break out early as well
1893 if end == nextfirstid:
1898 class InAdvancePagedList(PagedList):
1899 def __init__(self, pagefunc, pagecount, pagesize):
1900 self._pagefunc = pagefunc
1901 self._pagecount = pagecount
1902 self._pagesize = pagesize
1904 def getslice(self, start=0, end=None):
1906 start_page = start // self._pagesize
1908 self._pagecount if end is None else (end // self._pagesize + 1))
1909 skip_elems = start - start_page * self._pagesize
1910 only_more = None if end is None else end - start
1911 for pagenum in range(start_page, end_page):
1912 page = list(self._pagefunc(pagenum))
1914 page = page[skip_elems:]
1916 if only_more is not None:
1917 if len(page) < only_more:
1918 only_more -= len(page)
1920 page = page[:only_more]
1927 def uppercase_escape(s):
1928 unicode_escape = codecs.getdecoder('unicode_escape')
1930 r'\\U[0-9a-fA-F]{8}',
1931 lambda m: unicode_escape(m.group(0))[0],
1935 def lowercase_escape(s):
1936 unicode_escape = codecs.getdecoder('unicode_escape')
1938 r'\\u[0-9a-fA-F]{4}',
1939 lambda m: unicode_escape(m.group(0))[0],
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # On Python 2, percent-quoting needs a byte string, so encode unicode first.
    needs_bytes = sys.version_info < (3, 0) and isinstance(s, compat_str)
    target = s.encode('utf-8') if needs_bytes else s
    return compat_urllib_parse.quote(target, b"%/;:@&=+$,!~*'()?#[]")
1950 def escape_url(url):
1951 """Escape URL as suggested by RFC 3986"""
1952 url_parsed = compat_urllib_parse_urlparse(url)
1953 return url_parsed._replace(
1954 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
1955 path=escape_rfc3986(url_parsed.path),
1956 params=escape_rfc3986(url_parsed.params),
1957 query=escape_rfc3986(url_parsed.query),
1958 fragment=escape_rfc3986(url_parsed.fragment)
1962 def read_batch_urls(batch_fd):
1964 if not isinstance(url, compat_str):
1965 url = url.decode('utf-8', 'replace')
1966 BOM_UTF8 = '\xef\xbb\xbf'
1967 if url.startswith(BOM_UTF8):
1968 url = url[len(BOM_UTF8):]
1970 if url.startswith(('#', ';', ']')):
1974 with contextlib.closing(batch_fd) as fd:
1975 return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as ASCII bytes."""
    encoded = compat_urllib_parse_urlencode(*args, **kargs)
    return encoded.encode('ascii')
1982 def update_url_query(url, query):
1985 parsed_url = compat_urlparse.urlparse(url)
1986 qs = compat_parse_qs(parsed_url.query)
1988 return compat_urlparse.urlunparse(parsed_url._replace(
1989 query=compat_urllib_parse_urlencode(qs, True)))
1992 def update_Request(req, url=None, data=None, headers={}, query={}):
1993 req_headers = req.headers.copy()
1994 req_headers.update(headers)
1995 req_data = data or req.data
1996 req_url = update_url_query(url or req.get_full_url(), query)
1997 req_get_method = req.get_method()
1998 if req_get_method == 'HEAD':
1999 req_type = HEADRequest
2000 elif req_get_method == 'PUT':
2001 req_type = PUTRequest
2003 req_type = compat_urllib_request.Request
2005 req_url, data=req_data, headers=req_headers,
2006 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
2007 if hasattr(req, 'timeout'):
2008 new_req.timeout = req.timeout
2012 def dict_get(d, key_or_keys, default=None, skip_false_values=True):
2013 if isinstance(key_or_keys, (list, tuple)):
2014 for key in key_or_keys:
2015 if key not in d or d[key] is None or skip_false_values and not d[key]:
2019 return d.get(key_or_keys, default)
2022 def try_get(src, getter, expected_type=None):
2025 except (AttributeError, KeyError, TypeError, IndexError):
2028 if expected_type is None or isinstance(v, expected_type):
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return *string* as compat_str, decoding it with *encoding* when needed.

    NOTE(review): the *encoding* default is evaluated once at import time,
    matching the original definition.
    """
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
2045 TV_PARENTAL_GUIDELINES = {
2055 def parse_age_limit(s):
2057 return s if 0 <= s <= 21 else None
2058 if not isinstance(s, compat_basestring):
2060 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
2062 return int(m.group('age'))
2064 return US_RATINGS[s]
2065 return TV_PARENTAL_GUIDELINES.get(s)
2068 def strip_jsonp(code):
2070 r'(?s)^[a-zA-Z0-9_.$]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
2073 def js_to_json(code):
2076 if v in ('true', 'false', 'null'):
2078 elif v.startswith('/*') or v == ',':
2081 if v[0] in ("'", '"'):
2082 v = re.sub(r'(?s)\\.|"', lambda m: {
2087 }.get(m.group(0), m.group(0)), v[1:-1])
2090 (r'^(0[xX][0-9a-fA-F]+)\s*:?$', 16),
2091 (r'^(0+[0-7]+)\s*:?$', 8),
2094 for regex, base in INTEGER_TABLE:
2095 im = re.match(regex, v)
2097 i = int(im.group(1), base)
2098 return '"%d":' % i if v.endswith(':') else '%d' % i
2102 return re.sub(r'''(?sx)
2103 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
2104 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
2105 /\*.*?\*/|,(?=\s*[\]}])|
2106 [a-zA-Z_][.a-zA-Z_0-9]*|
2107 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:\s*:)?|
2112 def qualities(quality_ids):
2113 """ Get a numeric quality value out of a list of possible values """
2116 return quality_ids.index(qid)
2122 DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
2125 def limit_length(s, length):
2126 """ Add ellipses to overly long strings """
2131 return s[:length - len(ELLIPSES)] + ELLIPSES
def version_tuple(v):
    """Split a version string on '.' or '-' into a tuple of ints."""
    return tuple(map(int, re.split(r'[-.]', v)))
2139 def is_outdated_version(version, limit, assume_new=True):
2141 return not assume_new
2143 return version_tuple(version) < version_tuple(limit)
2145 return not assume_new
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    # Updateable when running from a zipimport bundle or a frozen executable.
    loader = globals().get('__loader__')
    return isinstance(loader, zipimporter) or hasattr(sys, 'frozen')
def args_to_str(args):
    """Build a short, shell-quoted string representation of a subprocess command."""
    quoted = [compat_shlex_quote(part) for part in args]
    return ' '.join(quoted)
2160 def error_to_compat_str(err):
2162 # On python 2 error byte string must be decoded with proper
2163 # encoding rather than ascii
2164 if sys.version_info[0] < 3:
2165 err_str = err_str.decode(preferredencoding())
2169 def mimetype2ext(mt):
2175 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
2176 # it's the most popular one
2177 'audio/mpeg': 'mp3',
2182 _, _, res = mt.rpartition('/')
2183 res = res.split(';')[0].strip().lower()
2187 'smptett+xml': 'tt',
2193 'x-mp4-fragmented': 'mp4',
2196 'x-mpegurl': 'm3u8',
2197 'vnd.apple.mpegurl': 'm3u8',
2202 'vnd.ms-sstr+xml': 'ism',
2207 def parse_codecs(codecs_str):
2208 # http://tools.ietf.org/html/rfc6381
2211 splited_codecs = list(filter(None, map(
2212 lambda str: str.strip(), codecs_str.strip().strip(',').split(','))))
2213 vcodec, acodec = None, None
2214 for full_codec in splited_codecs:
2215 codec = full_codec.split('.')[0]
2216 if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v'):
2219 elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3'):
2223 write_string('WARNING: Unknown codec %s' % full_codec, sys.stderr)
2224 if not vcodec and not acodec:
2225 if len(splited_codecs) == 2:
2230 elif len(splited_codecs) == 1:
2237 'vcodec': vcodec or 'none',
2238 'acodec': acodec or 'none',
2243 def urlhandle_detect_ext(url_handle):
2244 getheader = url_handle.headers.get
2246 cd = getheader('Content-Disposition')
2248 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
2250 e = determine_ext(m.group('filename'), default_ext=None)
2254 return mimetype2ext(getheader('Content-Type'))
def encode_data_uri(data, mime_type):
    """Wrap *data* (bytes) into a base64 data: URI with the given MIME type."""
    b64 = base64.b64encode(data).decode('ascii')
    return 'data:{0};base64,{1}'.format(mime_type, b64)
2261 def age_restricted(content_limit, age_limit):
2262 """ Returns True iff the content should be blocked """
2264 if age_limit is None: # No limit set
2266 if content_limit is None:
2267 return False # Content available for everyone
2268 return age_limit < content_limit
2271 def is_html(first_bytes):
2272 """ Detect whether a file contains HTML by examining its first bytes. """
2275 (b'\xef\xbb\xbf', 'utf-8'),
2276 (b'\x00\x00\xfe\xff', 'utf-32-be'),
2277 (b'\xff\xfe\x00\x00', 'utf-32-le'),
2278 (b'\xff\xfe', 'utf-16-le'),
2279 (b'\xfe\xff', 'utf-16-be'),
2281 for bom, enc in BOMS:
2282 if first_bytes.startswith(bom):
2283 s = first_bytes[len(bom):].decode(enc, 'replace')
2286 s = first_bytes.decode('utf-8', 'replace')
2288 return re.match(r'^\s*<', s)
2291 def determine_protocol(info_dict):
2292 protocol = info_dict.get('protocol')
2293 if protocol is not None:
2296 url = info_dict['url']
2297 if url.startswith('rtmp'):
2299 elif url.startswith('mms'):
2301 elif url.startswith('rtsp'):
2304 ext = determine_ext(url)
2310 return compat_urllib_parse_urlparse(url).scheme
def render_table(header_row, data):
    """Render *header_row* followed by the *data* rows as a left-aligned text table."""
    rows = [header_row] + data
    # Width of each column is the widest cell in it (stringified).
    widths = [max(len(compat_str(cell)) for cell in column) for column in zip(*rows)]
    # Pad every column but the last one to width+1; last column is unpadded.
    fmt = ' '.join('%-' + compat_str(width + 1) + 's' for width in widths[:-1]) + '%s'
    return '\n'.join(fmt % tuple(row) for row in rows)
2321 def _match_one(filter_part, dct):
2322 COMPARISON_OPERATORS = {
2330 operator_rex = re.compile(r'''(?x)\s*
2332 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
2334 (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
2335 (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
2338 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
2339 m = operator_rex.search(filter_part)
2341 op = COMPARISON_OPERATORS[m.group('op')]
2342 if m.group('strval') is not None:
2343 if m.group('op') not in ('=', '!='):
2345 'Operator %s does not support string values!' % m.group('op'))
2346 comparison_value = m.group('strval')
2349 comparison_value = int(m.group('intval'))
2351 comparison_value = parse_filesize(m.group('intval'))
2352 if comparison_value is None:
2353 comparison_value = parse_filesize(m.group('intval') + 'B')
2354 if comparison_value is None:
2356 'Invalid integer value %r in filter part %r' % (
2357 m.group('intval'), filter_part))
2358 actual_value = dct.get(m.group('key'))
2359 if actual_value is None:
2360 return m.group('none_inclusive')
2361 return op(actual_value, comparison_value)
2364 '': lambda v: v is not None,
2365 '!': lambda v: v is None,
2367 operator_rex = re.compile(r'''(?x)\s*
2368 (?P<op>%s)\s*(?P<key>[a-z_]+)
2370 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
2371 m = operator_rex.search(filter_part)
2373 op = UNARY_OPERATORS[m.group('op')]
2374 actual_value = dct.get(m.group('key'))
2375 return op(actual_value)
2377 raise ValueError('Invalid filter part %r' % filter_part)
2380 def match_str(filter_str, dct):
2381 """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
2384 _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
2387 def match_filter_func(filter_str):
2388 def _match_func(info_dict):
2389 if match_str(filter_str, info_dict):
2392 video_title = info_dict.get('title', info_dict.get('id', 'video'))
2393 return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
2397 def parse_dfxp_time_expr(time_expr):
2401 mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
2403 return float(mobj.group('time_offset'))
2405 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
2407 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
def srt_subtitles_timecode(seconds):
    """Format a duration in seconds as a SubRip (SRT) timecode: HH:MM:SS,mmm."""
    hours = seconds / 3600
    minutes = (seconds % 3600) / 60
    secs = seconds % 60
    millis = (seconds % 1) * 1000
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, millis)
2414 def dfxp2srt(dfxp_data):
2415 _x = functools.partial(xpath_with_ns, ns_map={
2416 'ttml': 'http://www.w3.org/ns/ttml',
2417 'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
2418 'ttaf1_0604': 'http://www.w3.org/2006/04/ttaf1',
2421 class TTMLPElementParser(object):
2424 def start(self, tag, attrib):
2425 if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
2431 def data(self, data):
2435 return self.out.strip()
2437 def parse_node(node):
2438 target = TTMLPElementParser()
2439 parser = xml.etree.ElementTree.XMLParser(target=target)
2440 parser.feed(xml.etree.ElementTree.tostring(node))
2441 return parser.close()
2443 dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
2445 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall(_x('.//ttaf1_0604:p')) or dfxp.findall('.//p')
2448 raise ValueError('Invalid dfxp/TTML subtitle')
2450 for para, index in zip(paras, itertools.count(1)):
2451 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
2452 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
2453 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
2454 if begin_time is None:
2459 end_time = begin_time + dur
2460 out.append('%d\n%s --> %s\n%s\n\n' % (
2462 srt_subtitles_timecode(begin_time),
2463 srt_subtitles_timecode(end_time),
2469 def cli_option(params, command_option, param):
2470 param = params.get(param)
2472 param = compat_str(param)
2473 return [command_option, param] if param is not None else []
2476 def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
2477 param = params.get(param)
2478 assert isinstance(param, bool)
2480 return [command_option + separator + (true_value if param else false_value)]
2481 return [command_option, true_value if param else false_value]
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Return [command_option] when params[param] equals *expected_value*, else []."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
2489 def cli_configuration_args(params, param, default=[]):
2490 ex_args = params.get(param)
2493 assert isinstance(ex_args, list)
2497 class ISO639Utils(object):
2498 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
2687 def short2long(cls, code):
2688 """Convert language code from ISO 639-1 to ISO 639-2/T"""
2689 return cls._lang_map.get(code[:2])
2692 def long2short(cls, code):
2693 """Convert language code from ISO 639-2/T to ISO 639-1"""
2694 for short_name, long_name in cls._lang_map.items():
2695 if long_name == code:
2699 class ISO3166Utils(object):
2700 # From http://data.okfn.org/data/core/country-list
2702 'AF': 'Afghanistan',
2703 'AX': 'Åland Islands',
2706 'AS': 'American Samoa',
2711 'AG': 'Antigua and Barbuda',
2728 'BO': 'Bolivia, Plurinational State of',
2729 'BQ': 'Bonaire, Sint Eustatius and Saba',
2730 'BA': 'Bosnia and Herzegovina',
2732 'BV': 'Bouvet Island',
2734 'IO': 'British Indian Ocean Territory',
2735 'BN': 'Brunei Darussalam',
2737 'BF': 'Burkina Faso',
2743 'KY': 'Cayman Islands',
2744 'CF': 'Central African Republic',
2748 'CX': 'Christmas Island',
2749 'CC': 'Cocos (Keeling) Islands',
2753 'CD': 'Congo, the Democratic Republic of the',
2754 'CK': 'Cook Islands',
2756 'CI': 'Côte d\'Ivoire',
2761 'CZ': 'Czech Republic',
2765 'DO': 'Dominican Republic',
2768 'SV': 'El Salvador',
2769 'GQ': 'Equatorial Guinea',
2773 'FK': 'Falkland Islands (Malvinas)',
2774 'FO': 'Faroe Islands',
2778 'GF': 'French Guiana',
2779 'PF': 'French Polynesia',
2780 'TF': 'French Southern Territories',
2795 'GW': 'Guinea-Bissau',
2798 'HM': 'Heard Island and McDonald Islands',
2799 'VA': 'Holy See (Vatican City State)',
2806 'IR': 'Iran, Islamic Republic of',
2809 'IM': 'Isle of Man',
2819 'KP': 'Korea, Democratic People\'s Republic of',
2820 'KR': 'Korea, Republic of',
2823 'LA': 'Lao People\'s Democratic Republic',
2829 'LI': 'Liechtenstein',
2833 'MK': 'Macedonia, the Former Yugoslav Republic of',
2840 'MH': 'Marshall Islands',
2846 'FM': 'Micronesia, Federated States of',
2847 'MD': 'Moldova, Republic of',
2858 'NL': 'Netherlands',
2859 'NC': 'New Caledonia',
2860 'NZ': 'New Zealand',
2865 'NF': 'Norfolk Island',
2866 'MP': 'Northern Mariana Islands',
2871 'PS': 'Palestine, State of',
2873 'PG': 'Papua New Guinea',
2876 'PH': 'Philippines',
2880 'PR': 'Puerto Rico',
2884 'RU': 'Russian Federation',
2886 'BL': 'Saint Barthélemy',
2887 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
2888 'KN': 'Saint Kitts and Nevis',
2889 'LC': 'Saint Lucia',
2890 'MF': 'Saint Martin (French part)',
2891 'PM': 'Saint Pierre and Miquelon',
2892 'VC': 'Saint Vincent and the Grenadines',
2895 'ST': 'Sao Tome and Principe',
2896 'SA': 'Saudi Arabia',
2900 'SL': 'Sierra Leone',
2902 'SX': 'Sint Maarten (Dutch part)',
2905 'SB': 'Solomon Islands',
2907 'ZA': 'South Africa',
2908 'GS': 'South Georgia and the South Sandwich Islands',
2909 'SS': 'South Sudan',
2914 'SJ': 'Svalbard and Jan Mayen',
2917 'CH': 'Switzerland',
2918 'SY': 'Syrian Arab Republic',
2919 'TW': 'Taiwan, Province of China',
2921 'TZ': 'Tanzania, United Republic of',
2923 'TL': 'Timor-Leste',
2927 'TT': 'Trinidad and Tobago',
2930 'TM': 'Turkmenistan',
2931 'TC': 'Turks and Caicos Islands',
2935 'AE': 'United Arab Emirates',
2936 'GB': 'United Kingdom',
2937 'US': 'United States',
2938 'UM': 'United States Minor Outlying Islands',
2942 'VE': 'Venezuela, Bolivarian Republic of',
2944 'VG': 'Virgin Islands, British',
2945 'VI': 'Virgin Islands, U.S.',
2946 'WF': 'Wallis and Futuna',
2947 'EH': 'Western Sahara',
2954 def short2full(cls, code):
2955 """Convert an ISO 3166-2 country code to the corresponding full name"""
2956 return cls._country_map.get(code.upper())
2959 class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
def __init__(self, proxies=None):
    """Install default http/https openers, then defer to ProxyHandler's setup."""
    for scheme in ('http', 'https'):
        # Bind scheme and method as lambda defaults to avoid the
        # late-binding closure pitfall inside the loop.
        setattr(
            self, '%s_open' % scheme,
            lambda r, proxy='__noproxy__', type=scheme, meth=self.proxy_open:
                meth(r, proxy, type))
    return compat_urllib_request.ProxyHandler.__init__(self, proxies)
2968 def proxy_open(self, req, proxy, type):
2969 req_proxy = req.headers.get('Ytdl-request-proxy')
2970 if req_proxy is not None:
2972 del req.headers['Ytdl-request-proxy']
2974 if proxy == '__noproxy__':
2975 return None # No Proxy
2976 if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
2977 req.add_header('Ytdl-socks-proxy', proxy)
2978 # youtube-dl's http/https handlers do wrapping the socket with socks
2980 return compat_urllib_request.ProxyHandler.proxy_open(
2981 self, req, proxy, type)
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''

    # OHDave treats the data as a little-endian integer, hence the reversal.
    payload = int(binascii.hexlify(data[::-1]), 16)
    encrypted = pow(payload, exponent, modulus)
    return '%x' % encrypted
3001 def encode_base_n(num, n, table=None):
3002 FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
3004 table = FULL_TABLE[:n]
3007 raise ValueError('base %d exceeds table length %d' % (n, len(table)))
3014 ret = table[num % n] + ret
def decode_packed_codes(code):
    """Decode JavaScript obfuscated with Dean Edwards' p.a.c.k.e.r.

    Extracts the packed payload, base, symbol count and symbol table from
    the eval wrapper, then substitutes each base-n token with its symbol.
    """
    mobj = re.search(
        r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)",
        code)
    obfucasted_code, base, count, symbols = mobj.groups()
    base = int(base)
    count = int(count)
    symbols = symbols.split('|')
    symbol_table = {}

    while count:
        count -= 1
        # Tokens in the packed code are the base-n encoding of their index;
        # an empty symbol means the token stands for itself.
        base_n_count = encode_base_n(count, base)
        symbol_table[base_n_count] = symbols[count] or base_n_count

    return re.sub(
        r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
        obfucasted_code)
def parse_m3u8_attributes(attrib):
    """Parse an M3U8 attribute list (KEY=value,KEY="quoted value",...) into a dict.

    Quoted values have their surrounding double quotes stripped.
    """
    info = {}
    for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
        if val.startswith('"'):
            val = val[1:-1]
        info[key] = val
    return info
def urshift(val, n):
    """Unsigned 32-bit right shift (JavaScript's >>> operator)."""
    if val >= 0:
        return val >> n
    # Map the negative value to its unsigned 32-bit representation first.
    return (val + 0x100000000) >> n
# Based on png2str() written by @gdkchan and improved by @yokrysty
# Originally posted at https://github.com/rg3/youtube-dl/issues/9706
def decode_png(png_data):
    """Decode a (24-bit RGB, non-interlaced) PNG into (width, height, pixels).

    pixels is a list of rows; each row is a flat list of byte values
    (3 per pixel). Raises IOError on invalid or unreadable data.
    """
    # Reference: https://www.w3.org/TR/PNG/
    header = png_data[8:]

    if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
        raise IOError('Not a valid PNG file.')

    int_map = {1: '>B', 2: '>H', 4: '>I'}
    unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]

    chunks = []

    # Walk the chunk stream: 4-byte length, 4-byte type, data, 4-byte CRC.
    while header:
        length = unpack_integer(header[:4])
        header = header[4:]

        chunk_type = header[:4]
        header = header[4:]

        chunk_data = header[:length]
        header = header[length:]

        header = header[4:]  # Skip CRC

        chunks.append({
            'type': chunk_type,
            'length': length,
            'data': chunk_data
        })

    ihdr = chunks[0]['data']

    width = unpack_integer(ihdr[:4])
    height = unpack_integer(ihdr[4:8])

    idat = b''

    # Image data may be split across multiple IDAT chunks; concatenate them.
    for chunk in chunks:
        if chunk['type'] == b'IDAT':
            idat += chunk['data']

    if not idat:
        raise IOError('Unable to read PNG data.')

    decompressed_data = bytearray(zlib.decompress(idat))

    stride = width * 3  # 3 bytes per pixel (RGB)
    pixels = []

    def _get_pixel(idx):
        # Look up an already-decoded byte by its flat index.
        x = idx % stride
        y = idx // stride
        return pixels[y][x]

    for y in range(height):
        # Each scanline is prefixed with one filter-type byte.
        basePos = y * (1 + stride)
        filter_type = decompressed_data[basePos]

        current_row = []

        pixels.append(current_row)

        for x in range(stride):
            color = decompressed_data[1 + basePos + x]
            basex = y * stride + x
            left = 0
            up = 0

            # Neighbouring bytes used by the PNG filter predictors;
            # 'left' is the same channel of the previous pixel (3 bytes back).
            if x > 2:
                left = _get_pixel(basex - 3)
            if y > 0:
                up = _get_pixel(basex - stride)

            if filter_type == 1:  # Sub
                color = (color + left) & 0xff
            elif filter_type == 2:  # Up
                color = (color + up) & 0xff
            elif filter_type == 3:  # Average
                color = (color + ((left + up) >> 1)) & 0xff
            elif filter_type == 4:  # Paeth
                a = left
                b = up
                c = 0

                if x > 2 and y > 0:
                    c = _get_pixel(basex - stride - 3)

                p = a + b - c

                pa = abs(p - a)
                pb = abs(p - b)
                pc = abs(p - c)

                # Pick the neighbour closest to the Paeth prediction.
                if pa <= pb and pa <= pc:
                    color = (color + a) & 0xff
                elif pb <= pc:
                    color = (color + b) & 0xff
                else:
                    color = (color + c) & 0xff

            current_row.append(color)

    return width, height, pixels
def write_xattr(path, key, value):
    """Set extended attribute *key* to *value* (bytes) on the file at *path*.

    Tries, in order: the pyxattr/xattr Python modules, NTFS Alternate Data
    Streams on Windows, and the setfattr/xattr command-line tools.
    Raises XAttrMetadataError on write failure and XAttrUnavailableError
    when no usable implementation is found.
    """
    # This mess below finds the best xattr tool for the job
    try:
        # try the pyxattr module...
        import xattr

        if hasattr(xattr, 'set'):  # pyxattr
            # Unicode arguments are not supported in python-pyxattr until
            # version 0.5.0
            # See https://github.com/rg3/youtube-dl/issues/5498
            pyxattr_required_version = '0.5.0'
            if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
                # TODO: fallback to CLI tools
                raise XAttrUnavailableError(
                    'python-pyxattr is detected but is too old. '
                    'youtube-dl requires %s or above while your version is %s. '
                    'Falling back to other xattr implementations' % (
                        pyxattr_required_version, xattr.__version__))

            setxattr = xattr.set
        else:  # xattr module
            setxattr = xattr.setxattr

        try:
            setxattr(path, key, value)
        except EnvironmentError as e:
            raise XAttrMetadataError(e.errno, e.strerror)

    except ImportError:
        if compat_os_name == 'nt':
            # Write xattrs to NTFS Alternate Data Streams:
            # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
            assert ':' not in key
            assert os.path.exists(path)

            ads_fn = path + ':' + key
            try:
                with open(ads_fn, 'wb') as f:
                    f.write(value)
            except EnvironmentError as e:
                raise XAttrMetadataError(e.errno, e.strerror)
        else:
            user_has_setfattr = check_executable('setfattr', ['--version'])
            user_has_xattr = check_executable('xattr', ['-h'])

            if user_has_setfattr or user_has_xattr:
                # CLI tools take the value as a text argument.
                value = value.decode('utf-8')
                if user_has_setfattr:
                    executable = 'setfattr'
                    opts = ['-n', key, '-v', value]
                elif user_has_xattr:
                    executable = 'xattr'
                    opts = ['-w', key, value]

                cmd = ([encodeFilename(executable, True)] +
                       [encodeArgument(o) for o in opts] +
                       [encodeFilename(path, True)])

                try:
                    p = subprocess.Popen(
                        cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
                except EnvironmentError as e:
                    raise XAttrMetadataError(e.errno, e.strerror)
                stdout, stderr = p.communicate()
                stderr = stderr.decode('utf-8', 'replace')
                if p.returncode != 0:
                    raise XAttrMetadataError(p.returncode, stderr)

            else:
                # On Unix, and can't find pyxattr, setfattr, or xattr.
                if sys.platform.startswith('linux'):
                    raise XAttrUnavailableError(
                        "Couldn't find a tool to set the xattrs. "
                        "Install either the python 'pyxattr' or 'xattr' "
                        "modules, or the GNU 'attr' package "
                        "(which contains the 'setfattr' tool).")
                else:
                    raise XAttrUnavailableError(
                        "Couldn't find a tool to set the xattrs. "
                        "Install either the python 'xattr' module, "
                        "or the 'xattr' binary.")