4 from __future__ import unicode_literals
35 import xml.etree.ElementTree
39 compat_HTMLParseError,
43 compat_etree_fromstring,
46 compat_html_entities_html5,
52 compat_socket_create_connection,
58 compat_urllib_parse_urlencode,
59 compat_urllib_parse_urlparse,
60 compat_urllib_parse_unquote_plus,
61 compat_urllib_request,
def register_socks_protocols():
    """Teach urlparse that SOCKS URLs carry a netloc component.

    Works around https://bugs.python.org/issue7904 (Python < 2.6.5):
    urlsplit() mishandles URLs whose scheme is not listed in
    urlparse.uses_netloc.
    """
    for socks_scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if socks_scheme not in compat_urlparse.uses_netloc:
            compat_urlparse.uses_netloc.append(socks_scheme)
# Type of a compiled regular expression, used for isinstance() checks on
# pattern arguments (old Pythons expose no public name for this type).
# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))
85 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/47.0 (Chrome)',
86 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
87 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
88 'Accept-Encoding': 'gzip, deflate',
89 'Accept-Language': 'en-us,en;q=0.5',
94 'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
# Reference month names for date parsing; index + 1 gives the month number
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']
105 'en': ENGLISH_MONTH_NAMES,
107 'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
108 'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
112 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
113 'flv', 'f4v', 'f4a', 'f4b',
114 'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
115 'mkv', 'mka', 'mk3d',
124 'f4f', 'f4m', 'm3u8', 'smil')
# needed for sanitizing filenames in restricted mode
# Transliteration table mapping accented Latin letters to ASCII; most map to
# a single letter, while ligatures and special letters expand to two
# (AE/OE/TH/ss etc.).
# Fix: thorn (Þ/þ) is conventionally transliterated as 'TH'/'th', not 'P'/'p'.
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH'], ['ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
152 '%Y-%m-%d %H:%M:%S.%f',
155 '%Y-%m-%dT%H:%M:%SZ',
156 '%Y-%m-%dT%H:%M:%S.%fZ',
157 '%Y-%m-%dT%H:%M:%S.%f0Z',
159 '%Y-%m-%dT%H:%M:%S.%f',
162 '%b %d %Y at %H:%M:%S',
165 DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
166 DATE_FORMATS_DAY_FIRST.extend([
175 DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
176 DATE_FORMATS_MONTH_FIRST.extend([
184 PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
187 def preferredencoding():
188 """Get preferred encoding.
190 Returns the best encoding scheme for the system, based on
191 locale.getpreferredencoding() and some further tweaks.
194 pref = locale.getpreferredencoding()
202 def write_json_file(obj, fn):
203 """ Encode obj as JSON and write it to fn, atomically if possible """
205 fn = encodeFilename(fn)
206 if sys.version_info < (3, 0) and sys.platform != 'win32':
207 encoding = get_filesystem_encoding()
208 # os.path.basename returns a bytes object, but NamedTemporaryFile
209 # will fail if the filename contains non ascii characters unless we
210 # use a unicode object
211 path_basename = lambda f: os.path.basename(fn).decode(encoding)
212 # the same for os.path.dirname
213 path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
215 path_basename = os.path.basename
216 path_dirname = os.path.dirname
220 'prefix': path_basename(fn) + '.',
221 'dir': path_dirname(fn),
225 # In Python 2.x, json.dump expects a bytestream.
226 # In Python 3.x, it writes to a character stream
227 if sys.version_info < (3, 0):
235 tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
240 if sys.platform == 'win32':
241 # Need to remove existing file on Windows, else os.rename raises
242 # WindowsError or FileExistsError.
247 os.rename(tf.name, fn)
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val] """
        # key is interpolated directly into the XPath expression below, so
        # restrict it to plain attribute-name characters
        assert re.match(r'^[a-zA-Z_-]+$', key)
        # No val given: match mere presence of the attribute
        expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
        return node.find(expr)
263 def find_xpath_attr(node, xpath, key, val=None):
264 for f in node.findall(compat_xpath(xpath)):
265 if key not in f.attrib:
267 if val is None or f.attrib.get(key) == val:
271 # On python2.6 the xml.etree.ElementTree.Element methods don't support
272 # the namespace parameter
275 def xpath_with_ns(path, ns_map):
276 components = [c.split(':') for c in path.split('/')]
280 replaced.append(c[0])
283 replaced.append('{%s}%s' % (ns_map[ns], tag))
284 return '/'.join(replaced)
287 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
288 def _find_xpath(xpath):
289 return node.find(compat_xpath(xpath))
291 if isinstance(xpath, (str, compat_str)):
292 n = _find_xpath(xpath)
300 if default is not NO_DEFAULT:
303 name = xpath if name is None else name
304 raise ExtractorError('Could not find XML element %s' % name)
310 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
311 n = xpath_element(node, xpath, name, fatal=fatal, default=default)
312 if n is None or n == default:
315 if default is not NO_DEFAULT:
318 name = xpath if name is None else name
319 raise ExtractorError('Could not find XML element\'s text %s' % name)
325 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
326 n = find_xpath_attr(node, xpath, key)
328 if default is not NO_DEFAULT:
331 name = '%s[@%s]' % (xpath, key) if name is None else name
332 raise ExtractorError('Could not find XML attribute %s' % name)
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # Thin wrapper: an id lookup is just an attribute match on 'id'.
    # (The parameter name shadows the builtin `id`, kept for API compatibility.)
    return get_element_by_attribute('id', id, html)
def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    matches = get_elements_by_class(class_name, html)
    if not matches:
        return None
    return matches[0]
def get_element_by_attribute(attribute, value, html, escape_value=True):
    """Return the content of the first tag matching attribute=value, or None."""
    for content in get_elements_by_attribute(attribute, value, html, escape_value):
        return content
    return None
def get_elements_by_class(class_name, html):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    # Match class attributes containing class_name as a whole word anywhere in
    # the (possibly space-separated, multi-valued) attribute value
    class_pattern = r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name)
    return get_elements_by_attribute('class', class_pattern, html, escape_value=False)
361 def get_elements_by_attribute(attribute, value, html, escape_value=True):
362 """Return the content of the tag with the specified attribute in the passed HTML document"""
364 value = re.escape(value) if escape_value else value
367 for m in re.finditer(r'''(?xs)
369 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
371 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
375 ''' % (re.escape(attribute), value), html):
376 res = m.group('content')
378 if res.startswith('"') or res.startswith("'"):
381 retlist.append(unescapeHTML(res))
386 class HTMLAttributeParser(compat_HTMLParser):
387 """Trivial HTML parser to gather the attributes for a single element"""
390 compat_HTMLParser.__init__(self)
392 def handle_starttag(self, tag, attrs):
393 self.attrs = dict(attrs)
396 def extract_attributes(html_element):
397 """Given a string for an HTML element such as
399 a="foo" B="bar" c="&98;az" d=boz
400 empty= noval entity="&"
403 Decode and return a dictionary of attributes.
405 'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
406 'empty': '', 'noval': None, 'entity': '&',
407 'sq': '"', 'dq': '\''
409 NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
410 but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
412 parser = HTMLAttributeParser()
414 parser.feed(html_element)
416 # Older Python may throw HTMLParseError in case of malformed HTML
417 except compat_HTMLParseError:
422 def clean_html(html):
423 """Clean an HTML snippet into a readable string"""
425 if html is None: # Convenience for sanitizing descriptions etc.
429 html = html.replace('\n', ' ')
430 html = re.sub(r'(?u)\s*<\s*br\s*/?\s*>\s*', '\n', html)
431 html = re.sub(r'(?u)<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
433 html = re.sub('<.*?>', '', html)
434 # Replace html entities
435 html = unescapeHTML(html)
439 def sanitize_open(filename, open_mode):
440 """Try to open the given filename, and slightly tweak it if this fails.
442 Attempts to open the given filename. If this fails, it tries to change
443 the filename slightly, step by step, until it's either able to open it
444 or it fails and raises a final exception, like the standard open()
447 It returns the tuple (stream, definitive_file_name).
451 if sys.platform == 'win32':
453 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
454 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
455 stream = open(encodeFilename(filename), open_mode)
456 return (stream, filename)
457 except (IOError, OSError) as err:
458 if err.errno in (errno.EACCES,):
461 # In case of error, try to remove win32 forbidden chars
462 alt_filename = sanitize_path(filename)
463 if alt_filename == filename:
466 # An exception here should be caught in the caller
467 stream = open(encodeFilename(alt_filename), open_mode)
468 return (stream, alt_filename)
471 def timeconvert(timestr):
472 """Convert RFC 2822 defined time string into system timestamp"""
474 timetuple = email.utils.parsedate_tz(timestr)
475 if timetuple is not None:
476 timestamp = email.utils.mktime_tz(timetuple)
480 def sanitize_filename(s, restricted=False, is_id=False):
481 """Sanitizes a string so it could be used as part of a filename.
482 If restricted is set, use a stricter subset of allowed characters.
483 Set is_id if this is not an arbitrary string, but an ID that should be kept
486 def replace_insane(char):
487 if restricted and char in ACCENT_CHARS:
488 return ACCENT_CHARS[char]
489 if char == '?' or ord(char) < 32 or ord(char) == 127:
492 return '' if restricted else '\''
494 return '_-' if restricted else ' -'
495 elif char in '\\/|*<>':
497 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
499 if restricted and ord(char) > 127:
504 s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
505 result = ''.join(map(replace_insane, s))
507 while '__' in result:
508 result = result.replace('__', '_')
509 result = result.strip('_')
510 # Common case of "Foreign band name - English song title"
511 if restricted and result.startswith('-_'):
513 if result.startswith('-'):
514 result = '_' + result[len('-'):]
515 result = result.lstrip('.')
521 def sanitize_path(s):
522 """Sanitizes and normalizes path on Windows"""
523 if sys.platform != 'win32':
525 drive_or_unc, _ = os.path.splitdrive(s)
526 if sys.version_info < (2, 7) and not drive_or_unc:
527 drive_or_unc, _ = os.path.splitunc(s)
528 norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
532 path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
533 for path_part in norm_path]
535 sanitized_path.insert(0, drive_or_unc + os.path.sep)
536 return os.path.join(*sanitized_path)
# Scheme-relative URLs ('//host/path') are given an explicit http: scheme to
# reduce failures caused by a missing protocol
def sanitize_url(url):
    """Return url with 'http:' prepended if it is protocol-relative."""
    if url.startswith('//'):
        return 'http:%s' % url
    return url
def sanitized_Request(url, *args, **kwargs):
    """Build a compat_urllib_request.Request after passing url through sanitize_url()."""
    safe_url = sanitize_url(url)
    return compat_urllib_request.Request(safe_url, *args, **kwargs)
550 """Expand shell variables and ~"""
551 return os.path.expandvars(compat_expanduser(s))
554 def orderedSet(iterable):
555 """ Remove all duplicates from the input iterable """
563 def _htmlentity_transform(entity_with_semicolon):
564 """Transforms an HTML entity to a character."""
565 entity = entity_with_semicolon[:-1]
567 # Known non-numeric HTML entity
568 if entity in compat_html_entities.name2codepoint:
569 return compat_chr(compat_html_entities.name2codepoint[entity])
571 # TODO: HTML5 allows entities without a semicolon. For example,
572 # 'Éric' should be decoded as 'Éric'.
573 if entity_with_semicolon in compat_html_entities_html5:
574 return compat_html_entities_html5[entity_with_semicolon]
576 mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
578 numstr = mobj.group(1)
579 if numstr.startswith('x'):
581 numstr = '0%s' % numstr
584 # See https://github.com/rg3/youtube-dl/issues/7518
586 return compat_chr(int(numstr, base))
590 # Unknown entity in name, return its literal representation
591 return '&%s;' % entity
597 assert type(s) == compat_str
600 r'&([^;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
603 def get_subprocess_encoding():
604 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
605 # For subprocess calls, encode with locale encoding
606 # Refer to http://stackoverflow.com/a/9951851/35070
607 encoding = preferredencoding()
609 encoding = sys.getfilesystemencoding()
615 def encodeFilename(s, for_subprocess=False):
617 @param s The name of the file
620 assert type(s) == compat_str
622 # Python 3 has a Unicode API
623 if sys.version_info >= (3, 0):
626 # Pass '' directly to use Unicode APIs on Windows 2000 and up
627 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
628 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
629 if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
632 # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
633 if sys.platform.startswith('java'):
636 return s.encode(get_subprocess_encoding(), 'ignore')
639 def decodeFilename(b, for_subprocess=False):
641 if sys.version_info >= (3, 0):
644 if not isinstance(b, bytes):
647 return b.decode(get_subprocess_encoding(), 'ignore')
def encodeArgument(s):
    """Encode a command-line argument for subprocess use (see encodeFilename)."""
    if isinstance(s, compat_str):
        arg = s
    else:
        # Legacy callers still pass byte strings; once all post processors are
        # fixed to pass text, this branch should become an assertion instead.
        arg = s.decode('ascii')
    return encodeFilename(arg, True)
def decodeArgument(b):
    # Inverse of encodeArgument: decode a subprocess argument using the
    # subprocess encoding (for_subprocess=True)
    return decodeFilename(b, True)
663 def decodeOption(optval):
666 if isinstance(optval, bytes):
667 optval = optval.decode(preferredencoding())
669 assert isinstance(optval, compat_str)
673 def formatSeconds(secs):
675 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
677 return '%d:%02d' % (secs // 60, secs % 60)
682 def make_HTTPS_handler(params, **kwargs):
683 opts_no_check_certificate = params.get('nocheckcertificate', False)
684 if hasattr(ssl, 'create_default_context'): # Python >= 3.4 or 2.7.9
685 context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
686 if opts_no_check_certificate:
687 context.check_hostname = False
688 context.verify_mode = ssl.CERT_NONE
690 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
693 # (create_default_context present but HTTPSHandler has no context=)
696 if sys.version_info < (3, 2):
697 return YoutubeDLHTTPSHandler(params, **kwargs)
699 context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
700 context.verify_mode = (ssl.CERT_NONE
701 if opts_no_check_certificate
702 else ssl.CERT_REQUIRED)
703 context.set_default_verify_paths()
704 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
707 def bug_reports_message():
708 if ytdl_is_updateable():
709 update_cmd = 'type youtube-dl -U to update'
711 update_cmd = 'see https://yt-dl.org/update on how to update'
712 msg = '; please report this issue on https://yt-dl.org/bug .'
713 msg += ' Make sure you are using the latest version; %s.' % update_cmd
714 msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
# Root of the youtube-dl exception hierarchy; the error classes below derive
# from it so callers can catch all youtube-dl errors uniformly.
class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
723 class ExtractorError(YoutubeDLError):
724 """Error during info extraction."""
726 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
727 """ tb, if given, is the original traceback (so that it can be printed out).
728 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
731 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
733 if video_id is not None:
734 msg = video_id + ': ' + msg
736 msg += ' (caused by %r)' % cause
738 msg += bug_reports_message()
739 super(ExtractorError, self).__init__(msg)
742 self.exc_info = sys.exc_info() # preserve original exception
744 self.video_id = video_id
746 def format_traceback(self):
747 if self.traceback is None:
749 return ''.join(traceback.format_tb(self.traceback))
class UnsupportedError(ExtractorError):
    def __init__(self, url):
        # expected=True: not a bug, just a URL no extractor can handle
        message = 'Unsupported URL: %s' % url
        super(UnsupportedError, self).__init__(message, expected=True)
759 class RegexNotFoundError(ExtractorError):
760 """Error when a regex didn't match"""
764 class GeoRestrictedError(ExtractorError):
765 """Geographic restriction Error exception.
767 This exception may be thrown when a video is not available from your
768 geographic location due to geographic restrictions imposed by a website.
770 def __init__(self, msg, countries=None):
771 super(GeoRestrictedError, self).__init__(msg, expected=True)
773 self.countries = countries
776 class DownloadError(YoutubeDLError):
777 """Download Error exception.
779 This exception may be thrown by FileDownloader objects if they are not
780 configured to continue on errors. They will contain the appropriate
784 def __init__(self, msg, exc_info=None):
785 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
786 super(DownloadError, self).__init__(msg)
787 self.exc_info = exc_info
790 class SameFileError(YoutubeDLError):
791 """Same File exception.
793 This exception will be thrown by FileDownloader objects if they detect
794 multiple files would have to be downloaded to the same file on disk.
799 class PostProcessingError(YoutubeDLError):
800 """Post Processing exception.
802 This exception may be raised by PostProcessor's .run() method to
803 indicate an error in the postprocessing task.
806 def __init__(self, msg):
807 super(PostProcessingError, self).__init__(msg)
811 class MaxDownloadsReached(YoutubeDLError):
812 """ --max-downloads limit has been reached. """
816 class UnavailableVideoError(YoutubeDLError):
817 """Unavailable Format exception.
819 This exception will be thrown when a video is requested
820 in a format that is not available for that video.
825 class ContentTooShortError(YoutubeDLError):
826 """Content Too Short exception.
828 This exception may be raised by FileDownloader objects when a file they
829 download is too small for what the server announced first, indicating
830 the connection was probably interrupted.
833 def __init__(self, downloaded, expected):
834 super(ContentTooShortError, self).__init__(
835 'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected)
838 self.downloaded = downloaded
839 self.expected = expected
842 class XAttrMetadataError(YoutubeDLError):
843 def __init__(self, code=None, msg='Unknown error'):
844 super(XAttrMetadataError, self).__init__(msg)
848 # Parsing code and msg
849 if (self.code in (errno.ENOSPC, errno.EDQUOT) or
850 'No space left' in self.msg or 'Disk quota excedded' in self.msg):
851 self.reason = 'NO_SPACE'
852 elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
853 self.reason = 'VALUE_TOO_LONG'
855 self.reason = 'NOT_SUPPORTED'
858 class XAttrUnavailableError(YoutubeDLError):
862 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
863 # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
864 # expected HTTP responses to meet HTTP/1.0 or later (see also
865 # https://github.com/rg3/youtube-dl/issues/6727)
866 if sys.version_info < (3, 0):
867 kwargs[b'strict'] = True
868 hc = http_class(*args, **kwargs)
869 source_address = ydl_handler._params.get('source_address')
870 if source_address is not None:
871 sa = (source_address, 0)
872 if hasattr(hc, 'source_address'): # Python 2.7+
873 hc.source_address = sa
875 def _hc_connect(self, *args, **kwargs):
876 sock = compat_socket_create_connection(
877 (self.host, self.port), self.timeout, sa)
879 self.sock = ssl.wrap_socket(
880 sock, self.key_file, self.cert_file,
881 ssl_version=ssl.PROTOCOL_TLSv1)
884 hc.connect = functools.partial(_hc_connect, hc)
def handle_youtubedl_headers(headers):
    """Translate internal 'Youtubedl-*' pseudo-headers into real header changes.

    'Youtubedl-no-compression' is removed and, when present, also strips any
    Accept-Encoding header so the server sends an uncompressed response.
    Returns the original mapping unchanged when no pseudo-header is present.
    """
    if 'Youtubedl-no-compression' not in headers:
        return headers
    filtered = dict((name, value) for name, value in headers.items()
                    if name.lower() != 'accept-encoding')
    del filtered['Youtubedl-no-compression']
    return filtered
899 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
900 """Handler for HTTP requests and responses.
902 This class, when installed with an OpenerDirector, automatically adds
903 the standard headers to every HTTP request and handles gzipped and
904 deflated responses from web servers. If compression is to be avoided in
905 a particular request, the original request in the program code only has
906 to include the HTTP header "Youtubedl-no-compression", which will be
907 removed before making the real request.
909 Part of this code was copied from:
911 http://techknack.net/python-urllib2-handlers/
913 Andrew Rowls, the author of that code, agreed to release it to the
    def __init__(self, params, *args, **kwargs):
        # params: the YoutubeDL options dict, kept so the connection factory
        # can honor options such as 'source_address'
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params
921 def http_open(self, req):
922 conn_class = compat_http_client.HTTPConnection
924 socks_proxy = req.headers.get('Ytdl-socks-proxy')
926 conn_class = make_socks_conn_class(conn_class, socks_proxy)
927 del req.headers['Ytdl-socks-proxy']
929 return self.do_open(functools.partial(
930 _create_http_connection, self, conn_class, False),
936 return zlib.decompress(data, -zlib.MAX_WBITS)
938 return zlib.decompress(data)
940 def http_request(self, req):
941 # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
942 # always respected by websites, some tend to give out URLs with non percent-encoded
943 # non-ASCII characters (see telemb.py, ard.py [#3412])
944 # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
945 # To work around aforementioned issue we will replace request's original URL with
946 # percent-encoded one
947 # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
948 # the code of this workaround has been moved here from YoutubeDL.urlopen()
949 url = req.get_full_url()
950 url_escaped = escape_url(url)
952 # Substitute URL if any change after escaping
953 if url != url_escaped:
954 req = update_Request(req, url=url_escaped)
956 for h, v in std_headers.items():
957 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
958 # The dict keys are capitalized because of this bug by urllib
959 if h.capitalize() not in req.headers:
962 req.headers = handle_youtubedl_headers(req.headers)
964 if sys.version_info < (2, 7) and '#' in req.get_full_url():
965 # Python 2.6 is brain-dead when it comes to fragments
966 req._Request__original = req._Request__original.partition('#')[0]
967 req._Request__r_type = req._Request__r_type.partition('#')[0]
971 def http_response(self, req, resp):
974 if resp.headers.get('Content-encoding', '') == 'gzip':
975 content = resp.read()
976 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
978 uncompressed = io.BytesIO(gz.read())
979 except IOError as original_ioerror:
980 # There may be junk add the end of the file
981 # See http://stackoverflow.com/q/4928560/35070 for details
982 for i in range(1, 1024):
984 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
985 uncompressed = io.BytesIO(gz.read())
990 raise original_ioerror
991 resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
992 resp.msg = old_resp.msg
993 del resp.headers['Content-encoding']
995 if resp.headers.get('Content-encoding', '') == 'deflate':
996 gz = io.BytesIO(self.deflate(resp.read()))
997 resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
998 resp.msg = old_resp.msg
999 del resp.headers['Content-encoding']
1000 # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
1001 # https://github.com/rg3/youtube-dl/issues/6457).
1002 if 300 <= resp.code < 400:
1003 location = resp.headers.get('Location')
1005 # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
1006 if sys.version_info >= (3, 0):
1007 location = location.encode('iso-8859-1').decode('utf-8')
1009 location = location.decode('utf-8')
1010 location_escaped = escape_url(location)
1011 if location != location_escaped:
1012 del resp.headers['Location']
1013 if sys.version_info < (3, 0):
1014 location_escaped = location_escaped.encode('utf-8')
1015 resp.headers['Location'] = location_escaped
1018 https_request = http_request
1019 https_response = http_response
1022 def make_socks_conn_class(base_class, socks_proxy):
1023 assert issubclass(base_class, (
1024 compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))
1026 url_components = compat_urlparse.urlparse(socks_proxy)
1027 if url_components.scheme.lower() == 'socks5':
1028 socks_type = ProxyType.SOCKS5
1029 elif url_components.scheme.lower() in ('socks', 'socks4'):
1030 socks_type = ProxyType.SOCKS4
1031 elif url_components.scheme.lower() == 'socks4a':
1032 socks_type = ProxyType.SOCKS4A
1034 def unquote_if_non_empty(s):
1037 return compat_urllib_parse_unquote_plus(s)
1041 url_components.hostname, url_components.port or 1080,
1043 unquote_if_non_empty(url_components.username),
1044 unquote_if_non_empty(url_components.password),
1047 class SocksConnection(base_class):
1049 self.sock = sockssocket()
1050 self.sock.setproxy(*proxy_args)
1051 if type(self.timeout) in (int, float):
1052 self.sock.settimeout(self.timeout)
1053 self.sock.connect((self.host, self.port))
1055 if isinstance(self, compat_http_client.HTTPSConnection):
1056 if hasattr(self, '_context'): # Python > 2.6
1057 self.sock = self._context.wrap_socket(
1058 self.sock, server_hostname=self.host)
1060 self.sock = ssl.wrap_socket(self.sock)
1062 return SocksConnection
1065 class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        # params: the YoutubeDL options dict; https_conn_class allows callers
        # to substitute the HTTPS connection class (defaults to the compat
        # HTTPSConnection)
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params
1071 def https_open(self, req):
1073 conn_class = self._https_conn_class
1075 if hasattr(self, '_context'): # python > 2.6
1076 kwargs['context'] = self._context
1077 if hasattr(self, '_check_hostname'): # python 3.x
1078 kwargs['check_hostname'] = self._check_hostname
1080 socks_proxy = req.headers.get('Ytdl-socks-proxy')
1082 conn_class = make_socks_conn_class(conn_class, socks_proxy)
1083 del req.headers['Ytdl-socks-proxy']
1085 return self.do_open(functools.partial(
1086 _create_http_connection, self, conn_class, True),
1090 class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    def __init__(self, cookiejar=None):
        # cookiejar may be None; urllib's HTTPCookieProcessor then creates a
        # fresh CookieJar of its own
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)
1094 def http_response(self, request, response):
1095 # Python 2 will choke on next HTTP request in row if there are non-ASCII
1096 # characters in Set-Cookie HTTP header of last response (see
1097 # https://github.com/rg3/youtube-dl/issues/6769).
1098 # In order to at least prevent crashing we will percent encode Set-Cookie
1099 # header before HTTPCookieProcessor starts processing it.
1100 # if sys.version_info < (3, 0) and response.headers:
1101 # for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
1102 # set_cookie = response.headers.get(set_cookie_header)
1104 # set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
1105 # if set_cookie != set_cookie_escaped:
1106 # del response.headers[set_cookie_header]
1107 # response.headers[set_cookie_header] = set_cookie_escaped
1108 return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)
1110 https_request = compat_urllib_request.HTTPCookieProcessor.http_request
1111 https_response = http_response
1114 def extract_timezone(date_str):
1116 r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
1119 timezone = datetime.timedelta()
1121 date_str = date_str[:-len(m.group('tz'))]
1122 if not m.group('sign'):
1123 timezone = datetime.timedelta()
1125 sign = 1 if m.group('sign') == '+' else -1
1126 timezone = datetime.timedelta(
1127 hours=sign * int(m.group('hours')),
1128 minutes=sign * int(m.group('minutes')))
1129 return timezone, date_str
1132 def parse_iso8601(date_str, delimiter='T', timezone=None):
1133 """ Return a UNIX timestamp from the given date """
1135 if date_str is None:
1138 date_str = re.sub(r'\.[0-9]+', '', date_str)
1140 if timezone is None:
1141 timezone, date_str = extract_timezone(date_str)
1144 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
1145 dt = datetime.datetime.strptime(date_str, date_format) - timezone
1146 return calendar.timegm(dt.timetuple())
def date_formats(day_first=True):
    """Return the strptime format list to try, ordered for day-first or
    month-first date conventions."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1155 def unified_strdate(date_str, day_first=True):
1156 """Return a string with the date in the format YYYYMMDD"""
1158 if date_str is None:
1162 date_str = date_str.replace(',', ' ')
1163 # Remove AM/PM + timezone
1164 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1165 _, date_str = extract_timezone(date_str)
1167 for expression in date_formats(day_first):
1169 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
1172 if upload_date is None:
1173 timetuple = email.utils.parsedate_tz(date_str)
1176 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
1179 if upload_date is not None:
1180 return compat_str(upload_date)
1183 def unified_timestamp(date_str, day_first=True):
1184 if date_str is None:
1187 date_str = re.sub(r'[,|]', '', date_str)
1189 pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
1190 timezone, date_str = extract_timezone(date_str)
1192 # Remove AM/PM + timezone
1193 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1195 # Remove unrecognized timezones from ISO 8601 alike timestamps
1196 m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1198 date_str = date_str[:-len(m.group('tz'))]
1200 for expression in date_formats(day_first):
1202 dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
1203 return calendar.timegm(dt.timetuple())
1206 timetuple = email.utils.parsedate_tz(date_str)
1208 return calendar.timegm(timetuple) + pm_delta * 3600
1211 def determine_ext(url, default_ext='unknown_video'):
1214 guess = url.partition('?')[0].rpartition('.')[2]
1215 if re.match(r'^[A-Za-z0-9]+$', guess):
1217 # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
1218 elif guess.rstrip('/') in KNOWN_EXTENSIONS:
1219 return guess.rstrip('/')
def subtitles_filename(filename, sub_lang, sub_format):
    """Derive the subtitle file name: strip the media extension and append
    '<lang>.<format>'."""
    base = filename.rsplit('.', 1)[0]
    return '%s.%s.%s' % (base, sub_lang, sub_format)
def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        amount = int(match.group('time'))
        if match.group('sign') == '-':
            amount = -amount
        unit = match.group('unit')
        # Months and years are approximated by fixed-length day spans
        if unit == 'month':
            unit = 'day'
            amount *= 30
        elif unit == 'year':
            unit = 'day'
            amount *= 365
        return today + datetime.timedelta(**{unit + 's': amount})
    return datetime.datetime.strptime(date_str, '%Y%m%d').date()
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    m = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if m is None:
        # Anything not in YYYYMMDD form is passed through untouched
        return date_str
    return '-'.join(m.groups())
class DateRange(object):
    """Represents a time interval between two dates (both ends inclusive)."""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        self.start = date_from_str(start) if start is not None else datetime.datetime.min.date()
        self.end = date_from_str(end) if end is not None else datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
1298 def platform_name():
1299 """ Returns the platform name as a compat_str """
1300 res = platform.platform()
1301 if isinstance(res, bytes):
1302 res = res.decode(preferredencoding())
1304 assert isinstance(res, compat_str)
1308 def _windows_write_string(s, out):
1309 """ Returns True if the string was written using special methods,
1310 False if it has yet to be written out."""
1311 # Adapted from http://stackoverflow.com/a/3259271/35070
1314 import ctypes.wintypes
1322 fileno = out.fileno()
1323 except AttributeError:
1324 # If the output stream doesn't have a fileno, it's virtual
1326 except io.UnsupportedOperation:
1327 # Some strange Windows pseudo files?
1329 if fileno not in WIN_OUTPUT_IDS:
1332 GetStdHandle = ctypes.WINFUNCTYPE(
1333 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
1334 (b'GetStdHandle', ctypes.windll.kernel32))
1335 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
1337 WriteConsoleW = ctypes.WINFUNCTYPE(
1338 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
1339 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
1340 ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
1341 written = ctypes.wintypes.DWORD(0)
1343 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
1344 FILE_TYPE_CHAR = 0x0002
1345 FILE_TYPE_REMOTE = 0x8000
1346 GetConsoleMode = ctypes.WINFUNCTYPE(
1347 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
1348 ctypes.POINTER(ctypes.wintypes.DWORD))(
1349 (b'GetConsoleMode', ctypes.windll.kernel32))
1350 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
1352 def not_a_console(handle):
1353 if handle == INVALID_HANDLE_VALUE or handle is None:
1355 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
1356 GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
1358 if not_a_console(h):
1361 def next_nonbmp_pos(s):
1363 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
1364 except StopIteration:
1368 count = min(next_nonbmp_pos(s), 1024)
1370 ret = WriteConsoleW(
1371 h, s, count if count else 2, ctypes.byref(written), None)
1373 raise OSError('Failed to write string')
1374 if not count: # We just wrote a non-BMP character
1375 assert written.value == 2
1378 assert written.value > 0
1379 s = s[written.value:]
1383 def write_string(s, out=None, encoding=None):
1386 assert type(s) == compat_str
1388 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
1389 if _windows_write_string(s, out):
1392 if ('b' in getattr(out, 'mode', '') or
1393 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
1394 byt = s.encode(encoding or preferredencoding(), 'ignore')
1396 elif hasattr(out, 'buffer'):
1397 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
1398 byt = s.encode(enc, 'ignore')
1399 out.buffer.write(byt)
def bytes_to_intlist(bs):
    """Convert a byte string into a list of integer byte values."""
    if not bs:
        return []
    if isinstance(bs[0], int):  # Python 3
        return list(bs)
    # Python 2: indexing a str yields 1-char strings
    return [ord(c) for c in bs]
1414 def intlist_to_bytes(xs):
1417 return compat_struct_pack('%dB' % len(xs), *xs)
1420 # Cross-platform file locking
1421 if sys.platform == 'win32':
1422 import ctypes.wintypes
1425 class OVERLAPPED(ctypes.Structure):
1427 ('Internal', ctypes.wintypes.LPVOID),
1428 ('InternalHigh', ctypes.wintypes.LPVOID),
1429 ('Offset', ctypes.wintypes.DWORD),
1430 ('OffsetHigh', ctypes.wintypes.DWORD),
1431 ('hEvent', ctypes.wintypes.HANDLE),
1434 kernel32 = ctypes.windll.kernel32
1435 LockFileEx = kernel32.LockFileEx
1436 LockFileEx.argtypes = [
1437 ctypes.wintypes.HANDLE, # hFile
1438 ctypes.wintypes.DWORD, # dwFlags
1439 ctypes.wintypes.DWORD, # dwReserved
1440 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1441 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1442 ctypes.POINTER(OVERLAPPED) # Overlapped
1444 LockFileEx.restype = ctypes.wintypes.BOOL
1445 UnlockFileEx = kernel32.UnlockFileEx
1446 UnlockFileEx.argtypes = [
1447 ctypes.wintypes.HANDLE, # hFile
1448 ctypes.wintypes.DWORD, # dwReserved
1449 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1450 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1451 ctypes.POINTER(OVERLAPPED) # Overlapped
1453 UnlockFileEx.restype = ctypes.wintypes.BOOL
1454 whole_low = 0xffffffff
1455 whole_high = 0x7fffffff
1457 def _lock_file(f, exclusive):
1458 overlapped = OVERLAPPED()
1459 overlapped.Offset = 0
1460 overlapped.OffsetHigh = 0
1461 overlapped.hEvent = 0
1462 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
1463 handle = msvcrt.get_osfhandle(f.fileno())
1464 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
1465 whole_low, whole_high, f._lock_file_overlapped_p):
1466 raise OSError('Locking file failed: %r' % ctypes.FormatError())
1468 def _unlock_file(f):
1469 assert f._lock_file_overlapped_p
1470 handle = msvcrt.get_osfhandle(f.fileno())
1471 if not UnlockFileEx(handle, 0,
1472 whole_low, whole_high, f._lock_file_overlapped_p):
1473 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1476 # Some platforms, such as Jython, is missing fcntl
1480 def _lock_file(f, exclusive):
1481 fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
1483 def _unlock_file(f):
1484 fcntl.flock(f, fcntl.LOCK_UN)
1486 UNSUPPORTED_MSG = 'file locking is not supported on this platform'
1488 def _lock_file(f, exclusive):
1489 raise IOError(UNSUPPORTED_MSG)
1491 def _unlock_file(f):
1492 raise IOError(UNSUPPORTED_MSG)
1495 class locked_file(object):
1496 def __init__(self, filename, mode, encoding=None):
1497 assert mode in ['r', 'a', 'w']
1498 self.f = io.open(filename, mode, encoding=encoding)
1501 def __enter__(self):
1502 exclusive = self.mode != 'r'
1504 _lock_file(self.f, exclusive)
1510 def __exit__(self, etype, value, traceback):
1512 _unlock_file(self.f)
1519 def write(self, *args):
1520 return self.f.write(*args)
1522 def read(self, *args):
1523 return self.f.read(*args)
def get_filesystem_encoding():
    """Return the filesystem encoding, defaulting to 'utf-8' when unknown."""
    enc = sys.getfilesystemencoding()
    return 'utf-8' if enc is None else enc
1531 def shell_quote(args):
1533 encoding = get_filesystem_encoding()
1535 if isinstance(a, bytes):
1536 # We may get a filename encoded with 'encodeFilename'
1537 a = a.decode(encoding)
1538 quoted_args.append(pipes.quote(a))
1539 return ' '.join(quoted_args)
1542 def smuggle_url(url, data):
1543 """ Pass additional data in a URL for internal use. """
1545 url, idata = unsmuggle_url(url, {})
1547 sdata = compat_urllib_parse_urlencode(
1548 {'__youtubedl_smuggle': json.dumps(data)})
1549 return url + '#' + sdata
1552 def unsmuggle_url(smug_url, default=None):
1553 if '#__youtubedl_smuggle' not in smug_url:
1554 return smug_url, default
1555 url, _, sdata = smug_url.rpartition('#')
1556 jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
1557 data = json.loads(jsond)
def format_bytes(bytes):
    """Render a byte count as a human-readable string, e.g. '1.50KiB'."""
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    # log(0) is undefined, so zero maps straight to the 'B' bucket
    exponent = 0 if bytes == 0.0 else int(math.log(bytes, 1024.0))
    suffix = ('B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB')[exponent]
    return '%.2f%s' % (float(bytes) / float(1024 ** exponent), suffix)
def lookup_unit_table(unit_table, s):
    """Parse a '<number> <unit>' string via unit_table; None when unparsable."""
    pattern = r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % '|'.join(
        re.escape(u) for u in unit_table)
    m = re.match(pattern, s)
    if not m:
        return None
    # A decimal comma is accepted as a decimal point
    value = float(m.group('num').replace(',', '.'))
    return int(value * unit_table[m.group('unit')])
1586 def parse_filesize(s):
1590 # The lower-case forms are of course incorrect and unofficial,
1591 # but we support those too
1608 'megabytes': 1000 ** 2,
1609 'mebibytes': 1024 ** 2,
1615 'gigabytes': 1000 ** 3,
1616 'gibibytes': 1024 ** 3,
1622 'terabytes': 1000 ** 4,
1623 'tebibytes': 1024 ** 4,
1629 'petabytes': 1000 ** 5,
1630 'pebibytes': 1024 ** 5,
1636 'exabytes': 1000 ** 6,
1637 'exbibytes': 1024 ** 6,
1643 'zettabytes': 1000 ** 7,
1644 'zebibytes': 1024 ** 7,
1650 'yottabytes': 1000 ** 8,
1651 'yobibytes': 1024 ** 8,
1654 return lookup_unit_table(_UNIT_TABLE, s)
1663 if re.match(r'^[\d,.]+$', s):
1664 return str_to_int(s)
1675 return lookup_unit_table(_UNIT_TABLE, s)
1678 def month_by_name(name, lang='en'):
1679 """ Return the number of a month by (locale-independently) English name """
1681 month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
1684 return month_names.index(name) + 1
1689 def month_by_abbreviation(abbrev):
1690 """ Return the number of a month by (locale-independently) English
1694 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
def fix_xml_ampersands(xml_str):
    """Escape bare '&' characters that are not already part of an XML entity."""
    # Negative lookahead leaves existing named/numeric entities untouched
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
        '&amp;',
        xml_str)
1707 def setproctitle(title):
1708 assert isinstance(title, compat_str)
1710 # ctypes in Jython is not complete
1711 # http://bugs.jython.org/issue2148
1712 if sys.platform.startswith('java'):
1716 libc = ctypes.cdll.LoadLibrary('libc.so.6')
1720 # LoadLibrary in Windows Python 2.7.13 only expects
1721 # a bytestring, but since unicode_literals turns
1722 # every string into a unicode string, it fails.
1724 title_bytes = title.encode('utf-8')
1725 buf = ctypes.create_string_buffer(len(title_bytes))
1726 buf.value = title_bytes
1728 libc.prctl(15, buf, 0, 0, 0)
1729 except AttributeError:
1730 return # Strange libc, just skip this
def remove_start(s, start):
    """Strip the prefix `start` from s when present; None passes through."""
    if s is not None and s.startswith(start):
        return s[len(start):]
    return s
def remove_end(s, end):
    """Strip the suffix `end` from s when present; None passes through."""
    if s is not None and s.endswith(end):
        return s[:-len(end)]
    return s
def remove_quotes(s):
    """Drop one matching pair of surrounding quotes (single or double)."""
    if s is None or len(s) < 2:
        return s
    if s[0] == s[-1] and s[0] in ('"', "'"):
        return s[1:-1]
    return s
1750 def url_basename(url):
1751 path = compat_urlparse.urlparse(url).path
1752 return path.strip('/').split('/')[-1]
1756 return re.match(r'https?://[^?#&]+/', url).group()
1759 def urljoin(base, path):
1760 if isinstance(path, bytes):
1761 path = path.decode('utf-8')
1762 if not isinstance(path, compat_str) or not path:
1764 if re.match(r'^(?:https?:)?//', path):
1766 if isinstance(base, bytes):
1767 base = base.decode('utf-8')
1768 if not isinstance(base, compat_str) or not re.match(
1769 r'^(?:https?:)?//', base):
1771 return compat_urlparse.urljoin(base, path)
1774 class HEADRequest(compat_urllib_request.Request):
1775 def get_method(self):
1779 class PUTRequest(compat_urllib_request.Request):
1780 def get_method(self):
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Coerce v to an int (optionally read via attribute get_attr first),
    scaled by invscale // scale; return default when conversion fails."""
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v == '' or v is None:
        return default
    try:
        return int(v) * invscale // scale
    except (ValueError, TypeError):
        return default
def str_or_none(v, default=None):
    """Stringify v; pass `default` through when v is None."""
    if v is None:
        return default
    return compat_str(v)
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if int_str is None:
        return None
    # Drop thousands separators and stray '+' signs before parsing
    return int(re.sub(r'[,\.\+]', '', int_str))
def float_or_none(v, scale=1, invscale=1, default=None):
    """Coerce v to a float scaled by invscale / scale; default on failure."""
    if v is None:
        return default
    try:
        # One try covers both the conversion and the arithmetic
        return float(v) * invscale / scale
    except (ValueError, TypeError):
        return default
def strip_or_none(v):
    """Return v.strip(), passing None through untouched."""
    if v is None:
        return None
    return v.strip()
1823 def parse_duration(s):
1824 if not isinstance(s, compat_basestring):
1829 days, hours, mins, secs, ms = [None] * 5
1830 m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?Z?$', s)
1832 days, hours, mins, secs, ms = m.groups()
1837 (?P<days>[0-9]+)\s*d(?:ays?)?\s*
1840 (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
1843 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
1846 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
1849 days, hours, mins, secs, ms = m.groups()
1851 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
1853 hours, mins = m.groups()
1859 duration += float(secs)
1861 duration += float(mins) * 60
1863 duration += float(hours) * 60 * 60
1865 duration += float(days) * 24 * 60 * 60
1867 duration += float(ms)
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert ext before the real extension; append it instead when the
    real extension does not match expected_real_ext."""
    name, real_ext = os.path.splitext(filename)
    if not expected_real_ext or real_ext[1:] == expected_real_ext:
        return '{0}.{1}{2}'.format(name, ext, real_ext)
    return '{0}.{1}'.format(filename, ext)
def replace_extension(filename, ext, expected_real_ext=None):
    """Swap the file extension for ext; keep the old one in place when it
    does not match expected_real_ext."""
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        name = filename
    return '{0}.{1}'.format(name, ext)
def check_executable(exe, args=None):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    # Previously the default was a shared mutable list (args=[]) — the classic
    # mutable-default-argument pitfall; None keeps the interface compatible.
    if args is None:
        args = []
    try:
        # Pipes are drained by communicate() so the child cannot block
        subprocess.Popen([exe] + list(args), stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    except OSError:
        # The binary was not found (or is not executable)
        return False
    return exe
1896 def get_exe_version(exe, args=['--version'],
1897 version_re=None, unrecognized='present'):
1898 """ Returns the version of the specified executable,
1899 or False if the executable is not present """
1901 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
1902 # SIGTTOU if youtube-dl is run in the background.
1903 # See https://github.com/rg3/youtube-dl/issues/955#issuecomment-209789656
1904 out, _ = subprocess.Popen(
1905 [encodeArgument(exe)] + args,
1906 stdin=subprocess.PIPE,
1907 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
1910 if isinstance(out, bytes): # Python 2.x
1911 out = out.decode('ascii', 'ignore')
1912 return detect_exe_version(out, version_re, unrecognized)
1915 def detect_exe_version(output, version_re=None, unrecognized='present'):
1916 assert isinstance(output, compat_str)
1917 if version_re is None:
1918 version_re = r'version\s+([-0-9._a-zA-Z]+)'
1919 m = re.search(version_re, output)
1926 class PagedList(object):
1928 # This is only useful for tests
1929 return len(self.getslice())
1932 class OnDemandPagedList(PagedList):
1933 def __init__(self, pagefunc, pagesize, use_cache=False):
1934 self._pagefunc = pagefunc
1935 self._pagesize = pagesize
1936 self._use_cache = use_cache
1940 def getslice(self, start=0, end=None):
1942 for pagenum in itertools.count(start // self._pagesize):
1943 firstid = pagenum * self._pagesize
1944 nextfirstid = pagenum * self._pagesize + self._pagesize
1945 if start >= nextfirstid:
1950 page_results = self._cache.get(pagenum)
1951 if page_results is None:
1952 page_results = list(self._pagefunc(pagenum))
1954 self._cache[pagenum] = page_results
1957 start % self._pagesize
1958 if firstid <= start < nextfirstid
1962 ((end - 1) % self._pagesize) + 1
1963 if (end is not None and firstid <= end <= nextfirstid)
1966 if startv != 0 or endv is not None:
1967 page_results = page_results[startv:endv]
1968 res.extend(page_results)
1970 # A little optimization - if current page is not "full", ie. does
1971 # not contain page_size videos then we can assume that this page
1972 # is the last one - there are no more ids on further pages -
1973 # i.e. no need to query again.
1974 if len(page_results) + startv < self._pagesize:
1977 # If we got the whole page, but the next page is not interesting,
1978 # break out early as well
1979 if end == nextfirstid:
1984 class InAdvancePagedList(PagedList):
1985 def __init__(self, pagefunc, pagecount, pagesize):
1986 self._pagefunc = pagefunc
1987 self._pagecount = pagecount
1988 self._pagesize = pagesize
1990 def getslice(self, start=0, end=None):
1992 start_page = start // self._pagesize
1994 self._pagecount if end is None else (end // self._pagesize + 1))
1995 skip_elems = start - start_page * self._pagesize
1996 only_more = None if end is None else end - start
1997 for pagenum in range(start_page, end_page):
1998 page = list(self._pagefunc(pagenum))
2000 page = page[skip_elems:]
2002 if only_more is not None:
2003 if len(page) < only_more:
2004 only_more -= len(page)
2006 page = page[:only_more]
def uppercase_escape(s):
    """Decode literal \\UXXXXXXXX escape sequences embedded in s."""
    decode = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda m: decode(m.group(0))[0],
        s)
def lowercase_escape(s):
    """Decode literal \\uXXXX escape sequences embedded in s."""
    decode = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\u[0-9a-fA-F]{4}',
        lambda m: decode(m.group(0))[0],
        s)
2029 def escape_rfc3986(s):
2030 """Escape non-ASCII characters as suggested by RFC 3986"""
2031 if sys.version_info < (3, 0) and isinstance(s, compat_str):
2032 s = s.encode('utf-8')
2033 return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
2036 def escape_url(url):
2037 """Escape URL as suggested by RFC 3986"""
2038 url_parsed = compat_urllib_parse_urlparse(url)
2039 return url_parsed._replace(
2040 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
2041 path=escape_rfc3986(url_parsed.path),
2042 params=escape_rfc3986(url_parsed.params),
2043 query=escape_rfc3986(url_parsed.query),
2044 fragment=escape_rfc3986(url_parsed.fragment)
2048 def read_batch_urls(batch_fd):
2050 if not isinstance(url, compat_str):
2051 url = url.decode('utf-8', 'replace')
2052 BOM_UTF8 = '\xef\xbb\xbf'
2053 if url.startswith(BOM_UTF8):
2054 url = url[len(BOM_UTF8):]
2056 if url.startswith(('#', ';', ']')):
2060 with contextlib.closing(batch_fd) as fd:
2061 return [url for url in map(fixup, fd) if url]
2064 def urlencode_postdata(*args, **kargs):
2065 return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii')
2068 def update_url_query(url, query):
2071 parsed_url = compat_urlparse.urlparse(url)
2072 qs = compat_parse_qs(parsed_url.query)
2074 return compat_urlparse.urlunparse(parsed_url._replace(
2075 query=compat_urllib_parse_urlencode(qs, True)))
2078 def update_Request(req, url=None, data=None, headers={}, query={}):
2079 req_headers = req.headers.copy()
2080 req_headers.update(headers)
2081 req_data = data or req.data
2082 req_url = update_url_query(url or req.get_full_url(), query)
2083 req_get_method = req.get_method()
2084 if req_get_method == 'HEAD':
2085 req_type = HEADRequest
2086 elif req_get_method == 'PUT':
2087 req_type = PUTRequest
2089 req_type = compat_urllib_request.Request
2091 req_url, data=req_data, headers=req_headers,
2092 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
2093 if hasattr(req, 'timeout'):
2094 new_req.timeout = req.timeout
2098 def _multipart_encode_impl(data, boundary):
2099 content_type = 'multipart/form-data; boundary=%s' % boundary
2102 for k, v in data.items():
2103 out += b'--' + boundary.encode('ascii') + b'\r\n'
2104 if isinstance(k, compat_str):
2105 k = k.encode('utf-8')
2106 if isinstance(v, compat_str):
2107 v = v.encode('utf-8')
2108 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
2109 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
2110 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
2111 if boundary.encode('ascii') in content:
2112 raise ValueError('Boundary overlaps with data')
2115 out += b'--' + boundary.encode('ascii') + b'--\r\n'
2117 return out, content_type
2120 def multipart_encode(data, boundary=None):
2122 Encode a dict to RFC 7578-compliant form-data
2125 A dict where keys and values can be either Unicode or bytes-like
2128 If specified a Unicode object, it's used as the boundary. Otherwise
2129 a random boundary is generated.
2131 Reference: https://tools.ietf.org/html/rfc7578
2133 has_specified_boundary = boundary is not None
2136 if boundary is None:
2137 boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
2140 out, content_type = _multipart_encode_impl(data, boundary)
2143 if has_specified_boundary:
2147 return out, content_type
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Fetch the first usable value for key_or_keys from d.

    With skip_false_values, falsy values ('', 0, [], ...) are treated as
    missing, just like None.
    """
    if not isinstance(key_or_keys, (list, tuple)):
        return d.get(key_or_keys, default)
    for key in key_or_keys:
        if key not in d or d[key] is None or skip_false_values and not d[key]:
            continue
        return d[key]
    return default
def try_get(src, getter, expected_type=None):
    """Apply each getter to src and return the first result that neither
    raises a lookup error nor fails the expected_type check."""
    getters = getter if isinstance(getter, (list, tuple)) else [getter]
    for get in getters:
        try:
            v = get(src)
        except (AttributeError, KeyError, TypeError, IndexError):
            continue
        if expected_type is None or isinstance(v, expected_type):
            return v
2173 def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
2174 return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)
2186 TV_PARENTAL_GUIDELINES = {
def parse_age_limit(s):
    """Normalize an age limit (int, '18', '18+', or a US/TV rating string)
    to an int in [0, 21]; None when unrecognized."""
    # type() (not isinstance) deliberately excludes bool here
    if type(s) == int:
        return s if 0 <= s <= 21 else None
    if not isinstance(s, compat_basestring):
        return None
    mobj = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if mobj:
        return int(mobj.group('age'))
    if s in US_RATINGS:
        return US_RATINGS[s]
    return TV_PARENTAL_GUIDELINES.get(s)
def strip_jsonp(code):
    """Strip a JSONP wrapper callback(...) and return the raw payload."""
    return re.sub(
        r'''(?sx)^
            (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]+)
            (?:\s*&&\s*(?P=func_name))?
            \s*\(\s*(?P<callback_data>.*)\);?
            \s*?(?://[^\n]*)*$''',
        r'\g<callback_data>', code)
2219 def js_to_json(code):
2220 COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*'
2221 SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE)
2223 (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16),
2224 (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8),
2229 if v in ('true', 'false', 'null'):
2231 elif v.startswith('/*') or v.startswith('//') or v == ',':
2234 if v[0] in ("'", '"'):
2235 v = re.sub(r'(?s)\\.|"', lambda m: {
2240 }.get(m.group(0), m.group(0)), v[1:-1])
2242 for regex, base in INTEGER_TABLE:
2243 im = re.match(regex, v)
2245 i = int(im.group(1), base)
2246 return '"%d":' % i if v.endswith(':') else '%d' % i
2250 return re.sub(r'''(?sx)
2251 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
2252 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
2253 {comment}|,(?={skip}[\]}}])|
2254 [a-zA-Z_][.a-zA-Z_0-9]*|
2255 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
2257 '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        # Unknown qualities sort below every known one
        if qid in quality_ids:
            return quality_ids.index(qid)
        return -1
    return q
2270 DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    ellipses = '...'
    return s if len(s) <= length else s[:length - len(ellipses)] + ellipses
def version_tuple(v):
    """Turn a dotted/dashed version string into a comparable int tuple."""
    parts = re.split(r'[-.]', v)
    return tuple(map(int, parts))
def is_outdated_version(version, limit, assume_new=True):
    """True when `version` sorts before `limit`; empty or unparsable
    versions are judged per assume_new."""
    if not version:
        return not assume_new
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return not assume_new
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter
    # Updatable when running from a zip bundle or a frozen (py2exe) build
    running_from_zip = isinstance(globals().get('__loader__'), zipimporter)
    frozen_build = hasattr(sys, 'frozen')
    return running_from_zip or frozen_build
2303 def args_to_str(args):
2304 # Get a short string representation for a subprocess command
2305 return ' '.join(compat_shlex_quote(a) for a in args)
def error_to_compat_str(err):
    """Stringify an exception, decoding byte messages on Python 2."""
    msg = str(err)
    # On python 2 error byte string must be decoded with proper
    # encoding rather than ascii
    if sys.version_info[0] < 3:
        msg = msg.decode(preferredencoding())
    return msg
2317 def mimetype2ext(mt):
2323 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
2324 # it's the most popular one
2325 'audio/mpeg': 'mp3',
2330 _, _, res = mt.rpartition('/')
2331 res = res.split(';')[0].strip().lower()
2335 'smptett+xml': 'tt',
2339 'x-mp4-fragmented': 'mp4',
2342 'x-mpegurl': 'm3u8',
2343 'vnd.apple.mpegurl': 'm3u8',
2347 'vnd.ms-sstr+xml': 'ism',
2353 def parse_codecs(codecs_str):
2354 # http://tools.ietf.org/html/rfc6381
2357 splited_codecs = list(filter(None, map(
2358 lambda str: str.strip(), codecs_str.strip().strip(',').split(','))))
2359 vcodec, acodec = None, None
2360 for full_codec in splited_codecs:
2361 codec = full_codec.split('.')[0]
2362 if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v'):
2365 elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
2369 write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr)
2370 if not vcodec and not acodec:
2371 if len(splited_codecs) == 2:
2376 elif len(splited_codecs) == 1:
2383 'vcodec': vcodec or 'none',
2384 'acodec': acodec or 'none',
2389 def urlhandle_detect_ext(url_handle):
2390 getheader = url_handle.headers.get
2392 cd = getheader('Content-Disposition')
2394 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
2396 e = determine_ext(m.group('filename'), default_ext=None)
2400 return mimetype2ext(getheader('Content-Type'))
def encode_data_uri(data, mime_type):
    """Wrap raw bytes in an RFC 2397 base64 'data:' URI."""
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    # No viewer limit, or no content rating -> never blocked
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
    bom_table = (
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    )
    decoded = None
    for bom, enc in bom_table:
        if first_bytes.startswith(bom):
            decoded = first_bytes[len(bom):].decode(enc, 'replace')
            break
    if decoded is None:
        decoded = first_bytes.decode('utf-8', 'replace')

    # HTML if the first non-whitespace character opens a tag
    return re.match(r'^\s*<', decoded)
2437 def determine_protocol(info_dict):
2438 protocol = info_dict.get('protocol')
2439 if protocol is not None:
2442 url = info_dict['url']
2443 if url.startswith('rtmp'):
2445 elif url.startswith('mms'):
2447 elif url.startswith('rtsp'):
2450 ext = determine_ext(url)
2456 return compat_urllib_parse_urlparse(url).scheme
2459 def render_table(header_row, data):
2460 """ Render a list of rows, each as a list of values """
2461 table = [header_row] + data
2462 max_lens = [max(len(compat_str(v)) for v in col) for col in zip(*table)]
2463 format_str = ' '.join('%-' + compat_str(ml + 1) + 's' for ml in max_lens[:-1]) + '%s'
2464 return '\n'.join(format_str % tuple(row) for row in table)
2467 def _match_one(filter_part, dct):
2468 COMPARISON_OPERATORS = {
2476 operator_rex = re.compile(r'''(?x)\s*
2478 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
2480 (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
2481 (?P<quote>["\'])(?P<quotedstrval>(?:\\.|(?!(?P=quote)|\\).)+?)(?P=quote)|
2482 (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
2485 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
2486 m = operator_rex.search(filter_part)
2488 op = COMPARISON_OPERATORS[m.group('op')]
2489 actual_value = dct.get(m.group('key'))
2490 if (m.group('quotedstrval') is not None or
2491 m.group('strval') is not None or
2492 # If the original field is a string and matching comparisonvalue is
2493 # a number we should respect the origin of the original field
2494 # and process comparison value as a string (see
2495 # https://github.com/rg3/youtube-dl/issues/11082).
2496 actual_value is not None and m.group('intval') is not None and
2497 isinstance(actual_value, compat_str)):
2498 if m.group('op') not in ('=', '!='):
2500 'Operator %s does not support string values!' % m.group('op'))
2501 comparison_value = m.group('quotedstrval') or m.group('strval') or m.group('intval')
2502 quote = m.group('quote')
2503 if quote is not None:
2504 comparison_value = comparison_value.replace(r'\%s' % quote, quote)
2507 comparison_value = int(m.group('intval'))
2509 comparison_value = parse_filesize(m.group('intval'))
2510 if comparison_value is None:
2511 comparison_value = parse_filesize(m.group('intval') + 'B')
2512 if comparison_value is None:
2514 'Invalid integer value %r in filter part %r' % (
2515 m.group('intval'), filter_part))
2516 if actual_value is None:
2517 return m.group('none_inclusive')
2518 return op(actual_value, comparison_value)
2521 '': lambda v: v is not None,
2522 '!': lambda v: v is None,
2524 operator_rex = re.compile(r'''(?x)\s*
2525 (?P<op>%s)\s*(?P<key>[a-z_]+)
2527 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
2528 m = operator_rex.search(filter_part)
2530 op = UNARY_OPERATORS[m.group('op')]
2531 actual_value = dct.get(m.group('key'))
2532 return op(actual_value)
2534 raise ValueError('Invalid filter part %r' % filter_part)
2537 def match_str(filter_str, dct):
2538 """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
2541 _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
2544 def match_filter_func(filter_str):
2545 def _match_func(info_dict):
2546 if match_str(filter_str, info_dict):
2549 video_title = info_dict.get('title', info_dict.get('id', 'video'))
2550 return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
def parse_dfxp_time_expr(time_expr):
    """Parse a TTML/DFXP time expression into seconds; None when absent or
    unrecognized."""
    if not time_expr:
        return

    m = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if m:
        return float(m.group('time_offset'))

    m = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if m:
        hours, minutes, seconds = m.groups()
        # A frame-style trailing ':NN' is treated as a fractional second
        return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))
def srt_subtitles_timecode(seconds):
    """Format a float second count as an SRT timecode 'HH:MM:SS,mmm'."""
    return '%02d:%02d:%02d,%03d' % (
        seconds / 3600,
        (seconds % 3600) / 60,
        seconds % 60,
        (seconds % 1) * 1000)
# Convert DFXP/TTML subtitle XML (a text string) into SRT subtitle text.
# NOTE(review): this listing is elided — the fused line numbers below jump,
# so several original statements are missing between the visible ones.
2571 def dfxp2srt(dfxp_data):
# Older TTML documents use legacy ttaf1 namespaces; each is mapped onto the
# modern namespace URI so one set of XPath helpers can handle all variants.
2572     LEGACY_NAMESPACES = (
2573         ('http://www.w3.org/ns/ttml', [
2574             'http://www.w3.org/2004/11/ttaf1',
2575             'http://www.w3.org/2006/04/ttaf1',
2576             'http://www.w3.org/2006/10/ttaf1',
2578         ('http://www.w3.org/ns/ttml#styling', [
2579             'http://www.w3.org/ns/ttml#style',
# Styling properties that are translated to <font>/<b>/<i>/<u> tags below.
2583     SUPPORTED_STYLING = [
# _x resolves 'ttml:'/'tts:'-prefixed XPath expressions against these URIs.
2592     _x = functools.partial(xpath_with_ns, ns_map={
2593         'ttml': 'http://www.w3.org/ns/ttml',
2594         'tts': 'http://www.w3.org/ns/ttml#styling',
# SAX-style target for XMLParser: renders one <p> element (and its inline
# styling) into an HTML-ish SRT fragment accumulated in self._out.
2600     class TTMLPElementParser(object):
# Stacks tracking, per nested element, which tags were opened and which
# style dict was in effect (so end() can close them in reverse order).
2602         _unclosed_elements = []
2603         _applied_styles = []
2605         def start(self, tag, attrib):
2606             if tag in (_x('ttml:br'), 'br'):
2609                 unclosed_elements = []
2611                 element_style_id = attrib.get('style')
# Effective style = document default, overridden by the referenced style
# id, overridden by inline tts:* attributes on the element itself.
2613                 style.update(default_style)
2614                 if element_style_id:
2615                     style.update(styles.get(element_style_id, {}))
2616                 for prop in SUPPORTED_STYLING:
2617                     prop_val = attrib.get(_x('tts:' + prop))
2619                         style[prop] = prop_val
2622                 for k, v in sorted(style.items()):
# Skip properties already applied by an enclosing element.
2623                     if self._applied_styles and self._applied_styles[-1].get(k) == v:
2626                         font += ' color="%s"' % v
2627                     elif k == 'fontSize':
2628                         font += ' size="%s"' % v
2629                     elif k == 'fontFamily':
2630                         font += ' face="%s"' % v
2631                     elif k == 'fontWeight' and v == 'bold':
2633                         unclosed_elements.append('b')
2634                     elif k == 'fontStyle' and v == 'italic':
2636                         unclosed_elements.append('i')
2637                     elif k == 'textDecoration' and v == 'underline':
2639                         unclosed_elements.append('u')
2641                     self._out += '<font' + font + '>'
2642                     unclosed_elements.append('font')
2644                 if self._applied_styles:
2645                     applied_style.update(self._applied_styles[-1])
2646                 applied_style.update(style)
2647                 self._applied_styles.append(applied_style)
2648                 self._unclosed_elements.append(unclosed_elements)
# end() (elided header): close the tags opened by the matching start().
2651             if tag not in (_x('ttml:br'), 'br'):
2652                 unclosed_elements = self._unclosed_elements.pop()
2653                 for element in reversed(unclosed_elements):
2654                     self._out += '</%s>' % element
2655                 if unclosed_elements and self._applied_styles:
2656                     self._applied_styles.pop()
2658         def data(self, data):
2662             return self._out.strip()
# Serialize one <p> node and re-parse it through TTMLPElementParser.
2664     def parse_node(node):
2665         target = TTMLPElementParser()
2666         parser = xml.etree.ElementTree.XMLParser(target=target)
2667         parser.feed(xml.etree.ElementTree.tostring(node))
2668         return parser.close()
# Normalize legacy namespace URIs by plain text replacement before parsing.
2670     for k, v in LEGACY_NAMESPACES:
2672             dfxp_data = dfxp_data.replace(ns, k)
2674     dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
# Cues may be namespaced or not; try both.
2676     paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
2679         raise ValueError('Invalid dfxp/TTML subtitle')
# Collect named styles; a style may inherit from a parent via its own
# 'style' attribute (copied below before per-property overrides).
2683     for style in dfxp.findall(_x('.//ttml:style')):
2684         style_id = style.get('id')
2685         parent_style_id = style.get('style')
2687             if parent_style_id not in styles:
2690             styles[style_id] = styles[parent_style_id].copy()
2691         for prop in SUPPORTED_STYLING:
2692             prop_val = style.get(_x('tts:' + prop))
2694                 styles.setdefault(style_id, {})[prop] = prop_val
# A style referenced from <body> or <div> becomes the document default.
2700     for p in ('body', 'div'):
2701         ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
2704         style = styles.get(ele.get('style'))
2707             default_style.update(style)
# Emit one numbered SRT cue per <p>; 'dur' is used when 'end' is absent.
2709     for para, index in zip(paras, itertools.count(1)):
2710         begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
2711         end_time = parse_dfxp_time_expr(para.attrib.get('end'))
2712         dur = parse_dfxp_time_expr(para.attrib.get('dur'))
2713         if begin_time is None:
2718             end_time = begin_time + dur
2719         out.append('%d\n%s --> %s\n%s\n\n' % (
2721             srt_subtitles_timecode(begin_time),
2722             srt_subtitles_timecode(end_time),
def cli_option(params, command_option, param):
    """Return a two-element argument list [command_option, value] built from
    params[param], or [] when the key is missing/None.

    Truthy values are coerced to text via compat_str so downstream
    subprocess argument lists stay homogeneous.
    """
    param = params.get(param)
    if param:
        param = compat_str(param)
    return [command_option, param] if param is not None else []
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Return CLI arguments expressing a boolean option.

    Looks up params[param] (expected to be a bool when present) and renders
    it either as two items ['--opt', 'true'] or, when `separator` is given,
    as a single item '--opt=true'.

    Returns [] when the key is absent — previously this crashed on the
    isinstance assert because params.get() yielded None.
    """
    param = params.get(param)
    if param is None:
        return []
    assert isinstance(param, bool)
    if separator:
        return [command_option + separator + (true_value if param else false_value)]
    return [command_option, true_value if param else false_value]
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Return [command_option] when params[param] equals expected_value,
    otherwise [] (including when the key is absent).
    """
    param = params.get(param)
    return [command_option] if param == expected_value else []
def cli_configuration_args(params, param, default=None):
    """Return the list of extra CLI arguments stored under params[param].

    Falls back to `default` (a fresh empty list when not supplied) if the
    key is unset. The previous `default=[]` signature returned the shared
    mutable default object, which callers could accidentally mutate across
    invocations; a None sentinel avoids that pitfall while remaining
    backward-compatible for every caller.
    """
    if default is None:
        default = []
    ex_args = params.get(param)
    if ex_args is None:
        return default
    assert isinstance(ex_args, list)
    return ex_args
# Bidirectional ISO 639-1 <-> ISO 639-2/T language-code mapping helpers.
# NOTE(review): the _lang_map dict literal (original lines ~2758-2944) is
# elided from this listing, as are the @classmethod decorator lines.
2756 class ISO639Utils(object):
2757     # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
2946     def short2long(cls, code):
2947         """Convert language code from ISO 639-1 to ISO 639-2/T"""
# Only the first two characters are used, so 'en-US'-style inputs match.
2948         return cls._lang_map.get(code[:2])
2951     def long2short(cls, code):
2952         """Convert language code from ISO 639-2/T to ISO 639-1"""
# Linear reverse lookup over the map; the 'return short_name' line that
# follows the match is elided from this listing.
2953         for short_name, long_name in cls._lang_map.items():
2954             if long_name == code:
# ISO 3166 alpha-2 country code -> full country name lookup.
# NOTE(review): the '_country_map = {' assignment line and many map entries
# are elided from this listing (the fused line numbers jump).
2958 class ISO3166Utils(object):
2959     # From http://data.okfn.org/data/core/country-list
2961         'AF': 'Afghanistan',
2962         'AX': 'Åland Islands',
2965         'AS': 'American Samoa',
2970         'AG': 'Antigua and Barbuda',
2987         'BO': 'Bolivia, Plurinational State of',
2988         'BQ': 'Bonaire, Sint Eustatius and Saba',
2989         'BA': 'Bosnia and Herzegovina',
2991         'BV': 'Bouvet Island',
2993         'IO': 'British Indian Ocean Territory',
2994         'BN': 'Brunei Darussalam',
2996         'BF': 'Burkina Faso',
3002         'KY': 'Cayman Islands',
3003         'CF': 'Central African Republic',
3007         'CX': 'Christmas Island',
3008         'CC': 'Cocos (Keeling) Islands',
3012         'CD': 'Congo, the Democratic Republic of the',
3013         'CK': 'Cook Islands',
3015         'CI': 'Côte d\'Ivoire',
3020         'CZ': 'Czech Republic',
3024         'DO': 'Dominican Republic',
3027         'SV': 'El Salvador',
3028         'GQ': 'Equatorial Guinea',
3032         'FK': 'Falkland Islands (Malvinas)',
3033         'FO': 'Faroe Islands',
3037         'GF': 'French Guiana',
3038         'PF': 'French Polynesia',
3039         'TF': 'French Southern Territories',
3054         'GW': 'Guinea-Bissau',
3057         'HM': 'Heard Island and McDonald Islands',
3058         'VA': 'Holy See (Vatican City State)',
3065         'IR': 'Iran, Islamic Republic of',
3068         'IM': 'Isle of Man',
3078         'KP': 'Korea, Democratic People\'s Republic of',
3079         'KR': 'Korea, Republic of',
3082         'LA': 'Lao People\'s Democratic Republic',
3088         'LI': 'Liechtenstein',
3092         'MK': 'Macedonia, the Former Yugoslav Republic of',
3099         'MH': 'Marshall Islands',
3105         'FM': 'Micronesia, Federated States of',
3106         'MD': 'Moldova, Republic of',
3117         'NL': 'Netherlands',
3118         'NC': 'New Caledonia',
3119         'NZ': 'New Zealand',
3124         'NF': 'Norfolk Island',
3125         'MP': 'Northern Mariana Islands',
3130         'PS': 'Palestine, State of',
3132         'PG': 'Papua New Guinea',
3135         'PH': 'Philippines',
3139         'PR': 'Puerto Rico',
3143         'RU': 'Russian Federation',
3145         'BL': 'Saint Barthélemy',
3146         'SH': 'Saint Helena, Ascension and Tristan da Cunha',
3147         'KN': 'Saint Kitts and Nevis',
3148         'LC': 'Saint Lucia',
3149         'MF': 'Saint Martin (French part)',
3150         'PM': 'Saint Pierre and Miquelon',
3151         'VC': 'Saint Vincent and the Grenadines',
3154         'ST': 'Sao Tome and Principe',
3155         'SA': 'Saudi Arabia',
3159         'SL': 'Sierra Leone',
3161         'SX': 'Sint Maarten (Dutch part)',
3164         'SB': 'Solomon Islands',
3166         'ZA': 'South Africa',
3167         'GS': 'South Georgia and the South Sandwich Islands',
3168         'SS': 'South Sudan',
3173         'SJ': 'Svalbard and Jan Mayen',
3176         'CH': 'Switzerland',
3177         'SY': 'Syrian Arab Republic',
3178         'TW': 'Taiwan, Province of China',
3180         'TZ': 'Tanzania, United Republic of',
3182         'TL': 'Timor-Leste',
3186         'TT': 'Trinidad and Tobago',
3189         'TM': 'Turkmenistan',
3190         'TC': 'Turks and Caicos Islands',
3194         'AE': 'United Arab Emirates',
3195         'GB': 'United Kingdom',
3196         'US': 'United States',
3197         'UM': 'United States Minor Outlying Islands',
3201         'VE': 'Venezuela, Bolivarian Republic of',
3203         'VG': 'Virgin Islands, British',
3204         'VI': 'Virgin Islands, U.S.',
3205         'WF': 'Wallis and Futuna',
3206         'EH': 'Western Sahara',
# NOTE(review): @classmethod decorator line elided above short2full.
3213     def short2full(cls, code):
3214         """Convert an ISO 3166-2 country code to the corresponding full name"""
# Case-insensitive lookup; returns None for unknown codes.
3215         return cls._country_map.get(code.upper())
# Utility for geo-restriction bypass: maps a country code to one of its
# major IPv4 CIDR blocks and can pick a random address inside that block.
# NOTE(review): the '_country_ip_map = {' assignment line and some entries
# are elided from this listing (the fused line numbers jump).
3218 class GeoUtils(object):
3219     # Major IPv4 address blocks per country
3221         'AD': '85.94.160.0/19',
3222         'AE': '94.200.0.0/13',
3223         'AF': '149.54.0.0/17',
3224         'AG': '209.59.64.0/18',
3225         'AI': '204.14.248.0/21',
3226         'AL': '46.99.0.0/16',
3227         'AM': '46.70.0.0/15',
3228         'AO': '105.168.0.0/13',
3229         'AP': '159.117.192.0/21',
3230         'AR': '181.0.0.0/12',
3231         'AS': '202.70.112.0/20',
3232         'AT': '84.112.0.0/13',
3233         'AU': '1.128.0.0/11',
3234         'AW': '181.41.0.0/18',
3235         'AZ': '5.191.0.0/16',
3236         'BA': '31.176.128.0/17',
3237         'BB': '65.48.128.0/17',
3238         'BD': '114.130.0.0/16',
3240         'BF': '129.45.128.0/17',
3241         'BG': '95.42.0.0/15',
3242         'BH': '37.131.0.0/17',
3243         'BI': '154.117.192.0/18',
3244         'BJ': '137.255.0.0/16',
3245         'BL': '192.131.134.0/24',
3246         'BM': '196.12.64.0/18',
3247         'BN': '156.31.0.0/16',
3248         'BO': '161.56.0.0/16',
3249         'BQ': '161.0.80.0/20',
3250         'BR': '152.240.0.0/12',
3251         'BS': '24.51.64.0/18',
3252         'BT': '119.2.96.0/19',
3253         'BW': '168.167.0.0/16',
3254         'BY': '178.120.0.0/13',
3255         'BZ': '179.42.192.0/18',
3256         'CA': '99.224.0.0/11',
3257         'CD': '41.243.0.0/16',
3258         'CF': '196.32.200.0/21',
3259         'CG': '197.214.128.0/17',
3260         'CH': '85.0.0.0/13',
3261         'CI': '154.232.0.0/14',
3262         'CK': '202.65.32.0/19',
3263         'CL': '152.172.0.0/14',
3264         'CM': '165.210.0.0/15',
3265         'CN': '36.128.0.0/10',
3266         'CO': '181.240.0.0/12',
3267         'CR': '201.192.0.0/12',
3268         'CU': '152.206.0.0/15',
3269         'CV': '165.90.96.0/19',
3270         'CW': '190.88.128.0/17',
3271         'CY': '46.198.0.0/15',
3272         'CZ': '88.100.0.0/14',
3274         'DJ': '197.241.0.0/17',
3275         'DK': '87.48.0.0/12',
3276         'DM': '192.243.48.0/20',
3277         'DO': '152.166.0.0/15',
3278         'DZ': '41.96.0.0/12',
3279         'EC': '186.68.0.0/15',
3280         'EE': '90.190.0.0/15',
3281         'EG': '156.160.0.0/11',
3282         'ER': '196.200.96.0/20',
3283         'ES': '88.0.0.0/11',
3284         'ET': '196.188.0.0/14',
3285         'EU': '2.16.0.0/13',
3286         'FI': '91.152.0.0/13',
3287         'FJ': '144.120.0.0/16',
3288         'FM': '119.252.112.0/20',
3289         'FO': '88.85.32.0/19',
3291         'GA': '41.158.0.0/15',
3293         'GD': '74.122.88.0/21',
3294         'GE': '31.146.0.0/16',
3295         'GF': '161.22.64.0/18',
3296         'GG': '62.68.160.0/19',
3297         'GH': '45.208.0.0/14',
3298         'GI': '85.115.128.0/19',
3299         'GL': '88.83.0.0/19',
3300         'GM': '160.182.0.0/15',
3301         'GN': '197.149.192.0/18',
3302         'GP': '104.250.0.0/19',
3303         'GQ': '105.235.224.0/20',
3304         'GR': '94.64.0.0/13',
3305         'GT': '168.234.0.0/16',
3306         'GU': '168.123.0.0/16',
3307         'GW': '197.214.80.0/20',
3308         'GY': '181.41.64.0/18',
3309         'HK': '113.252.0.0/14',
3310         'HN': '181.210.0.0/16',
3311         'HR': '93.136.0.0/13',
3312         'HT': '148.102.128.0/17',
3313         'HU': '84.0.0.0/14',
3314         'ID': '39.192.0.0/10',
3315         'IE': '87.32.0.0/12',
3316         'IL': '79.176.0.0/13',
3317         'IM': '5.62.80.0/20',
3318         'IN': '117.192.0.0/10',
3319         'IO': '203.83.48.0/21',
3320         'IQ': '37.236.0.0/14',
3321         'IR': '2.176.0.0/12',
3322         'IS': '82.221.0.0/16',
3323         'IT': '79.0.0.0/10',
3324         'JE': '87.244.64.0/18',
3325         'JM': '72.27.0.0/17',
3326         'JO': '176.29.0.0/16',
3327         'JP': '126.0.0.0/8',
3328         'KE': '105.48.0.0/12',
3329         'KG': '158.181.128.0/17',
3330         'KH': '36.37.128.0/17',
3331         'KI': '103.25.140.0/22',
3332         'KM': '197.255.224.0/20',
3333         'KN': '198.32.32.0/19',
3334         'KP': '175.45.176.0/22',
3335         'KR': '175.192.0.0/10',
3336         'KW': '37.36.0.0/14',
3337         'KY': '64.96.0.0/15',
3338         'KZ': '2.72.0.0/13',
3339         'LA': '115.84.64.0/18',
3340         'LB': '178.135.0.0/16',
3341         'LC': '192.147.231.0/24',
3342         'LI': '82.117.0.0/19',
3343         'LK': '112.134.0.0/15',
3344         'LR': '41.86.0.0/19',
3345         'LS': '129.232.0.0/17',
3346         'LT': '78.56.0.0/13',
3347         'LU': '188.42.0.0/16',
3348         'LV': '46.109.0.0/16',
3349         'LY': '41.252.0.0/14',
3350         'MA': '105.128.0.0/11',
3351         'MC': '88.209.64.0/18',
3352         'MD': '37.246.0.0/16',
3353         'ME': '178.175.0.0/17',
3354         'MF': '74.112.232.0/21',
3355         'MG': '154.126.0.0/17',
3356         'MH': '117.103.88.0/21',
3357         'MK': '77.28.0.0/15',
3358         'ML': '154.118.128.0/18',
3359         'MM': '37.111.0.0/17',
3360         'MN': '49.0.128.0/17',
3361         'MO': '60.246.0.0/16',
3362         'MP': '202.88.64.0/20',
3363         'MQ': '109.203.224.0/19',
3364         'MR': '41.188.64.0/18',
3365         'MS': '208.90.112.0/22',
3366         'MT': '46.11.0.0/16',
3367         'MU': '105.16.0.0/12',
3368         'MV': '27.114.128.0/18',
3369         'MW': '105.234.0.0/16',
3370         'MX': '187.192.0.0/11',
3371         'MY': '175.136.0.0/13',
3372         'MZ': '197.218.0.0/15',
3373         'NA': '41.182.0.0/16',
3374         'NC': '101.101.0.0/18',
3375         'NE': '197.214.0.0/18',
3376         'NF': '203.17.240.0/22',
3377         'NG': '105.112.0.0/12',
3378         'NI': '186.76.0.0/15',
3379         'NL': '145.96.0.0/11',
3380         'NO': '84.208.0.0/13',
3381         'NP': '36.252.0.0/15',
3382         'NR': '203.98.224.0/19',
3383         'NU': '49.156.48.0/22',
3384         'NZ': '49.224.0.0/14',
3385         'OM': '5.36.0.0/15',
3386         'PA': '186.72.0.0/15',
3387         'PE': '186.160.0.0/14',
3388         'PF': '123.50.64.0/18',
3389         'PG': '124.240.192.0/19',
3390         'PH': '49.144.0.0/13',
3391         'PK': '39.32.0.0/11',
3392         'PL': '83.0.0.0/11',
3393         'PM': '70.36.0.0/20',
3394         'PR': '66.50.0.0/16',
3395         'PS': '188.161.0.0/16',
3396         'PT': '85.240.0.0/13',
3397         'PW': '202.124.224.0/20',
3398         'PY': '181.120.0.0/14',
3399         'QA': '37.210.0.0/15',
3400         'RE': '139.26.0.0/16',
3401         'RO': '79.112.0.0/13',
3402         'RS': '178.220.0.0/14',
3403         'RU': '5.136.0.0/13',
3404         'RW': '105.178.0.0/15',
3405         'SA': '188.48.0.0/13',
3406         'SB': '202.1.160.0/19',
3407         'SC': '154.192.0.0/11',
3408         'SD': '154.96.0.0/13',
3409         'SE': '78.64.0.0/12',
3410         'SG': '152.56.0.0/14',
3411         'SI': '188.196.0.0/14',
3412         'SK': '78.98.0.0/15',
3413         'SL': '197.215.0.0/17',
3414         'SM': '89.186.32.0/19',
3415         'SN': '41.82.0.0/15',
3416         'SO': '197.220.64.0/19',
3417         'SR': '186.179.128.0/17',
3418         'SS': '105.235.208.0/21',
3419         'ST': '197.159.160.0/19',
3420         'SV': '168.243.0.0/16',
3421         'SX': '190.102.0.0/20',
3423         'SZ': '41.84.224.0/19',
3424         'TC': '65.255.48.0/20',
3425         'TD': '154.68.128.0/19',
3426         'TG': '196.168.0.0/14',
3427         'TH': '171.96.0.0/13',
3428         'TJ': '85.9.128.0/18',
3429         'TK': '27.96.24.0/21',
3430         'TL': '180.189.160.0/20',
3431         'TM': '95.85.96.0/19',
3432         'TN': '197.0.0.0/11',
3433         'TO': '175.176.144.0/21',
3434         'TR': '78.160.0.0/11',
3435         'TT': '186.44.0.0/15',
3436         'TV': '202.2.96.0/19',
3437         'TW': '120.96.0.0/11',
3438         'TZ': '156.156.0.0/14',
3439         'UA': '93.72.0.0/13',
3440         'UG': '154.224.0.0/13',
3442         'UY': '167.56.0.0/13',
3443         'UZ': '82.215.64.0/18',
3444         'VA': '212.77.0.0/19',
3445         'VC': '24.92.144.0/20',
3446         'VE': '186.88.0.0/13',
3447         'VG': '172.103.64.0/18',
3448         'VI': '146.226.0.0/16',
3449         'VN': '14.160.0.0/11',
3450         'VU': '202.80.32.0/20',
3451         'WF': '117.20.32.0/21',
3452         'WS': '202.4.32.0/19',
3453         'YE': '134.35.0.0/16',
3454         'YT': '41.242.116.0/22',
3455         'ZA': '41.0.0.0/11',
3456         'ZM': '165.56.0.0/13',
3457         'ZW': '41.85.192.0/19',
# Pick a uniformly random IPv4 address inside the country's CIDR block.
# NOTE(review): the @classmethod decorator and the 'if block is None:'
# guard lines appear to be elided from this listing.
3461     def random_ipv4(cls, code):
3462         block = cls._country_ip_map.get(code.upper())
3465         addr, preflen = block.split('/')
# addr_min is the network address as a 32-bit int; OR-ing the host-bit
# mask (0xffffffff >> prefix length) yields the block's highest address.
3466         addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
3467         addr_max = addr_min | (0xffffffff >> int(preflen))
3468         return compat_str(socket.inet_ntoa(
3469             compat_struct_pack('!L', random.randint(addr_min, addr_max))))
# ProxyHandler subclass that honours a per-request proxy carried in the
# 'Ytdl-request-proxy' header, falling back to the handler-level proxies.
# NOTE(review): this listing is elided — several original lines are missing
# between the numbered statements below.
3472 class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
3473     def __init__(self, proxies=None):
3474         # Set default handlers
# Install http_open/https_open methods that always route through
# proxy_open with the '__noproxy__' sentinel as the default proxy;
# default args in the lambda bind the loop variables early.
3475         for type in ('http', 'https'):
3476             setattr(self, '%s_open' % type,
3477                     lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
3478                         meth(r, proxy, type))
3479         return compat_urllib_request.ProxyHandler.__init__(self, proxies)
3481     def proxy_open(self, req, proxy, type):
# A per-request override wins over the configured proxy and is stripped
# from the outgoing headers.
3482         req_proxy = req.headers.get('Ytdl-request-proxy')
3483         if req_proxy is not None:
3485             del req.headers['Ytdl-request-proxy']
3487         if proxy == '__noproxy__':
3488             return None  # No Proxy
# SOCKS proxies are signalled via a header instead; the http/https
# handlers wrap the socket with SOCKS themselves.
3489         if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
3490             req.add_header('Ytdl-socks-proxy', proxy)
3491             # youtube-dl's http/https handlers do wrapping the socket with socks
3493         return compat_urllib_request.ProxyHandler.proxy_open(
3494             self, req, proxy, type)
3497 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
3498 # released into Public Domain
3499 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
# NOTE(review): adapted from PyCrypto (see the comment above); this listing
# is elided — the packing loop header and several lines are missing.
3501 def long_to_bytes(n, blocksize=0):
3502     """long_to_bytes(n:long, blocksize:int) : string
3503     Convert a long integer to a byte string.
3505     If optional blocksize is given and greater than zero, pad the front of the
3506     byte string with binary zeros so that the length is a multiple of
3509     # after much testing, this algorithm was deemed to be the fastest
# Emit 4 bytes at a time, big-endian, from the low end of n upward.
3513         s = compat_struct_pack('>I', n & 0xffffffff) + s
3515     # strip off leading zeros
3516     for i in range(len(s)):
3517         if s[i] != b'\000'[0]:
3520         # only happens when n == 0
3524     # add back some pad bytes. this could be done more efficiently w.r.t. the
3525     # de-padding being done above, but sigh...
3526     if blocksize > 0 and len(s) % blocksize:
3527         s = (blocksize - len(s) % blocksize) * b'\000' + s
# NOTE(review): adapted from PyCrypto; this listing is elided — the
# accumulator initialisation, length handling and final return are missing.
3531 def bytes_to_long(s):
3532     """bytes_to_long(string) : long
3533     Convert a byte string to a long integer.
3535     This is (essentially) the inverse of long_to_bytes().
# Left-pad with NUL bytes so the length is a multiple of 4, then fold
# 4-byte big-endian words into the accumulator.
3540         extra = (4 - length % 4)
3541         s = b'\000' * extra + s
3542         length = length + extra
3543     for i in range(0, length, 4):
3544         acc = (acc << 32) + compat_struct_unpack('>I', s[i:i + 4])[0]
3548 def ohdave_rsa_encrypt(data, exponent, modulus):
3550 Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
3553 data: data to encrypt, bytes-like object
3554 exponent, modulus: parameter e and N of RSA algorithm, both integer
3555 Output: hex string of encrypted data
3557 Limitation: supports one block encryption only
3560 payload = int(binascii.hexlify(data[::-1]), 16)
3561 encrypted = pow(payload, exponent, modulus)
3562 return '%x' % encrypted
def pkcs1pad(data, length):
    """
    Padding input data with PKCS#1 scheme

    @param {int[]} data input data
    @param {int} length target length
    @returns {int[]} padded data
    """
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    # RFC 8017 (PKCS #1 v2.2, RSAES-PKCS1-v1_5) requires the padding string
    # PS to consist of NONZERO pseudo-random octets: a zero octet would be
    # taken as the end-of-padding marker on decryption and truncate the
    # message. The previous randint(0, 254) could emit zeros.
    pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]
    return [0, 2] + pseudo_random + [0] + data
def encode_base_n(num, n, table=None):
    """Encode the non-negative integer `num` in base `n`.

    `table` supplies the digit alphabet; when omitted it defaults to the
    first `n` characters of 0-9a-zA-Z (so n may be at most 62 then).
    Raises ValueError if the table is too short for the requested base.
    """
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num == 0:
        return table[0]

    ret = ''
    while num:
        ret = table[num % n] + ret
        num = num // n
    return ret
# Unpack Dean Edwards style "p.a.c.k.e.r." obfuscated JavaScript: every
# identifier in the packed code is a base-N index into a symbol table.
# NOTE(review): this listing is elided — unescaping of the packed code,
# the symbol_table initialisation, the countdown loop header and the
# re.sub()/return lines are missing between the numbered statements.
3598 def decode_packed_codes(code):
3599     mobj = re.search(PACKED_CODES_RE, code)
# Original typo 'obfucasted_code' is kept to stay byte-identical.
3600     obfucasted_code, base, count, symbols = mobj.groups()
3603     symbols = symbols.split('|')
# Map each index (rendered in base `base`) to its replacement symbol;
# an empty symbol means the token stands for itself.
3608         base_n_count = encode_base_n(count, base)
3609         symbol_table[base_n_count] = symbols[count] or base_n_count
# Substitute every word token in the packed code via the symbol table.
3612         r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
def parse_m3u8_attributes(attrib):
    """Parse an M3U8 attribute list (comma-separated KEY=value pairs, with
    values optionally double-quoted) into a dict of strings.

    Quoted values may contain commas; surrounding quotes are stripped.
    """
    info = {}
    for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
        if val.startswith('"'):
            val = val[1:-1]
        info[key] = val
    return info
def urshift(val, n):
    """Unsigned 32-bit right shift, i.e. JavaScript's `>>>` operator.

    Negative values are first mapped onto their 32-bit two's-complement
    representation by adding 2**32.
    """
    return val >> n if val >= 0 else (val + 0x100000000) >> n
3629 # Based on png2str() written by @gdkchan and improved by @yokrysty
3630 # Originally posted at https://github.com/rg3/youtube-dl/issues/9706
# Decode a PNG byte string into (width, height, pixels) where pixels is a
# list of rows of byte values after reversing the PNG scanline filters.
# NOTE(review): this listing is elided — chunk-loop headers, the chunks
# list, stride computation and several filter branches are missing.
3631 def decode_png(png_data):
3632     # Reference: https://www.w3.org/TR/PNG/
3633     header = png_data[8:]
# Validate the 8-byte PNG signature and that the first chunk is IHDR.
3635     if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
3636         raise IOError('Not a valid PNG file.')
# Helper: big-endian integer from a 1/2/4-byte slice.
3638     int_map = {1: '>B', 2: '>H', 4: '>I'}
3639     unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]
# Chunk walk: 4-byte length, 4-byte type, payload, 4-byte CRC (skipped).
3644         length = unpack_integer(header[:4])
3647         chunk_type = header[:4]
3650         chunk_data = header[:length]
3651         header = header[length:]
3653         header = header[4:]  # Skip CRC
# IHDR payload starts with 4-byte width then 4-byte height.
3661     ihdr = chunks[0]['data']
3663     width = unpack_integer(ihdr[:4])
3664     height = unpack_integer(ihdr[4:8])
# Concatenate all IDAT payloads, then zlib-inflate the image data.
3668     for chunk in chunks:
3669         if chunk['type'] == b'IDAT':
3670             idat += chunk['data']
3673         raise IOError('Unable to read PNG data.')
3675     decompressed_data = bytearray(zlib.decompress(idat))
3680     def _get_pixel(idx):
# Each scanline is 1 filter-type byte followed by `stride` data bytes.
3685     for y in range(height):
3686         basePos = y * (1 + stride)
3687         filter_type = decompressed_data[basePos]
3691         pixels.append(current_row)
3693         for x in range(stride):
3694             color = decompressed_data[1 + basePos + x]
3695             basex = y * stride + x
# Neighbours used by the filters; the -3 offsets suggest 3 bytes per
# pixel (RGB) — TODO confirm against the elided stride computation.
3700                 left = _get_pixel(basex - 3)
3702                 up = _get_pixel(basex - stride)
# Reverse the per-scanline PNG filters (see PNG spec §9 Filtering).
3704             if filter_type == 1:  # Sub
3705                 color = (color + left) & 0xff
3706             elif filter_type == 2:  # Up
3707                 color = (color + up) & 0xff
3708             elif filter_type == 3:  # Average
3709                 color = (color + ((left + up) >> 1)) & 0xff
3710             elif filter_type == 4:  # Paeth
3716                 c = _get_pixel(basex - stride - 3)
# Paeth predictor: pick the neighbour closest to the initial estimate.
3724                 if pa <= pb and pa <= pc:
3725                     color = (color + a) & 0xff
3727                     color = (color + b) & 0xff
3729                     color = (color + c) & 0xff
3731             current_row.append(color)
3733     return width, height, pixels
# Set extended attribute `key` to `value` (bytes) on `path`, trying in
# order: the pyxattr/xattr Python modules, NTFS alternate data streams on
# Windows, then the setfattr/xattr command-line tools.
# NOTE(review): this listing is elided — the try/except ImportError
# scaffolding and several lines are missing between numbered statements.
3736 def write_xattr(path, key, value):
3737     # This mess below finds the best xattr tool for the job
3739     # try the pyxattr module...
# The two modules share a name; pyxattr exposes xattr.set(), the other
# exposes xattr.setxattr().
3742     if hasattr(xattr, 'set'):  # pyxattr
3743         # Unicode arguments are not supported in python-pyxattr until
3745         # See https://github.com/rg3/youtube-dl/issues/5498
3746         pyxattr_required_version = '0.5.0'
3747         if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
3748             # TODO: fallback to CLI tools
3749             raise XAttrUnavailableError(
3750                 'python-pyxattr is detected but is too old. '
3751                 'youtube-dl requires %s or above while your version is %s. '
3752                 'Falling back to other xattr implementations' % (
3753                     pyxattr_required_version, xattr.__version__))
3755         setxattr = xattr.set
3757         setxattr = xattr.setxattr
# OS-level errors are normalised into the project's XAttrMetadataError.
3760         setxattr(path, key, value)
3761     except EnvironmentError as e:
3762         raise XAttrMetadataError(e.errno, e.strerror)
3765     if compat_os_name == 'nt':
3766         # Write xattrs to NTFS Alternate Data Streams:
3767         # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
# ':' separates file name from stream name on NTFS, so it must not
# appear in the key, and the base file has to exist already.
3768         assert ':' not in key
3769         assert os.path.exists(path)
3771         ads_fn = path + ':' + key
3773             with open(ads_fn, 'wb') as f:
3775         except EnvironmentError as e:
3776             raise XAttrMetadataError(e.errno, e.strerror)
# Fall back to external CLI tools (GNU attr's setfattr, or xattr).
3778         user_has_setfattr = check_executable('setfattr', ['--version'])
3779         user_has_xattr = check_executable('xattr', ['-h'])
3781         if user_has_setfattr or user_has_xattr:
3783             value = value.decode('utf-8')
3784             if user_has_setfattr:
3785                 executable = 'setfattr'
3786                 opts = ['-n', key, '-v', value]
3787             elif user_has_xattr:
3788                 executable = 'xattr'
3789                 opts = ['-w', key, value]
3791             cmd = ([encodeFilename(executable, True)] +
3792                    [encodeArgument(o) for o in opts] +
3793                    [encodeFilename(path, True)])
3796                 p = subprocess.Popen(
3797                     cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
3798             except EnvironmentError as e:
3799                 raise XAttrMetadataError(e.errno, e.strerror)
3800             stdout, stderr = p.communicate()
3801             stderr = stderr.decode('utf-8', 'replace')
3802             if p.returncode != 0:
3803                 raise XAttrMetadataError(p.returncode, stderr)
3806             # On Unix, and can't find pyxattr, setfattr, or xattr.
3807             if sys.platform.startswith('linux'):
3808                 raise XAttrUnavailableError(
3809                     "Couldn't find a tool to set the xattrs. "
3810                     "Install either the python 'pyxattr' or 'xattr' "
3811                     "modules, or the GNU 'attr' package "
3812                     "(which contains the 'setfattr' tool).")
3814                 raise XAttrUnavailableError(
3815                     "Couldn't find a tool to set the xattrs. "
3816                     "Install either the python 'xattr' module, "
3817                     "or the 'xattr' binary.")
3820 def random_birthday(year_field, month_field, day_field):
3822 year_field: str(random.randint(1950, 1995)),
3823 month_field: str(random.randint(1, 12)),
3824 day_field: str(random.randint(1, 31)),