4 from __future__ import unicode_literals
34 import xml.etree.ElementTree
38 compat_HTMLParseError,
42 compat_ctypes_WINFUNCTYPE,
43 compat_etree_fromstring,
46 compat_html_entities_html5,
52 compat_socket_create_connection,
58 compat_urllib_parse_urlencode,
59 compat_urllib_parse_urlparse,
60 compat_urllib_parse_unquote_plus,
61 compat_urllib_request,
def register_socks_protocols():
    """Teach urlparse that SOCKS schemes carry a network location.

    In Python < 2.6.5, urlsplit() suffers from bug
    https://bugs.python.org/issue7904: URLs with protocols not listed in
    urlparse.uses_netloc are not handled correctly, so every SOCKS scheme
    is appended to that registry (idempotently).
    """
    registry = compat_urlparse.uses_netloc
    for socks_scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if socks_scheme in registry:
            continue
        registry.append(socks_scheme)
81 # This is not clearly defined otherwise
82 compiled_regex_type = type(re.compile(''))
85 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:59.0) Gecko/20100101 Firefox/59.0 (Chrome)',
86 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
87 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
88 'Accept-Encoding': 'gzip, deflate',
89 'Accept-Language': 'en-us,en;q=0.5',
94 'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
100 ENGLISH_MONTH_NAMES = [
101 'January', 'February', 'March', 'April', 'May', 'June',
102 'July', 'August', 'September', 'October', 'November', 'December']
105 'en': ENGLISH_MONTH_NAMES,
107 'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
108 'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
112 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
113 'flv', 'f4v', 'f4a', 'f4b',
114 'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
115 'mkv', 'mka', 'mk3d',
124 'f4f', 'f4m', 'm3u8', 'smil')
126 # needed for sanitizing filenames in restricted mode
127 ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
128 itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'],
129 'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy')))
152 '%Y-%m-%d %H:%M:%S.%f',
155 '%Y-%m-%dT%H:%M:%SZ',
156 '%Y-%m-%dT%H:%M:%S.%fZ',
157 '%Y-%m-%dT%H:%M:%S.%f0Z',
159 '%Y-%m-%dT%H:%M:%S.%f',
162 '%b %d %Y at %H:%M:%S',
164 '%B %d %Y at %H:%M:%S',
167 DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
168 DATE_FORMATS_DAY_FIRST.extend([
177 DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
178 DATE_FORMATS_MONTH_FIRST.extend([
186 PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    pref = 'UTF-8'  # fallback when the locale encoding is unusable
    try:
        candidate = locale.getpreferredencoding()
        # Probe that the reported encoding actually works
        'TEST'.encode(candidate)
    except Exception:
        pass
    else:
        pref = candidate
    return pref
204 def write_json_file(obj, fn):
205 """ Encode obj as JSON and write it to fn, atomically if possible """
207 fn = encodeFilename(fn)
208 if sys.version_info < (3, 0) and sys.platform != 'win32':
209 encoding = get_filesystem_encoding()
210 # os.path.basename returns a bytes object, but NamedTemporaryFile
211 # will fail if the filename contains non ascii characters unless we
212 # use a unicode object
213 path_basename = lambda f: os.path.basename(fn).decode(encoding)
214 # the same for os.path.dirname
215 path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
217 path_basename = os.path.basename
218 path_dirname = os.path.dirname
222 'prefix': path_basename(fn) + '.',
223 'dir': path_dirname(fn),
227 # In Python 2.x, json.dump expects a bytestream.
228 # In Python 3.x, it writes to a character stream
229 if sys.version_info < (3, 0):
237 tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
242 if sys.platform == 'win32':
243 # Need to remove existing file on Windows, else os.rename raises
244 # WindowsError or FileExistsError.
249 os.rename(tf.name, fn)
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val] """
        assert re.match(r'^[a-zA-Z_-]+$', key)
        if val is None:
            predicate = '[@%s]' % key
        else:
            predicate = "[@%s='%s']" % (key, val)
        return node.find(xpath + predicate)
else:
    # Python 2.6 ElementTree cannot evaluate attribute predicates in
    # .find(), so emulate the lookup by scanning every candidate element.
    def find_xpath_attr(node, xpath, key, val=None):
        for candidate in node.findall(compat_xpath(xpath)):
            if key not in candidate.attrib:
                continue
            if val is None or candidate.attrib.get(key) == val:
                return candidate
        return None
273 # On python2.6 the xml.etree.ElementTree.Element methods don't support
274 # the namespace parameter
277 def xpath_with_ns(path, ns_map):
278 components = [c.split(':') for c in path.split('/')]
282 replaced.append(c[0])
285 replaced.append('{%s}%s' % (ns_map[ns], tag))
286 return '/'.join(replaced)
289 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
290 def _find_xpath(xpath):
291 return node.find(compat_xpath(xpath))
293 if isinstance(xpath, (str, compat_str)):
294 n = _find_xpath(xpath)
302 if default is not NO_DEFAULT:
305 name = xpath if name is None else name
306 raise ExtractorError('Could not find XML element %s' % name)
312 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
313 n = xpath_element(node, xpath, name, fatal=fatal, default=default)
314 if n is None or n == default:
317 if default is not NO_DEFAULT:
320 name = xpath if name is None else name
321 raise ExtractorError('Could not find XML element\'s text %s' % name)
327 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
328 n = find_xpath_attr(node, xpath, key)
330 if default is not NO_DEFAULT:
333 name = '%s[@%s]' % (xpath, key) if name is None else name
334 raise ExtractorError('Could not find XML attribute %s' % name)
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # An ID lookup is simply an attribute lookup on 'id'
    content = get_element_by_attribute('id', id, html)
    return content
def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    matches = get_elements_by_class(class_name, html)
    if matches:
        return matches[0]
    return None
def get_element_by_attribute(attribute, value, html, escape_value=True):
    """Return the content of the first tag whose `attribute` equals `value`
    in the passed HTML document, or None when nothing matches."""
    matches = get_elements_by_attribute(attribute, value, html, escape_value)
    if matches:
        return matches[0]
    return None
def get_elements_by_class(class_name, html):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    # Match the class token anywhere inside the attribute value, delimited
    # by word boundaries (class="foo bar baz" matches class_name 'bar')
    class_pattern = r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name)
    return get_elements_by_attribute('class', class_pattern, html, escape_value=False)
def get_elements_by_attribute(attribute, value, html, escape_value=True):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    if escape_value:
        value = re.escape(value)

    retlist = []
    # Backreference \1 makes the closing tag match the opening tag name;
    # the two attribute runs allow the target attribute anywhere in the tag
    for m in re.finditer(r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
         \s+%s=['"]?%s['"]?
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
        \s*>
        (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), value), html):
        res = m.group('content')

        # Strip surrounding quotes if the content was captured with them
        if res.startswith('"') or res.startswith("'"):
            res = res[1:-1]

        retlist.append(unescapeHTML(res))

    return retlist
388 class HTMLAttributeParser(compat_HTMLParser):
389 """Trivial HTML parser to gather the attributes for a single element"""
392 compat_HTMLParser.__init__(self)
394 def handle_starttag(self, tag, attrs):
395 self.attrs = dict(attrs)
398 def extract_attributes(html_element):
399 """Given a string for an HTML element such as
401 a="foo" B="bar" c="&98;az" d=boz
402 empty= noval entity="&"
405 Decode and return a dictionary of attributes.
407 'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
408 'empty': '', 'noval': None, 'entity': '&',
409 'sq': '"', 'dq': '\''
411 NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
412 but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
414 parser = HTMLAttributeParser()
416 parser.feed(html_element)
418 # Older Python may throw HTMLParseError in case of malformed HTML
419 except compat_HTMLParseError:
def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    text = html.replace('\n', ' ')
    # Turn <br> and paragraph boundaries into real newlines
    text = re.sub(r'(?u)\s*<\s*br\s*/?\s*>\s*', '\n', text)
    text = re.sub(r'(?u)<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', text)
    # Drop any remaining markup
    text = re.sub('<.*?>', '', text)
    # Replace html entities
    text = unescapeHTML(text)
    return text.strip()
441 def sanitize_open(filename, open_mode):
442 """Try to open the given filename, and slightly tweak it if this fails.
444 Attempts to open the given filename. If this fails, it tries to change
445 the filename slightly, step by step, until it's either able to open it
446 or it fails and raises a final exception, like the standard open()
449 It returns the tuple (stream, definitive_file_name).
453 if sys.platform == 'win32':
455 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
456 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
457 stream = open(encodeFilename(filename), open_mode)
458 return (stream, filename)
459 except (IOError, OSError) as err:
460 if err.errno in (errno.EACCES,):
463 # In case of error, try to remove win32 forbidden chars
464 alt_filename = sanitize_path(filename)
465 if alt_filename == filename:
468 # An exception here should be caught in the caller
469 stream = open(encodeFilename(alt_filename), open_mode)
470 return (stream, alt_filename)
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        # Unparseable input yields None rather than raising
        return None
    return email.utils.mktime_tz(parsed)
482 def sanitize_filename(s, restricted=False, is_id=False):
483 """Sanitizes a string so it could be used as part of a filename.
484 If restricted is set, use a stricter subset of allowed characters.
485 Set is_id if this is not an arbitrary string, but an ID that should be kept
488 def replace_insane(char):
489 if restricted and char in ACCENT_CHARS:
490 return ACCENT_CHARS[char]
491 if char == '?' or ord(char) < 32 or ord(char) == 127:
494 return '' if restricted else '\''
496 return '_-' if restricted else ' -'
497 elif char in '\\/|*<>':
499 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
501 if restricted and ord(char) > 127:
506 s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
507 result = ''.join(map(replace_insane, s))
509 while '__' in result:
510 result = result.replace('__', '_')
511 result = result.strip('_')
512 # Common case of "Foreign band name - English song title"
513 if restricted and result.startswith('-_'):
515 if result.startswith('-'):
516 result = '_' + result[len('-'):]
517 result = result.lstrip('.')
523 def sanitize_path(s):
524 """Sanitizes and normalizes path on Windows"""
525 if sys.platform != 'win32':
527 drive_or_unc, _ = os.path.splitdrive(s)
528 if sys.version_info < (2, 7) and not drive_or_unc:
529 drive_or_unc, _ = os.path.splitunc(s)
530 norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
534 path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
535 for path_part in norm_path]
537 sanitized_path.insert(0, drive_or_unc + os.path.sep)
538 return os.path.join(*sanitized_path)
def sanitize_url(url):
    """Repair scheme problems in a URL.

    Protocol-relative URLs get an `http:` scheme prepended in order to
    mitigate the number of unwanted failures due to missing protocol;
    a few scheme typos seen in the wild are fixed as well.
    """
    if url.startswith('//'):
        return 'http:%s' % url
    COMMON_TYPOS = (
        # https://github.com/rg3/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    )
    for mistake, fixup in COMMON_TYPOS:
        if re.match(mistake, url):
            return re.sub(mistake, fixup, url)
    return url
def sanitized_Request(url, *args, **kwargs):
    """Build a urllib Request with the URL passed through sanitize_url()."""
    cleaned = sanitize_url(url)
    return compat_urllib_request.Request(cleaned, *args, **kwargs)
564 """Expand shell variables and ~"""
565 return os.path.expandvars(compat_expanduser(s))
568 def orderedSet(iterable):
569 """ Remove all duplicates from the input iterable """
577 def _htmlentity_transform(entity_with_semicolon):
578 """Transforms an HTML entity to a character."""
579 entity = entity_with_semicolon[:-1]
581 # Known non-numeric HTML entity
582 if entity in compat_html_entities.name2codepoint:
583 return compat_chr(compat_html_entities.name2codepoint[entity])
585 # TODO: HTML5 allows entities without a semicolon. For example,
586 # 'Éric' should be decoded as 'Éric'.
587 if entity_with_semicolon in compat_html_entities_html5:
588 return compat_html_entities_html5[entity_with_semicolon]
590 mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
592 numstr = mobj.group(1)
593 if numstr.startswith('x'):
595 numstr = '0%s' % numstr
598 # See https://github.com/rg3/youtube-dl/issues/7518
600 return compat_chr(int(numstr, base))
604 # Unknown entity in name, return its literal representation
605 return '&%s;' % entity
611 assert type(s) == compat_str
614 r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
def get_subprocess_encoding():
    """Return the encoding used when exchanging data with subprocesses."""
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return encoding
629 def encodeFilename(s, for_subprocess=False):
631 @param s The name of the file
634 assert type(s) == compat_str
636 # Python 3 has a Unicode API
637 if sys.version_info >= (3, 0):
640 # Pass '' directly to use Unicode APIs on Windows 2000 and up
641 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
642 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
643 if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
646 # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
647 if sys.platform.startswith('java'):
650 return s.encode(get_subprocess_encoding(), 'ignore')
653 def decodeFilename(b, for_subprocess=False):
655 if sys.version_info >= (3, 0):
658 if not isinstance(b, bytes):
661 return b.decode(get_subprocess_encoding(), 'ignore')
def encodeArgument(s):
    """Encode a subprocess argument using the same rules as filenames."""
    if isinstance(s, compat_str):
        arg = s
    else:
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        arg = s.decode('ascii')
    return encodeFilename(arg, True)
def decodeArgument(b):
    """Inverse of encodeArgument(): decode a subprocess argument."""
    return decodeFilename(b, True)
677 def decodeOption(optval):
680 if isinstance(optval, bytes):
681 optval = optval.decode(preferredencoding())
683 assert isinstance(optval, compat_str)
def formatSeconds(secs):
    """Render a duration in seconds as H:MM:SS, M:SS or plain seconds."""
    if secs > 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    if secs > 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    return '%d' % secs
696 def make_HTTPS_handler(params, **kwargs):
697 opts_no_check_certificate = params.get('nocheckcertificate', False)
698 if hasattr(ssl, 'create_default_context'): # Python >= 3.4 or 2.7.9
699 context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
700 if opts_no_check_certificate:
701 context.check_hostname = False
702 context.verify_mode = ssl.CERT_NONE
704 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
707 # (create_default_context present but HTTPSHandler has no context=)
710 if sys.version_info < (3, 2):
711 return YoutubeDLHTTPSHandler(params, **kwargs)
713 context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
714 context.verify_mode = (ssl.CERT_NONE
715 if opts_no_check_certificate
716 else ssl.CERT_REQUIRED)
717 context.set_default_verify_paths()
718 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
def bug_reports_message():
    """Assemble the standard "please report this bug" trailer text."""
    if ytdl_is_updateable():
        update_cmd = 'type youtube-dl -U to update'
    else:
        update_cmd = 'see https://yt-dl.org/update on how to update'
    parts = [
        '; please report this issue on https://yt-dl.org/bug .',
        ' Make sure you are using the latest version; %s.' % update_cmd,
        ' Be sure to call youtube-dl with the --verbose flag and include its complete output.',
    ]
    return ''.join(parts)
class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    pass
737 class ExtractorError(YoutubeDLError):
738 """Error during info extraction."""
740 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
741 """ tb, if given, is the original traceback (so that it can be printed out).
742 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
745 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
747 if video_id is not None:
748 msg = video_id + ': ' + msg
750 msg += ' (caused by %r)' % cause
752 msg += bug_reports_message()
753 super(ExtractorError, self).__init__(msg)
756 self.exc_info = sys.exc_info() # preserve original exception
758 self.video_id = video_id
760 def format_traceback(self):
761 if self.traceback is None:
763 return ''.join(traceback.format_tb(self.traceback))
class UnsupportedError(ExtractorError):
    """Raised when no extractor can handle the given URL."""
    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super(UnsupportedError, self).__init__(message, expected=True)
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass
class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """
    def __init__(self, msg, countries=None):
        # Always an "expected" error: not a bug in youtube-dl
        super(GeoRestrictedError, self).__init__(msg, expected=True)
        # Country codes the content is restricted to, when known
        self.countries = countries
class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info
804 class SameFileError(YoutubeDLError):
805 """Same File exception.
807 This exception will be thrown by FileDownloader objects if they detect
808 multiple files would have to be downloaded to the same file on disk.
813 class PostProcessingError(YoutubeDLError):
814 """Post Processing exception.
816 This exception may be raised by PostProcessor's .run() method to
817 indicate an error in the postprocessing task.
820 def __init__(self, msg):
821 super(PostProcessingError, self).__init__(msg)
class MaxDownloadsReached(YoutubeDLError):
    """ --max-downloads limit has been reached. """
    pass
830 class UnavailableVideoError(YoutubeDLError):
831 """Unavailable Format exception.
833 This exception will be thrown when a video is requested
834 in a format that is not available for that video.
839 class ContentTooShortError(YoutubeDLError):
840 """Content Too Short exception.
842 This exception may be raised by FileDownloader objects when a file they
843 download is too small for what the server announced first, indicating
844 the connection was probably interrupted.
847 def __init__(self, downloaded, expected):
848 super(ContentTooShortError, self).__init__(
849 'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected)
852 self.downloaded = downloaded
853 self.expected = expected
856 class XAttrMetadataError(YoutubeDLError):
857 def __init__(self, code=None, msg='Unknown error'):
858 super(XAttrMetadataError, self).__init__(msg)
862 # Parsing code and msg
863 if (self.code in (errno.ENOSPC, errno.EDQUOT) or
864 'No space left' in self.msg or 'Disk quota excedded' in self.msg):
865 self.reason = 'NO_SPACE'
866 elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
867 self.reason = 'VALUE_TOO_LONG'
869 self.reason = 'NOT_SUPPORTED'
872 class XAttrUnavailableError(YoutubeDLError):
876 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
877 # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
878 # expected HTTP responses to meet HTTP/1.0 or later (see also
879 # https://github.com/rg3/youtube-dl/issues/6727)
880 if sys.version_info < (3, 0):
881 kwargs['strict'] = True
882 hc = http_class(*args, **compat_kwargs(kwargs))
883 source_address = ydl_handler._params.get('source_address')
884 if source_address is not None:
885 sa = (source_address, 0)
886 if hasattr(hc, 'source_address'): # Python 2.7+
887 hc.source_address = sa
889 def _hc_connect(self, *args, **kwargs):
890 sock = compat_socket_create_connection(
891 (self.host, self.port), self.timeout, sa)
893 self.sock = ssl.wrap_socket(
894 sock, self.key_file, self.cert_file,
895 ssl_version=ssl.PROTOCOL_TLSv1)
898 hc.connect = functools.partial(_hc_connect, hc)
def handle_youtubedl_headers(headers):
    """Strip internal youtube-dl pseudo-headers before the real request.

    When 'Youtubedl-no-compression' is present, it is removed together
    with any 'Accept-Encoding' header (matched case-insensitively) so the
    server will not compress the response.
    """
    filtered_headers = headers

    if 'Youtubedl-no-compression' in filtered_headers:
        filtered_headers = {k: v for k, v in filtered_headers.items() if k.lower() != 'accept-encoding'}
        del filtered_headers['Youtubedl-no-compression']

    return filtered_headers
913 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
914 """Handler for HTTP requests and responses.
916 This class, when installed with an OpenerDirector, automatically adds
917 the standard headers to every HTTP request and handles gzipped and
918 deflated responses from web servers. If compression is to be avoided in
919 a particular request, the original request in the program code only has
920 to include the HTTP header "Youtubedl-no-compression", which will be
921 removed before making the real request.
923 Part of this code was copied from:
925 http://techknack.net/python-urllib2-handlers/
927 Andrew Rowls, the author of that code, agreed to release it to the
931 def __init__(self, params, *args, **kwargs):
932 compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
933 self._params = params
935 def http_open(self, req):
936 conn_class = compat_http_client.HTTPConnection
938 socks_proxy = req.headers.get('Ytdl-socks-proxy')
940 conn_class = make_socks_conn_class(conn_class, socks_proxy)
941 del req.headers['Ytdl-socks-proxy']
943 return self.do_open(functools.partial(
944 _create_http_connection, self, conn_class, False),
950 return zlib.decompress(data, -zlib.MAX_WBITS)
952 return zlib.decompress(data)
954 def http_request(self, req):
955 # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
956 # always respected by websites, some tend to give out URLs with non percent-encoded
957 # non-ASCII characters (see telemb.py, ard.py [#3412])
958 # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
959 # To work around aforementioned issue we will replace request's original URL with
960 # percent-encoded one
961 # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
962 # the code of this workaround has been moved here from YoutubeDL.urlopen()
963 url = req.get_full_url()
964 url_escaped = escape_url(url)
966 # Substitute URL if any change after escaping
967 if url != url_escaped:
968 req = update_Request(req, url=url_escaped)
970 for h, v in std_headers.items():
971 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
972 # The dict keys are capitalized because of this bug by urllib
973 if h.capitalize() not in req.headers:
976 req.headers = handle_youtubedl_headers(req.headers)
978 if sys.version_info < (2, 7) and '#' in req.get_full_url():
979 # Python 2.6 is brain-dead when it comes to fragments
980 req._Request__original = req._Request__original.partition('#')[0]
981 req._Request__r_type = req._Request__r_type.partition('#')[0]
985 def http_response(self, req, resp):
988 if resp.headers.get('Content-encoding', '') == 'gzip':
989 content = resp.read()
990 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
992 uncompressed = io.BytesIO(gz.read())
993 except IOError as original_ioerror:
994 # There may be junk add the end of the file
995 # See http://stackoverflow.com/q/4928560/35070 for details
996 for i in range(1, 1024):
998 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
999 uncompressed = io.BytesIO(gz.read())
1004 raise original_ioerror
1005 resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
1006 resp.msg = old_resp.msg
1007 del resp.headers['Content-encoding']
1009 if resp.headers.get('Content-encoding', '') == 'deflate':
1010 gz = io.BytesIO(self.deflate(resp.read()))
1011 resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
1012 resp.msg = old_resp.msg
1013 del resp.headers['Content-encoding']
1014 # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
1015 # https://github.com/rg3/youtube-dl/issues/6457).
1016 if 300 <= resp.code < 400:
1017 location = resp.headers.get('Location')
1019 # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
1020 if sys.version_info >= (3, 0):
1021 location = location.encode('iso-8859-1').decode('utf-8')
1023 location = location.decode('utf-8')
1024 location_escaped = escape_url(location)
1025 if location != location_escaped:
1026 del resp.headers['Location']
1027 if sys.version_info < (3, 0):
1028 location_escaped = location_escaped.encode('utf-8')
1029 resp.headers['Location'] = location_escaped
1032 https_request = http_request
1033 https_response = http_response
1036 def make_socks_conn_class(base_class, socks_proxy):
1037 assert issubclass(base_class, (
1038 compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))
1040 url_components = compat_urlparse.urlparse(socks_proxy)
1041 if url_components.scheme.lower() == 'socks5':
1042 socks_type = ProxyType.SOCKS5
1043 elif url_components.scheme.lower() in ('socks', 'socks4'):
1044 socks_type = ProxyType.SOCKS4
1045 elif url_components.scheme.lower() == 'socks4a':
1046 socks_type = ProxyType.SOCKS4A
1048 def unquote_if_non_empty(s):
1051 return compat_urllib_parse_unquote_plus(s)
1055 url_components.hostname, url_components.port or 1080,
1057 unquote_if_non_empty(url_components.username),
1058 unquote_if_non_empty(url_components.password),
1061 class SocksConnection(base_class):
1063 self.sock = sockssocket()
1064 self.sock.setproxy(*proxy_args)
1065 if type(self.timeout) in (int, float):
1066 self.sock.settimeout(self.timeout)
1067 self.sock.connect((self.host, self.port))
1069 if isinstance(self, compat_http_client.HTTPSConnection):
1070 if hasattr(self, '_context'): # Python > 2.6
1071 self.sock = self._context.wrap_socket(
1072 self.sock, server_hostname=self.host)
1074 self.sock = ssl.wrap_socket(self.sock)
1076 return SocksConnection
1079 class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
1080 def __init__(self, params, https_conn_class=None, *args, **kwargs):
1081 compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
1082 self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
1083 self._params = params
1085 def https_open(self, req):
1087 conn_class = self._https_conn_class
1089 if hasattr(self, '_context'): # python > 2.6
1090 kwargs['context'] = self._context
1091 if hasattr(self, '_check_hostname'): # python 3.x
1092 kwargs['check_hostname'] = self._check_hostname
1094 socks_proxy = req.headers.get('Ytdl-socks-proxy')
1096 conn_class = make_socks_conn_class(conn_class, socks_proxy)
1097 del req.headers['Ytdl-socks-proxy']
1099 return self.do_open(functools.partial(
1100 _create_http_connection, self, conn_class, True),
1104 class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
1105 def __init__(self, cookiejar=None):
1106 compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)
1108 def http_response(self, request, response):
1109 # Python 2 will choke on next HTTP request in row if there are non-ASCII
1110 # characters in Set-Cookie HTTP header of last response (see
1111 # https://github.com/rg3/youtube-dl/issues/6769).
1112 # In order to at least prevent crashing we will percent encode Set-Cookie
1113 # header before HTTPCookieProcessor starts processing it.
1114 # if sys.version_info < (3, 0) and response.headers:
1115 # for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
1116 # set_cookie = response.headers.get(set_cookie_header)
1118 # set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
1119 # if set_cookie != set_cookie_escaped:
1120 # del response.headers[set_cookie_header]
1121 # response.headers[set_cookie_header] = set_cookie_escaped
1122 return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)
1124 https_request = compat_urllib_request.HTTPCookieProcessor.http_request
1125 https_response = http_response
def extract_timezone(date_str):
    """Split a trailing timezone designator off *date_str*.

    Returns (utc_offset, remaining_date_str); the offset is a zero
    timedelta when no designator (or a bare 'Z') is found.
    """
    m = re.search(
        r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
        date_str)
    if not m:
        return datetime.timedelta(), date_str
    date_str = date_str[:-len(m.group('tz'))]
    if not m.group('sign'):
        # A bare 'Z' designator: UTC
        return datetime.timedelta(), date_str
    sign = 1 if m.group('sign') == '+' else -1
    offset = datetime.timedelta(
        hours=sign * int(m.group('hours')),
        minutes=sign * int(m.group('minutes')))
    return offset, date_str
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """

    if date_str is None:
        return None

    # strptime cannot digest fractional seconds here; drop them
    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    try:
        date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
        dt = datetime.datetime.strptime(date_str, date_format) - timezone
        return calendar.timegm(dt.timetuple())
    except ValueError:
        # Not an ISO 8601-style timestamp
        return None
def date_formats(day_first=True):
    """Return the strptime format list matching the expected day/month order."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1169 def unified_strdate(date_str, day_first=True):
1170 """Return a string with the date in the format YYYYMMDD"""
1172 if date_str is None:
1176 date_str = date_str.replace(',', ' ')
1177 # Remove AM/PM + timezone
1178 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1179 _, date_str = extract_timezone(date_str)
1181 for expression in date_formats(day_first):
1183 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
1186 if upload_date is None:
1187 timetuple = email.utils.parsedate_tz(date_str)
1190 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
1193 if upload_date is not None:
1194 return compat_str(upload_date)
1197 def unified_timestamp(date_str, day_first=True):
1198 if date_str is None:
1201 date_str = re.sub(r'[,|]', '', date_str)
1203 pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
1204 timezone, date_str = extract_timezone(date_str)
1206 # Remove AM/PM + timezone
1207 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1209 # Remove unrecognized timezones from ISO 8601 alike timestamps
1210 m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1212 date_str = date_str[:-len(m.group('tz'))]
1214 for expression in date_formats(day_first):
1216 dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
1217 return calendar.timegm(dt.timetuple())
1220 timetuple = email.utils.parsedate_tz(date_str)
1222 return calendar.timegm(timetuple) + pm_delta * 3600
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from a URL, falling back to *default_ext*."""
    if url is None or '.' not in url:
        return default_ext
    guess = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess
    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    trimmed = guess.rstrip('/')
    if trimmed in KNOWN_EXTENSIONS:
        return trimmed
    return default_ext
def subtitles_filename(filename, sub_lang, sub_format):
    """Derive the subtitle file name (base.lang.format) from a media file name."""
    base = filename.rsplit('.', 1)[0]
    return '.'.join((base, sub_lang, sub_format))
def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        amount = int(match.group('time'))
        if match.group('sign') == '-':
            amount = -amount
        unit = match.group('unit')
        # A bad approximation?
        if unit == 'month':
            unit = 'day'
            amount *= 30
        elif unit == 'year':
            unit = 'day'
            amount *= 365
        unit += 's'
        return today + datetime.timedelta(**{unit: amount})
    return datetime.datetime.strptime(date_str, '%Y%m%d').date()
1271 def hyphenate_date(date_str):
1273 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1274 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1275 if match is not None:
1276 return '-'.join(match.groups())
1281 class DateRange(object):
1282 """Represents a time interval between two dates"""
1284 def __init__(self, start=None, end=None):
1285 """start and end must be strings in the format accepted by date"""
1286 if start is not None:
1287 self.start = date_from_str(start)
1289 self.start = datetime.datetime.min.date()
1291 self.end = date_from_str(end)
1293 self.end = datetime.datetime.max.date()
1294 if self.start > self.end:
1295 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
1299 """Returns a range that only contains the given day"""
1300 return cls(day, day)
1302 def __contains__(self, date):
1303 """Check if the date is in the range"""
1304 if not isinstance(date, datetime.date):
1305 date = date_from_str(date)
1306 return self.start <= date <= self.end
1309 return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
1312 def platform_name():
1313 """ Returns the platform name as a compat_str """
1314 res = platform.platform()
1315 if isinstance(res, bytes):
1316 res = res.decode(preferredencoding())
1318 assert isinstance(res, compat_str)
1322 def _windows_write_string(s, out):
1323 """ Returns True if the string was written using special methods,
1324 False if it has yet to be written out."""
1325 # Adapted from http://stackoverflow.com/a/3259271/35070
1328 import ctypes.wintypes
1336 fileno = out.fileno()
1337 except AttributeError:
1338 # If the output stream doesn't have a fileno, it's virtual
1340 except io.UnsupportedOperation:
1341 # Some strange Windows pseudo files?
1343 if fileno not in WIN_OUTPUT_IDS:
1346 GetStdHandle = compat_ctypes_WINFUNCTYPE(
1347 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
1348 ('GetStdHandle', ctypes.windll.kernel32))
1349 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
1351 WriteConsoleW = compat_ctypes_WINFUNCTYPE(
1352 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
1353 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
1354 ctypes.wintypes.LPVOID)(('WriteConsoleW', ctypes.windll.kernel32))
1355 written = ctypes.wintypes.DWORD(0)
1357 GetFileType = compat_ctypes_WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(('GetFileType', ctypes.windll.kernel32))
1358 FILE_TYPE_CHAR = 0x0002
1359 FILE_TYPE_REMOTE = 0x8000
1360 GetConsoleMode = compat_ctypes_WINFUNCTYPE(
1361 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
1362 ctypes.POINTER(ctypes.wintypes.DWORD))(
1363 ('GetConsoleMode', ctypes.windll.kernel32))
1364 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
1366 def not_a_console(handle):
1367 if handle == INVALID_HANDLE_VALUE or handle is None:
1369 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
1370 GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
1372 if not_a_console(h):
1375 def next_nonbmp_pos(s):
1377 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
1378 except StopIteration:
1382 count = min(next_nonbmp_pos(s), 1024)
1384 ret = WriteConsoleW(
1385 h, s, count if count else 2, ctypes.byref(written), None)
1387 raise OSError('Failed to write string')
1388 if not count: # We just wrote a non-BMP character
1389 assert written.value == 2
1392 assert written.value > 0
1393 s = s[written.value:]
1397 def write_string(s, out=None, encoding=None):
1400 assert type(s) == compat_str
1402 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
1403 if _windows_write_string(s, out):
1406 if ('b' in getattr(out, 'mode', '') or
1407 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
1408 byt = s.encode(encoding or preferredencoding(), 'ignore')
1410 elif hasattr(out, 'buffer'):
1411 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
1412 byt = s.encode(enc, 'ignore')
1413 out.buffer.write(byt)
1419 def bytes_to_intlist(bs):
1422 if isinstance(bs[0], int): # Python 3
1425 return [ord(c) for c in bs]
1428 def intlist_to_bytes(xs):
1431 return compat_struct_pack('%dB' % len(xs), *xs)
1434 # Cross-platform file locking
1435 if sys.platform == 'win32':
1436 import ctypes.wintypes
1439 class OVERLAPPED(ctypes.Structure):
1441 ('Internal', ctypes.wintypes.LPVOID),
1442 ('InternalHigh', ctypes.wintypes.LPVOID),
1443 ('Offset', ctypes.wintypes.DWORD),
1444 ('OffsetHigh', ctypes.wintypes.DWORD),
1445 ('hEvent', ctypes.wintypes.HANDLE),
1448 kernel32 = ctypes.windll.kernel32
1449 LockFileEx = kernel32.LockFileEx
1450 LockFileEx.argtypes = [
1451 ctypes.wintypes.HANDLE, # hFile
1452 ctypes.wintypes.DWORD, # dwFlags
1453 ctypes.wintypes.DWORD, # dwReserved
1454 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1455 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1456 ctypes.POINTER(OVERLAPPED) # Overlapped
1458 LockFileEx.restype = ctypes.wintypes.BOOL
1459 UnlockFileEx = kernel32.UnlockFileEx
1460 UnlockFileEx.argtypes = [
1461 ctypes.wintypes.HANDLE, # hFile
1462 ctypes.wintypes.DWORD, # dwReserved
1463 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1464 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1465 ctypes.POINTER(OVERLAPPED) # Overlapped
1467 UnlockFileEx.restype = ctypes.wintypes.BOOL
1468 whole_low = 0xffffffff
1469 whole_high = 0x7fffffff
1471 def _lock_file(f, exclusive):
1472 overlapped = OVERLAPPED()
1473 overlapped.Offset = 0
1474 overlapped.OffsetHigh = 0
1475 overlapped.hEvent = 0
1476 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
1477 handle = msvcrt.get_osfhandle(f.fileno())
1478 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
1479 whole_low, whole_high, f._lock_file_overlapped_p):
1480 raise OSError('Locking file failed: %r' % ctypes.FormatError())
1482 def _unlock_file(f):
1483 assert f._lock_file_overlapped_p
1484 handle = msvcrt.get_osfhandle(f.fileno())
1485 if not UnlockFileEx(handle, 0,
1486 whole_low, whole_high, f._lock_file_overlapped_p):
1487 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1490 # Some platforms, such as Jython, is missing fcntl
1494 def _lock_file(f, exclusive):
1495 fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
1497 def _unlock_file(f):
1498 fcntl.flock(f, fcntl.LOCK_UN)
1500 UNSUPPORTED_MSG = 'file locking is not supported on this platform'
1502 def _lock_file(f, exclusive):
1503 raise IOError(UNSUPPORTED_MSG)
1505 def _unlock_file(f):
1506 raise IOError(UNSUPPORTED_MSG)
1509 class locked_file(object):
1510 def __init__(self, filename, mode, encoding=None):
1511 assert mode in ['r', 'a', 'w']
1512 self.f = io.open(filename, mode, encoding=encoding)
1515 def __enter__(self):
1516 exclusive = self.mode != 'r'
1518 _lock_file(self.f, exclusive)
1524 def __exit__(self, etype, value, traceback):
1526 _unlock_file(self.f)
1533 def write(self, *args):
1534 return self.f.write(*args)
1536 def read(self, *args):
1537 return self.f.read(*args)
def get_filesystem_encoding():
    """Name of the filesystem encoding, falling back to UTF-8 when unknown."""
    enc = sys.getfilesystemencoding()
    if enc is None:
        enc = 'utf-8'
    return enc
1545 def shell_quote(args):
1547 encoding = get_filesystem_encoding()
1549 if isinstance(a, bytes):
1550 # We may get a filename encoded with 'encodeFilename'
1551 a = a.decode(encoding)
1552 quoted_args.append(compat_shlex_quote(a))
1553 return ' '.join(quoted_args)
1556 def smuggle_url(url, data):
1557 """ Pass additional data in a URL for internal use. """
1559 url, idata = unsmuggle_url(url, {})
1561 sdata = compat_urllib_parse_urlencode(
1562 {'__youtubedl_smuggle': json.dumps(data)})
1563 return url + '#' + sdata
1566 def unsmuggle_url(smug_url, default=None):
1567 if '#__youtubedl_smuggle' not in smug_url:
1568 return smug_url, default
1569 url, _, sdata = smug_url.rpartition('#')
1570 jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
1571 data = json.loads(jsond)
1575 def format_bytes(bytes):
1578 if type(bytes) is str:
1579 bytes = float(bytes)
1583 exponent = int(math.log(bytes, 1024.0))
1584 suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
1585 converted = float(bytes) / float(1024 ** exponent)
1586 return '%.2f%s' % (converted, suffix)
1589 def lookup_unit_table(unit_table, s):
1590 units_re = '|'.join(re.escape(u) for u in unit_table)
1592 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
1595 num_str = m.group('num').replace(',', '.')
1596 mult = unit_table[m.group('unit')]
1597 return int(float(num_str) * mult)
1600 def parse_filesize(s):
1604 # The lower-case forms are of course incorrect and unofficial,
1605 # but we support those too
1622 'megabytes': 1000 ** 2,
1623 'mebibytes': 1024 ** 2,
1629 'gigabytes': 1000 ** 3,
1630 'gibibytes': 1024 ** 3,
1636 'terabytes': 1000 ** 4,
1637 'tebibytes': 1024 ** 4,
1643 'petabytes': 1000 ** 5,
1644 'pebibytes': 1024 ** 5,
1650 'exabytes': 1000 ** 6,
1651 'exbibytes': 1024 ** 6,
1657 'zettabytes': 1000 ** 7,
1658 'zebibytes': 1024 ** 7,
1664 'yottabytes': 1000 ** 8,
1665 'yobibytes': 1024 ** 8,
1668 return lookup_unit_table(_UNIT_TABLE, s)
1677 if re.match(r'^[\d,.]+$', s):
1678 return str_to_int(s)
1689 return lookup_unit_table(_UNIT_TABLE, s)
1692 def parse_resolution(s):
1696 mobj = re.search(r'\b(?P<w>\d+)\s*[xX×]\s*(?P<h>\d+)\b', s)
1699 'width': int(mobj.group('w')),
1700 'height': int(mobj.group('h')),
1703 mobj = re.search(r'\b(\d+)[pPiI]\b', s)
1705 return {'height': int(mobj.group(1))}
1707 mobj = re.search(r'\b([48])[kK]\b', s)
1709 return {'height': int(mobj.group(1)) * 540}
1714 def month_by_name(name, lang='en'):
1715 """ Return the number of a month by (locale-independently) English name """
1717 month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
1720 return month_names.index(name) + 1
1725 def month_by_abbreviation(abbrev):
1726 """ Return the number of a month by (locale-independently) English
1730 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
1735 def fix_xml_ampersands(xml_str):
1736 """Replace all the '&' by '&' in XML"""
1738 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
1743 def setproctitle(title):
1744 assert isinstance(title, compat_str)
1746 # ctypes in Jython is not complete
1747 # http://bugs.jython.org/issue2148
1748 if sys.platform.startswith('java'):
1752 libc = ctypes.cdll.LoadLibrary('libc.so.6')
1756 # LoadLibrary in Windows Python 2.7.13 only expects
1757 # a bytestring, but since unicode_literals turns
1758 # every string into a unicode string, it fails.
1760 title_bytes = title.encode('utf-8')
1761 buf = ctypes.create_string_buffer(len(title_bytes))
1762 buf.value = title_bytes
1764 libc.prctl(15, buf, 0, 0, 0)
1765 except AttributeError:
1766 return # Strange libc, just skip this
def remove_start(s, start):
    """Strip prefix `start` from `s` when present; None passes through."""
    if s is None or not s.startswith(start):
        return s
    return s[len(start):]
def remove_end(s, end):
    """Strip suffix `end` from `s` when present; None passes through.

    Fix: an empty `end` previously produced s[:-0] == s[:0] == '', wiping
    the whole string; an empty suffix is now a no-op.
    """
    if s is None or not end or not s.endswith(end):
        return s
    return s[:-len(end)]
1777 def remove_quotes(s):
1778 if s is None or len(s) < 2:
1780 for quote in ('"', "'", ):
1781 if s[0] == quote and s[-1] == quote:
def url_basename(url):
    """Return the last path component of `url`, ignoring query/fragment."""
    path = compat_urlparse.urlparse(url).path
    return path.rstrip('/').rpartition('/')[2]
1792 return re.match(r'https?://[^?#&]+/', url).group()
1795 def urljoin(base, path):
1796 if isinstance(path, bytes):
1797 path = path.decode('utf-8')
1798 if not isinstance(path, compat_str) or not path:
1800 if re.match(r'^(?:https?:)?//', path):
1802 if isinstance(base, bytes):
1803 base = base.decode('utf-8')
1804 if not isinstance(base, compat_str) or not re.match(
1805 r'^(?:https?:)?//', base):
1807 return compat_urlparse.urljoin(base, path)
1810 class HEADRequest(compat_urllib_request.Request):
1811 def get_method(self):
1815 class PUTRequest(compat_urllib_request.Request):
1816 def get_method(self):
1820 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
1823 v = getattr(v, get_attr, None)
1829 return int(v) * invscale // scale
def str_or_none(v, default=None):
    """Convert `v` to text, or return `default` when `v` is None."""
    if v is None:
        return default
    return compat_str(v)
1838 def str_to_int(int_str):
1839 """ A more relaxed version of int_or_none """
1842 int_str = re.sub(r'[,\.\+]', '', int_str)
1846 def float_or_none(v, scale=1, invscale=1, default=None):
1850 return float(v) * invscale / scale
def bool_or_none(v, default=None):
    """Return `v` only when it is a real bool; anything else yields `default`."""
    if isinstance(v, bool):
        return v
    return default
def strip_or_none(v, default=None):
    """Return `v.strip()`, or `default` when `v` is None.

    Generalized with a backward-compatible `default` parameter, matching
    the signatures of the sibling `str_or_none`/`bool_or_none` helpers.
    """
    return default if v is None else v.strip()
1863 def parse_duration(s):
1864 if not isinstance(s, compat_basestring):
1869 days, hours, mins, secs, ms = [None] * 5
1870 m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?Z?$', s)
1872 days, hours, mins, secs, ms = m.groups()
1877 [0-9]+\s*y(?:ears?)?\s*
1880 [0-9]+\s*m(?:onths?)?\s*
1883 [0-9]+\s*w(?:eeks?)?\s*
1886 (?P<days>[0-9]+)\s*d(?:ays?)?\s*
1890 (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
1893 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
1896 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
1899 days, hours, mins, secs, ms = m.groups()
1901 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
1903 hours, mins = m.groups()
1909 duration += float(secs)
1911 duration += float(mins) * 60
1913 duration += float(hours) * 60 * 60
1915 duration += float(days) * 24 * 60 * 60
1917 duration += float(ms)
1921 def prepend_extension(filename, ext, expected_real_ext=None):
1922 name, real_ext = os.path.splitext(filename)
1924 '{0}.{1}{2}'.format(name, ext, real_ext)
1925 if not expected_real_ext or real_ext[1:] == expected_real_ext
1926 else '{0}.{1}'.format(filename, ext))
1929 def replace_extension(filename, ext, expected_real_ext=None):
1930 name, real_ext = os.path.splitext(filename)
1931 return '{0}.{1}'.format(
1932 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
1936 def check_executable(exe, args=[]):
1937 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1938 args can be a list of arguments for a short output (like -version) """
1940 subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
1946 def get_exe_version(exe, args=['--version'],
1947 version_re=None, unrecognized='present'):
1948 """ Returns the version of the specified executable,
1949 or False if the executable is not present """
1951 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
1952 # SIGTTOU if youtube-dl is run in the background.
1953 # See https://github.com/rg3/youtube-dl/issues/955#issuecomment-209789656
1954 out, _ = subprocess.Popen(
1955 [encodeArgument(exe)] + args,
1956 stdin=subprocess.PIPE,
1957 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
1960 if isinstance(out, bytes): # Python 2.x
1961 out = out.decode('ascii', 'ignore')
1962 return detect_exe_version(out, version_re, unrecognized)
1965 def detect_exe_version(output, version_re=None, unrecognized='present'):
1966 assert isinstance(output, compat_str)
1967 if version_re is None:
1968 version_re = r'version\s+([-0-9._a-zA-Z]+)'
1969 m = re.search(version_re, output)
1976 class PagedList(object):
1978 # This is only useful for tests
1979 return len(self.getslice())
1982 class OnDemandPagedList(PagedList):
1983 def __init__(self, pagefunc, pagesize, use_cache=True):
1984 self._pagefunc = pagefunc
1985 self._pagesize = pagesize
1986 self._use_cache = use_cache
1990 def getslice(self, start=0, end=None):
1992 for pagenum in itertools.count(start // self._pagesize):
1993 firstid = pagenum * self._pagesize
1994 nextfirstid = pagenum * self._pagesize + self._pagesize
1995 if start >= nextfirstid:
2000 page_results = self._cache.get(pagenum)
2001 if page_results is None:
2002 page_results = list(self._pagefunc(pagenum))
2004 self._cache[pagenum] = page_results
2007 start % self._pagesize
2008 if firstid <= start < nextfirstid
2012 ((end - 1) % self._pagesize) + 1
2013 if (end is not None and firstid <= end <= nextfirstid)
2016 if startv != 0 or endv is not None:
2017 page_results = page_results[startv:endv]
2018 res.extend(page_results)
2020 # A little optimization - if current page is not "full", ie. does
2021 # not contain page_size videos then we can assume that this page
2022 # is the last one - there are no more ids on further pages -
2023 # i.e. no need to query again.
2024 if len(page_results) + startv < self._pagesize:
2027 # If we got the whole page, but the next page is not interesting,
2028 # break out early as well
2029 if end == nextfirstid:
2034 class InAdvancePagedList(PagedList):
2035 def __init__(self, pagefunc, pagecount, pagesize):
2036 self._pagefunc = pagefunc
2037 self._pagecount = pagecount
2038 self._pagesize = pagesize
2040 def getslice(self, start=0, end=None):
2042 start_page = start // self._pagesize
2044 self._pagecount if end is None else (end // self._pagesize + 1))
2045 skip_elems = start - start_page * self._pagesize
2046 only_more = None if end is None else end - start
2047 for pagenum in range(start_page, end_page):
2048 page = list(self._pagefunc(pagenum))
2050 page = page[skip_elems:]
2052 if only_more is not None:
2053 if len(page) < only_more:
2054 only_more -= len(page)
2056 page = page[:only_more]
2063 def uppercase_escape(s):
2064 unicode_escape = codecs.getdecoder('unicode_escape')
2066 r'\\U[0-9a-fA-F]{8}',
2067 lambda m: unicode_escape(m.group(0))[0],
2071 def lowercase_escape(s):
2072 unicode_escape = codecs.getdecoder('unicode_escape')
2074 r'\\u[0-9a-fA-F]{4}',
2075 lambda m: unicode_escape(m.group(0))[0],
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986."""
    # Python 2's quote() chokes on unicode input; pre-encode to UTF-8 there.
    if sys.version_info < (3, 0) and isinstance(s, compat_str):
        s = s.encode('utf-8')
    # Keep every RFC 3986 reserved/sub-delim character literal.
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    return compat_urllib_parse.quote(s, safe_chars)
2086 def escape_url(url):
2087 """Escape URL as suggested by RFC 3986"""
2088 url_parsed = compat_urllib_parse_urlparse(url)
2089 return url_parsed._replace(
2090 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
2091 path=escape_rfc3986(url_parsed.path),
2092 params=escape_rfc3986(url_parsed.params),
2093 query=escape_rfc3986(url_parsed.query),
2094 fragment=escape_rfc3986(url_parsed.fragment)
2098 def read_batch_urls(batch_fd):
2100 if not isinstance(url, compat_str):
2101 url = url.decode('utf-8', 'replace')
2102 BOM_UTF8 = '\xef\xbb\xbf'
2103 if url.startswith(BOM_UTF8):
2104 url = url[len(BOM_UTF8):]
2106 if url.startswith(('#', ';', ']')):
2110 with contextlib.closing(batch_fd) as fd:
2111 return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode form data and return it as ASCII bytes for a POST body."""
    encoded = compat_urllib_parse_urlencode(*args, **kargs)
    return encoded.encode('ascii')
2118 def update_url_query(url, query):
2121 parsed_url = compat_urlparse.urlparse(url)
2122 qs = compat_parse_qs(parsed_url.query)
2124 return compat_urlparse.urlunparse(parsed_url._replace(
2125 query=compat_urllib_parse_urlencode(qs, True)))
2128 def update_Request(req, url=None, data=None, headers={}, query={}):
2129 req_headers = req.headers.copy()
2130 req_headers.update(headers)
2131 req_data = data or req.data
2132 req_url = update_url_query(url or req.get_full_url(), query)
2133 req_get_method = req.get_method()
2134 if req_get_method == 'HEAD':
2135 req_type = HEADRequest
2136 elif req_get_method == 'PUT':
2137 req_type = PUTRequest
2139 req_type = compat_urllib_request.Request
2141 req_url, data=req_data, headers=req_headers,
2142 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
2143 if hasattr(req, 'timeout'):
2144 new_req.timeout = req.timeout
2148 def _multipart_encode_impl(data, boundary):
2149 content_type = 'multipart/form-data; boundary=%s' % boundary
2152 for k, v in data.items():
2153 out += b'--' + boundary.encode('ascii') + b'\r\n'
2154 if isinstance(k, compat_str):
2155 k = k.encode('utf-8')
2156 if isinstance(v, compat_str):
2157 v = v.encode('utf-8')
2158 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
2159 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
2160 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
2161 if boundary.encode('ascii') in content:
2162 raise ValueError('Boundary overlaps with data')
2165 out += b'--' + boundary.encode('ascii') + b'--\r\n'
2167 return out, content_type
2170 def multipart_encode(data, boundary=None):
2172 Encode a dict to RFC 7578-compliant form-data
2175 A dict where keys and values can be either Unicode or bytes-like
2178 If specified a Unicode object, it's used as the boundary. Otherwise
2179 a random boundary is generated.
2181 Reference: https://tools.ietf.org/html/rfc7578
2183 has_specified_boundary = boundary is not None
2186 if boundary is None:
2187 boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
2190 out, content_type = _multipart_encode_impl(data, boundary)
2193 if has_specified_boundary:
2197 return out, content_type
2200 def dict_get(d, key_or_keys, default=None, skip_false_values=True):
2201 if isinstance(key_or_keys, (list, tuple)):
2202 for key in key_or_keys:
2203 if key not in d or d[key] is None or skip_false_values and not d[key]:
2207 return d.get(key_or_keys, default)
2210 def try_get(src, getter, expected_type=None):
2211 if not isinstance(getter, (list, tuple)):
2216 except (AttributeError, KeyError, TypeError, IndexError):
2219 if expected_type is None or isinstance(v, expected_type):
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return `string` as compat_str, decoding bytes-like input with `encoding`.

    NOTE: the default `encoding` is evaluated once at definition time; kept
    as-is to preserve the original interface.
    """
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
2236 TV_PARENTAL_GUIDELINES = {
2246 def parse_age_limit(s):
2248 return s if 0 <= s <= 21 else None
2249 if not isinstance(s, compat_basestring):
2251 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
2253 return int(m.group('age'))
2255 return US_RATINGS[s]
2256 return TV_PARENTAL_GUIDELINES.get(s)
2259 def strip_jsonp(code):
2262 (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]+)
2263 (?:\s*&&\s*(?P=func_name))?
2264 \s*\(\s*(?P<callback_data>.*)\);?
2265 \s*?(?://[^\n]*)*$''',
2266 r'\g<callback_data>', code)
2269 def js_to_json(code):
2270 COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*'
2271 SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE)
2273 (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16),
2274 (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8),
2279 if v in ('true', 'false', 'null'):
2281 elif v.startswith('/*') or v.startswith('//') or v == ',':
2284 if v[0] in ("'", '"'):
2285 v = re.sub(r'(?s)\\.|"', lambda m: {
2290 }.get(m.group(0), m.group(0)), v[1:-1])
2292 for regex, base in INTEGER_TABLE:
2293 im = re.match(regex, v)
2295 i = int(im.group(1), base)
2296 return '"%d":' % i if v.endswith(':') else '%d' % i
2300 return re.sub(r'''(?sx)
2301 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
2302 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
2303 {comment}|,(?={skip}[\]}}])|
2304 (?:(?<![0-9])[eE]|[a-df-zA-DF-Z_])[.a-zA-Z_0-9]*|
2305 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
2307 '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
2310 def qualities(quality_ids):
2311 """ Get a numeric quality value out of a list of possible values """
2314 return quality_ids.index(qid)
2320 DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
2323 def limit_length(s, length):
2324 """ Add ellipses to overly long strings """
2329 return s[:length - len(ELLIPSES)] + ELLIPSES
def version_tuple(v):
    """Split a version string like '2016.06.03-1' into a tuple of ints."""
    # '-' and '.' are equivalent separators here.
    return tuple(map(int, v.replace('-', '.').split('.')))
2337 def is_outdated_version(version, limit, assume_new=True):
2339 return not assume_new
2341 return version_tuple(version) < version_tuple(limit)
2343 return not assume_new
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter
    running_from_zip = isinstance(globals().get('__loader__'), zipimporter)
    return running_from_zip or hasattr(sys, 'frozen')
def args_to_str(args):
    # Get a short string representation for a subprocess command
    quoted = [compat_shlex_quote(a) for a in args]
    return ' '.join(quoted)
2358 def error_to_compat_str(err):
2360 # On python 2 error byte string must be decoded with proper
2361 # encoding rather than ascii
2362 if sys.version_info[0] < 3:
2363 err_str = err_str.decode(preferredencoding())
2367 def mimetype2ext(mt):
2373 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
2374 # it's the most popular one
2375 'audio/mpeg': 'mp3',
2380 _, _, res = mt.rpartition('/')
2381 res = res.split(';')[0].strip().lower()
2385 'smptett+xml': 'tt',
2389 'x-mp4-fragmented': 'mp4',
2390 'x-ms-sami': 'sami',
2393 'x-mpegurl': 'm3u8',
2394 'vnd.apple.mpegurl': 'm3u8',
2398 'vnd.ms-sstr+xml': 'ism',
2404 def parse_codecs(codecs_str):
2405 # http://tools.ietf.org/html/rfc6381
2408 splited_codecs = list(filter(None, map(
2409 lambda str: str.strip(), codecs_str.strip().strip(',').split(','))))
2410 vcodec, acodec = None, None
2411 for full_codec in splited_codecs:
2412 codec = full_codec.split('.')[0]
2413 if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v', 'hvc1'):
2416 elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
2420 write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr)
2421 if not vcodec and not acodec:
2422 if len(splited_codecs) == 2:
2427 elif len(splited_codecs) == 1:
2434 'vcodec': vcodec or 'none',
2435 'acodec': acodec or 'none',
2440 def urlhandle_detect_ext(url_handle):
2441 getheader = url_handle.headers.get
2443 cd = getheader('Content-Disposition')
2445 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
2447 e = determine_ext(m.group('filename'), default_ext=None)
2451 return mimetype2ext(getheader('Content-Type'))
def encode_data_uri(data, mime_type):
    """Build a base64 `data:` URI for the given bytes and MIME type."""
    b64 = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, b64)
2458 def age_restricted(content_limit, age_limit):
2459 """ Returns True iff the content should be blocked """
2461 if age_limit is None: # No limit set
2463 if content_limit is None:
2464 return False # Content available for everyone
2465 return age_limit < content_limit
2468 def is_html(first_bytes):
2469 """ Detect whether a file contains HTML by examining its first bytes. """
2472 (b'\xef\xbb\xbf', 'utf-8'),
2473 (b'\x00\x00\xfe\xff', 'utf-32-be'),
2474 (b'\xff\xfe\x00\x00', 'utf-32-le'),
2475 (b'\xff\xfe', 'utf-16-le'),
2476 (b'\xfe\xff', 'utf-16-be'),
2478 for bom, enc in BOMS:
2479 if first_bytes.startswith(bom):
2480 s = first_bytes[len(bom):].decode(enc, 'replace')
2483 s = first_bytes.decode('utf-8', 'replace')
2485 return re.match(r'^\s*<', s)
2488 def determine_protocol(info_dict):
2489 protocol = info_dict.get('protocol')
2490 if protocol is not None:
2493 url = info_dict['url']
2494 if url.startswith('rtmp'):
2496 elif url.startswith('mms'):
2498 elif url.startswith('rtsp'):
2501 ext = determine_ext(url)
2507 return compat_urllib_parse_urlparse(url).scheme
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    rows = [header_row] + data
    # Widest cell per column decides that column's padding.
    widths = []
    for column in zip(*rows):
        widths.append(max(len(compat_str(cell)) for cell in column))
    # All but the last column are left-justified and space-separated;
    # the last one is emitted as-is.
    fmt = ' '.join('%-' + compat_str(w + 1) + 's' for w in widths[:-1]) + '%s'
    return '\n'.join(fmt % tuple(row) for row in rows)
2518 def _match_one(filter_part, dct):
2519 COMPARISON_OPERATORS = {
2527 operator_rex = re.compile(r'''(?x)\s*
2529 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
2531 (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
2532 (?P<quote>["\'])(?P<quotedstrval>(?:\\.|(?!(?P=quote)|\\).)+?)(?P=quote)|
2533 (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
2536 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
2537 m = operator_rex.search(filter_part)
2539 op = COMPARISON_OPERATORS[m.group('op')]
2540 actual_value = dct.get(m.group('key'))
2541 if (m.group('quotedstrval') is not None or
2542 m.group('strval') is not None or
2543 # If the original field is a string and matching comparisonvalue is
2544 # a number we should respect the origin of the original field
2545 # and process comparison value as a string (see
2546 # https://github.com/rg3/youtube-dl/issues/11082).
2547 actual_value is not None and m.group('intval') is not None and
2548 isinstance(actual_value, compat_str)):
2549 if m.group('op') not in ('=', '!='):
2551 'Operator %s does not support string values!' % m.group('op'))
2552 comparison_value = m.group('quotedstrval') or m.group('strval') or m.group('intval')
2553 quote = m.group('quote')
2554 if quote is not None:
2555 comparison_value = comparison_value.replace(r'\%s' % quote, quote)
2558 comparison_value = int(m.group('intval'))
2560 comparison_value = parse_filesize(m.group('intval'))
2561 if comparison_value is None:
2562 comparison_value = parse_filesize(m.group('intval') + 'B')
2563 if comparison_value is None:
2565 'Invalid integer value %r in filter part %r' % (
2566 m.group('intval'), filter_part))
2567 if actual_value is None:
2568 return m.group('none_inclusive')
2569 return op(actual_value, comparison_value)
2572 '': lambda v: v is not None,
2573 '!': lambda v: v is None,
2575 operator_rex = re.compile(r'''(?x)\s*
2576 (?P<op>%s)\s*(?P<key>[a-z_]+)
2578 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
2579 m = operator_rex.search(filter_part)
2581 op = UNARY_OPERATORS[m.group('op')]
2582 actual_value = dct.get(m.group('key'))
2583 return op(actual_value)
2585 raise ValueError('Invalid filter part %r' % filter_part)
2588 def match_str(filter_str, dct):
2589 """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
2592 _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
2595 def match_filter_func(filter_str):
2596 def _match_func(info_dict):
2597 if match_str(filter_str, info_dict):
2600 video_title = info_dict.get('title', info_dict.get('id', 'video'))
2601 return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
2605 def parse_dfxp_time_expr(time_expr):
2609 mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
2611 return float(mobj.group('time_offset'))
2613 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
2615 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
def srt_subtitles_timecode(seconds):
    """Format a number of seconds as an SRT timecode 'HH:MM:SS,mmm'."""
    hours = seconds / 3600
    minutes = (seconds % 3600) / 60
    secs = seconds % 60
    millis = (seconds % 1) * 1000
    # %d truncates each float component toward zero.
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, millis)
def dfxp2srt(dfxp_data):
    '''
    Convert DFXP (TTML) subtitles to SRT.

    @param dfxp_data A bytes-like object containing DFXP data
    @returns A unicode object containing converted SRT data
    '''
    # Older TTML namespaces are rewritten bytewise (pre-parse) to the current
    # ones so a single set of namespaced XPath expressions handles all inputs.
    LEGACY_NAMESPACES = (
        (b'http://www.w3.org/ns/ttml', [
            b'http://www.w3.org/2004/11/ttaf1',
            b'http://www.w3.org/2006/04/ttaf1',
            b'http://www.w3.org/2006/10/ttaf1',
        ]),
        (b'http://www.w3.org/ns/ttml#styling', [
            b'http://www.w3.org/ns/ttml#style',
        ]),
    )

    # The only tts:* style properties translated into SRT markup.
    SUPPORTED_STYLING = [
        'color',
        'fontFamily',
        'fontSize',
        'fontStyle',
        'fontWeight',
        'textDecoration'
    ]

    _x = functools.partial(xpath_with_ns, ns_map={
        'ttml': 'http://www.w3.org/ns/ttml',
        'tts': 'http://www.w3.org/ns/ttml#styling',
    })

    styles = {}
    default_style = {}

    class TTMLPElementParser(object):
        # Accumulated SRT text for one <p> element.
        _out = ''
        _unclosed_elements = []
        _applied_styles = []

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), 'br'):
                self._out += '\n'
            else:
                unclosed_elements = []
                style = {}
                element_style_id = attrib.get('style')
                if default_style:
                    style.update(default_style)
                if element_style_id:
                    style.update(styles.get(element_style_id, {}))
                for prop in SUPPORTED_STYLING:
                    prop_val = attrib.get(_x('tts:' + prop))
                    if prop_val:
                        style[prop] = prop_val
                if style:
                    font = ''
                    for k, v in sorted(style.items()):
                        # Skip properties already in force from an ancestor.
                        if self._applied_styles and self._applied_styles[-1].get(k) == v:
                            continue
                        if k == 'color':
                            font += ' color="%s"' % v
                        elif k == 'fontSize':
                            font += ' size="%s"' % v
                        elif k == 'fontFamily':
                            font += ' face="%s"' % v
                        elif k == 'fontWeight' and v == 'bold':
                            self._out += '<b>'
                            unclosed_elements.append('b')
                        elif k == 'fontStyle' and v == 'italic':
                            self._out += '<i>'
                            unclosed_elements.append('i')
                        elif k == 'textDecoration' and v == 'underline':
                            self._out += '<u>'
                            unclosed_elements.append('u')
                    if font:
                        self._out += '<font' + font + '>'
                        unclosed_elements.append('font')
                    applied_style = {}
                    if self._applied_styles:
                        applied_style.update(self._applied_styles[-1])
                    applied_style.update(style)
                    self._applied_styles.append(applied_style)
                self._unclosed_elements.append(unclosed_elements)

        def end(self, tag):
            if tag not in (_x('ttml:br'), 'br'):
                unclosed_elements = self._unclosed_elements.pop()
                for element in reversed(unclosed_elements):
                    self._out += '</%s>' % element
                if unclosed_elements and self._applied_styles:
                    self._applied_styles.pop()

        def data(self, data):
            self._out += data

        def close(self):
            return self._out.strip()

    def parse_node(node):
        # Re-serialize the node and run it through the stylizing parser above.
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    for k, v in LEGACY_NAMESPACES:
        for ns in v:
            dfxp_data = dfxp_data.replace(ns, k)

    dfxp = compat_etree_fromstring(dfxp_data)
    out = []
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    # Resolve style inheritance; loop until every parent reference resolves.
    repeat = False
    while True:
        for style in dfxp.findall(_x('.//ttml:style')):
            style_id = style.get('id')
            parent_style_id = style.get('style')
            if parent_style_id:
                if parent_style_id not in styles:
                    repeat = True
                    continue
                styles[style_id] = styles[parent_style_id].copy()
            for prop in SUPPORTED_STYLING:
                prop_val = style.get(_x('tts:' + prop))
                if prop_val:
                    styles.setdefault(style_id, {})[prop] = prop_val
        if repeat:
            repeat = False
        else:
            break

    # A style on <body> or <div> becomes the document default.
    for p in ('body', 'div'):
        ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
        if ele is None:
            continue
        style = styles.get(ele.get('style'))
        if not style:
            continue
        default_style.update(style)

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            continue
        if end_time is None:
            if dur is None:
                continue
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
def cli_option(params, command_option, param):
    """Return [command_option, value] for a set option, or [] if unset.

    Truthy values are coerced to str (compat_str) before being returned.
    """
    param = params.get(param)
    if param:
        param = compat_str(param)
    return [command_option, param] if param is not None else []
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Render a boolean option for a CLI command line.

    Returns [] when the option is unset; otherwise either
    ['--opt', 'true'/'false'] or, with a separator, ['--opt=true'].
    """
    param = params.get(param)
    if param is None:
        return []
    assert isinstance(param, bool)
    if separator:
        return [command_option + separator + (true_value if param else false_value)]
    return [command_option, true_value if param else false_value]
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Emit a bare flag ([command_option]) when params[param] matches
    expected_value; otherwise emit nothing ([])."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
def cli_configuration_args(params, param, default=[]):
    """Return the list of extra CLI args stored under params[param], or
    default when unset.

    NOTE: the mutable default is part of the public interface; it is only
    returned, never mutated here.
    """
    ex_args = params.get(param)
    if ex_args is None:
        return default
    assert isinstance(ex_args, list)
    return ex_args
class ISO639Utils(object):
    # Maps ISO 639-1 (two-letter) codes to ISO 639-2/T (three-letter) codes.
    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    # NOTE(review): the _lang_map table itself is missing from this listing —
    # restore it from the upstream file before use.

    # NOTE(review): the `cls` parameter implies this is a @classmethod; the
    # decorator appears to have been lost in this listing.
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        # Only the first two characters are significant (tolerates e.g. 'en-US').
        return cls._lang_map.get(code[:2])

    # NOTE(review): presumably also a @classmethod — decorator lost here too.
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        # Reverse lookup: linear scan over the forward map.
        for short_name, long_name in cls._lang_map.items():
            if long_name == code:
                # NOTE(review): the `return short_name` line appears truncated
                # from this listing.
class ISO3166Utils(object):
    """Lookup of full country names by ISO 3166-1 alpha-2 code."""
    # From http://data.okfn.org/data/core/country-list
    # NOTE(review): a number of rows appear to have been lost from this
    # listing (the upstream table covers every alpha-2 code); restore any
    # missing entries when merging.
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AS': 'American Samoa',
        'AG': 'Antigua and Barbuda',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BV': 'Bouvet Island',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BF': 'Burkina Faso',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CI': 'Côte d\'Ivoire',
        'CZ': 'Czech Republic',
        'DO': 'Dominican Republic',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GW': 'Guinea-Bissau',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'IR': 'Iran, Islamic Republic of',
        'IM': 'Isle of Man',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'LA': 'Lao People\'s Democratic Republic',
        'LI': 'Liechtenstein',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MH': 'Marshall Islands',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'PS': 'Palestine, State of',
        'PG': 'Papua New Guinea',
        'PH': 'Philippines',
        'PR': 'Puerto Rico',
        'RU': 'Russian Federation',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SL': 'Sierra Leone',
        'SX': 'Sint Maarten (Dutch part)',
        'SB': 'Solomon Islands',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'SJ': 'Svalbard and Jan Mayen',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TZ': 'Tanzania, United Republic of',
        'TL': 'Timor-Leste',
        'TT': 'Trinidad and Tobago',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
    }

    @classmethod
    def short2full(cls, code):
        """Convert an ISO 3166-2 country code to the corresponding full name"""
        # Case-insensitive; returns None for unknown codes.
        return cls._country_map.get(code.upper())
class GeoUtils(object):
    """Helpers for picking a plausible IP address inside a country's space."""
    # Major IPv4 address blocks per country (CIDR notation).
    # NOTE(review): a few rows appear to have been lost from this listing
    # (e.g. BE, DE, FR, GB, SY, US); restore them from upstream when merging.
    _country_ip_map = {
        'AD': '85.94.160.0/19',
        'AE': '94.200.0.0/13',
        'AF': '149.54.0.0/17',
        'AG': '209.59.64.0/18',
        'AI': '204.14.248.0/21',
        'AL': '46.99.0.0/16',
        'AM': '46.70.0.0/15',
        'AO': '105.168.0.0/13',
        'AP': '159.117.192.0/21',
        'AR': '181.0.0.0/12',
        'AS': '202.70.112.0/20',
        'AT': '84.112.0.0/13',
        'AU': '1.128.0.0/11',
        'AW': '181.41.0.0/18',
        'AZ': '5.191.0.0/16',
        'BA': '31.176.128.0/17',
        'BB': '65.48.128.0/17',
        'BD': '114.130.0.0/16',
        'BF': '129.45.128.0/17',
        'BG': '95.42.0.0/15',
        'BH': '37.131.0.0/17',
        'BI': '154.117.192.0/18',
        'BJ': '137.255.0.0/16',
        'BL': '192.131.134.0/24',
        'BM': '196.12.64.0/18',
        'BN': '156.31.0.0/16',
        'BO': '161.56.0.0/16',
        'BQ': '161.0.80.0/20',
        'BR': '152.240.0.0/12',
        'BS': '24.51.64.0/18',
        'BT': '119.2.96.0/19',
        'BW': '168.167.0.0/16',
        'BY': '178.120.0.0/13',
        'BZ': '179.42.192.0/18',
        'CA': '99.224.0.0/11',
        'CD': '41.243.0.0/16',
        'CF': '196.32.200.0/21',
        'CG': '197.214.128.0/17',
        'CH': '85.0.0.0/13',
        'CI': '154.232.0.0/14',
        'CK': '202.65.32.0/19',
        'CL': '152.172.0.0/14',
        'CM': '165.210.0.0/15',
        'CN': '36.128.0.0/10',
        'CO': '181.240.0.0/12',
        'CR': '201.192.0.0/12',
        'CU': '152.206.0.0/15',
        'CV': '165.90.96.0/19',
        'CW': '190.88.128.0/17',
        'CY': '46.198.0.0/15',
        'CZ': '88.100.0.0/14',
        'DJ': '197.241.0.0/17',
        'DK': '87.48.0.0/12',
        'DM': '192.243.48.0/20',
        'DO': '152.166.0.0/15',
        'DZ': '41.96.0.0/12',
        'EC': '186.68.0.0/15',
        'EE': '90.190.0.0/15',
        'EG': '156.160.0.0/11',
        'ER': '196.200.96.0/20',
        'ES': '88.0.0.0/11',
        'ET': '196.188.0.0/14',
        'EU': '2.16.0.0/13',
        'FI': '91.152.0.0/13',
        'FJ': '144.120.0.0/16',
        'FM': '119.252.112.0/20',
        'FO': '88.85.32.0/19',
        'GA': '41.158.0.0/15',
        'GD': '74.122.88.0/21',
        'GE': '31.146.0.0/16',
        'GF': '161.22.64.0/18',
        'GG': '62.68.160.0/19',
        'GH': '45.208.0.0/14',
        'GI': '85.115.128.0/19',
        'GL': '88.83.0.0/19',
        'GM': '160.182.0.0/15',
        'GN': '197.149.192.0/18',
        'GP': '104.250.0.0/19',
        'GQ': '105.235.224.0/20',
        'GR': '94.64.0.0/13',
        'GT': '168.234.0.0/16',
        'GU': '168.123.0.0/16',
        'GW': '197.214.80.0/20',
        'GY': '181.41.64.0/18',
        'HK': '113.252.0.0/14',
        'HN': '181.210.0.0/16',
        'HR': '93.136.0.0/13',
        'HT': '148.102.128.0/17',
        'HU': '84.0.0.0/14',
        'ID': '39.192.0.0/10',
        'IE': '87.32.0.0/12',
        'IL': '79.176.0.0/13',
        'IM': '5.62.80.0/20',
        'IN': '117.192.0.0/10',
        'IO': '203.83.48.0/21',
        'IQ': '37.236.0.0/14',
        'IR': '2.176.0.0/12',
        'IS': '82.221.0.0/16',
        'IT': '79.0.0.0/10',
        'JE': '87.244.64.0/18',
        'JM': '72.27.0.0/17',
        'JO': '176.29.0.0/16',
        'JP': '126.0.0.0/8',
        'KE': '105.48.0.0/12',
        'KG': '158.181.128.0/17',
        'KH': '36.37.128.0/17',
        'KI': '103.25.140.0/22',
        'KM': '197.255.224.0/20',
        'KN': '198.32.32.0/19',
        'KP': '175.45.176.0/22',
        'KR': '175.192.0.0/10',
        'KW': '37.36.0.0/14',
        'KY': '64.96.0.0/15',
        'KZ': '2.72.0.0/13',
        'LA': '115.84.64.0/18',
        'LB': '178.135.0.0/16',
        'LC': '192.147.231.0/24',
        'LI': '82.117.0.0/19',
        'LK': '112.134.0.0/15',
        'LR': '41.86.0.0/19',
        'LS': '129.232.0.0/17',
        'LT': '78.56.0.0/13',
        'LU': '188.42.0.0/16',
        'LV': '46.109.0.0/16',
        'LY': '41.252.0.0/14',
        'MA': '105.128.0.0/11',
        'MC': '88.209.64.0/18',
        'MD': '37.246.0.0/16',
        'ME': '178.175.0.0/17',
        'MF': '74.112.232.0/21',
        'MG': '154.126.0.0/17',
        'MH': '117.103.88.0/21',
        'MK': '77.28.0.0/15',
        'ML': '154.118.128.0/18',
        'MM': '37.111.0.0/17',
        'MN': '49.0.128.0/17',
        'MO': '60.246.0.0/16',
        'MP': '202.88.64.0/20',
        'MQ': '109.203.224.0/19',
        'MR': '41.188.64.0/18',
        'MS': '208.90.112.0/22',
        'MT': '46.11.0.0/16',
        'MU': '105.16.0.0/12',
        'MV': '27.114.128.0/18',
        'MW': '105.234.0.0/16',
        'MX': '187.192.0.0/11',
        'MY': '175.136.0.0/13',
        'MZ': '197.218.0.0/15',
        'NA': '41.182.0.0/16',
        'NC': '101.101.0.0/18',
        'NE': '197.214.0.0/18',
        'NF': '203.17.240.0/22',
        'NG': '105.112.0.0/12',
        'NI': '186.76.0.0/15',
        'NL': '145.96.0.0/11',
        'NO': '84.208.0.0/13',
        'NP': '36.252.0.0/15',
        'NR': '203.98.224.0/19',
        'NU': '49.156.48.0/22',
        'NZ': '49.224.0.0/14',
        'OM': '5.36.0.0/15',
        'PA': '186.72.0.0/15',
        'PE': '186.160.0.0/14',
        'PF': '123.50.64.0/18',
        'PG': '124.240.192.0/19',
        'PH': '49.144.0.0/13',
        'PK': '39.32.0.0/11',
        'PL': '83.0.0.0/11',
        'PM': '70.36.0.0/20',
        'PR': '66.50.0.0/16',
        'PS': '188.161.0.0/16',
        'PT': '85.240.0.0/13',
        'PW': '202.124.224.0/20',
        'PY': '181.120.0.0/14',
        'QA': '37.210.0.0/15',
        'RE': '139.26.0.0/16',
        'RO': '79.112.0.0/13',
        'RS': '178.220.0.0/14',
        'RU': '5.136.0.0/13',
        'RW': '105.178.0.0/15',
        'SA': '188.48.0.0/13',
        'SB': '202.1.160.0/19',
        'SC': '154.192.0.0/11',
        'SD': '154.96.0.0/13',
        'SE': '78.64.0.0/12',
        'SG': '152.56.0.0/14',
        'SI': '188.196.0.0/14',
        'SK': '78.98.0.0/15',
        'SL': '197.215.0.0/17',
        'SM': '89.186.32.0/19',
        'SN': '41.82.0.0/15',
        'SO': '197.220.64.0/19',
        'SR': '186.179.128.0/17',
        'SS': '105.235.208.0/21',
        'ST': '197.159.160.0/19',
        'SV': '168.243.0.0/16',
        'SX': '190.102.0.0/20',
        'SZ': '41.84.224.0/19',
        'TC': '65.255.48.0/20',
        'TD': '154.68.128.0/19',
        'TG': '196.168.0.0/14',
        'TH': '171.96.0.0/13',
        'TJ': '85.9.128.0/18',
        'TK': '27.96.24.0/21',
        'TL': '180.189.160.0/20',
        'TM': '95.85.96.0/19',
        'TN': '197.0.0.0/11',
        'TO': '175.176.144.0/21',
        'TR': '78.160.0.0/11',
        'TT': '186.44.0.0/15',
        'TV': '202.2.96.0/19',
        'TW': '120.96.0.0/11',
        'TZ': '156.156.0.0/14',
        'UA': '93.72.0.0/13',
        'UG': '154.224.0.0/13',
        'UY': '167.56.0.0/13',
        'UZ': '82.215.64.0/18',
        'VA': '212.77.0.0/19',
        'VC': '24.92.144.0/20',
        'VE': '186.88.0.0/13',
        'VG': '172.103.64.0/18',
        'VI': '146.226.0.0/16',
        'VN': '14.160.0.0/11',
        'VU': '202.80.32.0/20',
        'WF': '117.20.32.0/21',
        'WS': '202.4.32.0/19',
        'YE': '134.35.0.0/16',
        'YT': '41.242.116.0/22',
        'ZA': '41.0.0.0/11',
        'ZM': '165.56.0.0/13',
        'ZW': '41.85.192.0/19',
    }

    @classmethod
    def random_ipv4(cls, code):
        """Return a random IPv4 address (dotted-quad string) drawn uniformly
        from the given country's block, or None for unknown country codes."""
        block = cls._country_ip_map.get(code.upper())
        if not block:
            return None
        addr, preflen = block.split('/')
        # Network base as a 32-bit int; the broadcast address is the base with
        # all host bits set.
        addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
        addr_max = addr_min | (0xffffffff >> int(preflen))
        return compat_str(socket.inet_ntoa(
            compat_struct_pack('!L', random.randint(addr_min, addr_max))))
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler that lets each request override the configured proxy via
    a 'Ytdl-request-proxy' header ('__noproxy__' disables proxying)."""

    def __init__(self, proxies=None):
        # Set default handlers so proxy_open is consulted even when no proxy
        # mapping exists for the scheme.
        for type in ('http', 'https'):
            setattr(self, '%s_open' % type,
                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
                        meth(r, proxy, type))
        return compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        # A proxy set on the request itself takes precedence over the default.
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            proxy = req_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # youtube-dl's http/https handlers themselves wrap the socket
            # with SOCKS, so no further proxy handling is needed here.
            return None
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
3554 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
3555 # released into Public Domain
3556 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
def long_to_bytes(n, blocksize=0):
    """long_to_bytes(n:long, blocksize:int) : string
    Convert a long integer to a byte string.

    If optional blocksize is given and greater than zero, pad the front of the
    byte string with binary zeros so that the length is a multiple of
    blocksize.
    """
    # after much testing, this algorithm was deemed to be the fastest
    s = b''
    n = int(n)
    # Emit 32 bits at a time, most significant chunk first.
    while n > 0:
        s = compat_struct_pack('>I', n & 0xffffffff) + s
        n = n >> 32
    # strip off leading zeros
    for i in range(len(s)):
        if s[i] != b'\000'[0]:
            break
    else:
        # only happens when n == 0
        s = b'\000'
        i = 0
    s = s[i:]
    # add back some pad bytes. this could be done more efficiently w.r.t. the
    # de-padding being done above, but sigh...
    if blocksize > 0 and len(s) % blocksize:
        s = (blocksize - len(s) % blocksize) * b'\000' + s
    return s
def bytes_to_long(s):
    """bytes_to_long(string) : long
    Convert a byte string to a long integer.

    This is (essentially) the inverse of long_to_bytes().
    """
    acc = 0
    length = len(s)
    # Left-pad with zero bytes to a multiple of 4 so the string can be
    # consumed in whole 32-bit big-endian words.
    if length % 4:
        extra = (4 - length % 4)
        s = b'\000' * extra + s
        length = length + extra
    for i in range(0, length, 4):
        acc = (acc << 32) + compat_struct_unpack('>I', s[i:i + 4])[0]
    return acc
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''
    # OHDave interprets the data as a little-endian integer, hence the
    # byte reversal before reading the hex representation.
    plaintext_int = int(binascii.hexlify(data[::-1]), 16)
    ciphertext_int = pow(plaintext_int, exponent, modulus)
    return '%x' % ciphertext_int
def pkcs1pad(data, length):
    """
    Padding input data with PKCS#1 scheme

    @param {int[]} data input data
    @param {int} length target length
    @returns {int[]} padded data
    """
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    # RFC 8017 (PKCS#1 v1.5 encoding): the padding string PS must consist of
    # pseudo-randomly generated *nonzero* octets — a zero octet would be
    # misread by the decoder as the end-of-padding marker.
    pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]
    return [0, 2] + pseudo_random + [0] + data
def encode_base_n(num, n, table=None):
    """Encode the non-negative integer num in base n.

    By default digits come from a 0-9a-zA-Z alphabet (so n <= 62); a custom
    digit table may be supplied for other alphabets.
    """
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num == 0:
        return table[0]

    ret = ''
    while num:
        ret = table[num % n] + ret
        num = num // n
    return ret
def decode_packed_codes(code):
    """Decode JavaScript obfuscated with Dean Edwards' p.a.c.k.e.r.

    Extracts the packed payload via PACKED_CODES_RE and substitutes each
    base-n token with its entry from the packed symbol table.
    """
    mobj = re.search(PACKED_CODES_RE, code)
    obfuscated_code, base, count, symbols = mobj.groups()
    base = int(base)
    count = int(count)
    symbols = symbols.split('|')
    symbol_table = {}

    while count:
        count -= 1
        base_n_count = encode_base_n(count, base)
        # An empty symbol means the token stands for itself.
        symbol_table[base_n_count] = symbols[count] or base_n_count

    return re.sub(
        r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
        obfuscated_code)
def parse_m3u8_attributes(attrib):
    """Parse an M3U8 attribute list ('KEY=val,KEY="quoted,val"') into a dict.

    Quoted values keep their content (quotes stripped) including any commas.
    """
    info = {}
    for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
        if val.startswith('"'):
            val = val[1:-1]
        info[key] = val
    return info
def urshift(val, n):
    """Unsigned (logical) right shift of a 32-bit value, emulating
    JavaScript's ``>>>`` operator for possibly-negative Python ints."""
    if val >= 0:
        return val >> n
    # Map the negative value onto its 32-bit two's-complement representation
    # before shifting.
    return (val + 0x100000000) >> n
# Based on png2str() written by @gdkchan and improved by @yokrysty
# Originally posted at https://github.com/rg3/youtube-dl/issues/9706
def decode_png(png_data):
    """Decode a (24-bit RGB, non-interlaced) PNG into (width, height, pixels),
    where pixels is a list of rows of raw byte values after de-filtering."""
    # Reference: https://www.w3.org/TR/PNG/
    header = png_data[8:]

    if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
        raise IOError('Not a valid PNG file.')

    int_map = {1: '>B', 2: '>H', 4: '>I'}
    unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]

    chunks = []

    # Walk the chunk stream: length(4) | type(4) | data(length) | CRC(4).
    while header:
        length = unpack_integer(header[:4])
        header = header[4:]

        chunk_type = header[:4]
        header = header[4:]

        chunk_data = header[:length]
        header = header[length:]

        header = header[4:]  # Skip CRC

        chunks.append({
            'type': chunk_type,
            'length': length,
            'data': chunk_data
        })

    ihdr = chunks[0]['data']

    width = unpack_integer(ihdr[:4])
    height = unpack_integer(ihdr[4:8])

    # Image data may be split across several IDAT chunks; concatenate before
    # inflating.
    idat = b''

    for chunk in chunks:
        if chunk['type'] == b'IDAT':
            idat += chunk['data']

    if not idat:
        raise IOError('Unable to read PNG data.')

    decompressed_data = bytearray(zlib.decompress(idat))

    # 3 bytes per pixel (RGB); each scanline is prefixed by one filter byte.
    stride = width * 3
    pixels = []

    def _get_pixel(idx):
        x = idx % stride
        y = idx // stride
        return pixels[y][x]

    for y in range(height):
        basePos = y * (1 + stride)
        filter_type = decompressed_data[basePos]

        current_row = []

        pixels.append(current_row)

        for x in range(stride):
            color = decompressed_data[1 + basePos + x]
            basex = y * stride + x
            left = 0
            up = 0

            if x > 2:
                left = _get_pixel(basex - 3)
            if y > 0:
                up = _get_pixel(basex - stride)

            if filter_type == 1:  # Sub
                color = (color + left) & 0xff
            elif filter_type == 2:  # Up
                color = (color + up) & 0xff
            elif filter_type == 3:  # Average
                color = (color + ((left + up) >> 1)) & 0xff
            elif filter_type == 4:  # Paeth
                a = left
                b = up
                c = 0

                if x > 2 and y > 0:
                    c = _get_pixel(basex - stride - 3)

                p = a + b - c

                pa = abs(p - a)
                pb = abs(p - b)
                pc = abs(p - c)

                if pa <= pb and pa <= pc:
                    color = (color + a) & 0xff
                elif pb <= pc:
                    color = (color + b) & 0xff
                else:
                    color = (color + c) & 0xff

            current_row.append(color)

    return width, height, pixels
def write_xattr(path, key, value):
    """Set the extended attribute key=value on path using the best available
    backend: pyxattr/xattr modules, NTFS ADS on Windows, or the setfattr/xattr
    CLI tools. Raises XAttrUnavailableError / XAttrMetadataError on failure."""
    # This mess below finds the best xattr tool for the job
    try:
        # try the pyxattr module...
        import xattr

        if hasattr(xattr, 'set'):  # pyxattr
            # Unicode arguments are not supported in python-pyxattr until
            # version 0.5.0
            # See https://github.com/rg3/youtube-dl/issues/5498
            pyxattr_required_version = '0.5.0'
            if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
                # TODO: fallback to CLI tools
                raise XAttrUnavailableError(
                    'python-pyxattr is detected but is too old. '
                    'youtube-dl requires %s or above while your version is %s. '
                    'Falling back to other xattr implementations' % (
                        pyxattr_required_version, xattr.__version__))

            setxattr = xattr.set
        else:  # xattr
            setxattr = xattr.setxattr

        try:
            setxattr(path, key, value)
        except EnvironmentError as e:
            raise XAttrMetadataError(e.errno, e.strerror)

    except ImportError:
        if compat_os_name == 'nt':
            # Write xattrs to NTFS Alternate Data Streams:
            # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
            assert ':' not in key
            assert os.path.exists(path)

            ads_fn = path + ':' + key
            try:
                with open(ads_fn, 'wb') as f:
                    f.write(value)
            except EnvironmentError as e:
                raise XAttrMetadataError(e.errno, e.strerror)
        else:
            user_has_setfattr = check_executable('setfattr', ['--version'])
            user_has_xattr = check_executable('xattr', ['-h'])

            if user_has_setfattr or user_has_xattr:

                value = value.decode('utf-8')
                if user_has_setfattr:
                    executable = 'setfattr'
                    opts = ['-n', key, '-v', value]
                elif user_has_xattr:
                    executable = 'xattr'
                    opts = ['-w', key, value]

                cmd = ([encodeFilename(executable, True)] +
                       [encodeArgument(o) for o in opts] +
                       [encodeFilename(path, True)])

                try:
                    p = subprocess.Popen(
                        cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
                except EnvironmentError as e:
                    raise XAttrMetadataError(e.errno, e.strerror)
                stdout, stderr = p.communicate()
                stderr = stderr.decode('utf-8', 'replace')
                if p.returncode != 0:
                    raise XAttrMetadataError(p.returncode, stderr)

            else:
                # On Unix, and can't find pyxattr, setfattr, or xattr.
                if sys.platform.startswith('linux'):
                    raise XAttrUnavailableError(
                        "Couldn't find a tool to set the xattrs. "
                        "Install either the python 'pyxattr' or 'xattr' "
                        "modules, or the GNU 'attr' package "
                        "(which contains the 'setfattr' tool).")
                else:
                    raise XAttrUnavailableError(
                        "Couldn't find a tool to set the xattrs. "
                        "Install either the python 'xattr' module, "
                        "or the 'xattr' binary.")
def random_birthday(year_field, month_field, day_field):
    """Return a dict mapping the given field names to the string components
    of a random, plausibly adult birthday (year 1950-1995).

    The day is drawn from 1-31 regardless of month, so impossible dates such
    as February 31 can occur.
    """
    return {
        year_field: str(random.randint(1950, 1995)),
        month_field: str(random.randint(1, 12)),
        day_field: str(random.randint(1, 31)),
    }