4 from __future__ import unicode_literals
33 import xml.etree.ElementTree
40 compat_etree_fromstring,
42 compat_html_entities_html5,
48 compat_socket_create_connection,
54 compat_urllib_parse_urlencode,
55 compat_urllib_parse_urlparse,
56 compat_urllib_parse_unquote_plus,
57 compat_urllib_request,
def register_socks_protocols():
    """Teach urlparse that SOCKS URL schemes carry a netloc component.

    In Python < 2.6.5, urlsplit() suffers from
    https://bugs.python.org/issue7904: URLs whose scheme is not listed in
    urlparse.uses_netloc are not handled correctly.
    """
    known_schemes = compat_urlparse.uses_netloc
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in known_schemes:
            known_schemes.append(scheme)
# This is not clearly defined otherwise
# Type of a compiled regular expression, for isinstance() checks (there is
# no public name for it in older Pythons).
compiled_regex_type = type(re.compile(''))
81 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/47.0 (Chrome)',
82 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
83 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
84 'Accept-Encoding': 'gzip, deflate',
85 'Accept-Language': 'en-us,en;q=0.5',
# Full English month names, index 0 = January; used by date parsing helpers.
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']
96 'en': ENGLISH_MONTH_NAMES,
98 'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
99 'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
103 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
104 'flv', 'f4v', 'f4a', 'f4b',
105 'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
106 'mkv', 'mka', 'mk3d',
115 'f4f', 'f4m', 'm3u8', 'smil')
# needed for sanitizing filenames in restricted mode
# Maps accented characters/ligatures to ASCII replacements; single-char
# replacements come from the plain strings, multi-char ones ('AE', 'ss', ...)
# from the interleaved lists.
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy')))
136 '%Y-%m-%d %H:%M:%S.%f',
139 '%Y-%m-%dT%H:%M:%SZ',
140 '%Y-%m-%dT%H:%M:%S.%fZ',
141 '%Y-%m-%dT%H:%M:%S.%f0Z',
143 '%Y-%m-%dT%H:%M:%S.%f',
146 '%b %d %Y at %H:%M:%S',
149 DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
150 DATE_FORMATS_DAY_FIRST.extend([
159 DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
160 DATE_FORMATS_MONTH_FIRST.extend([
168 PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    # NOTE(review): in this excerpt the surrounding try/except fallback
    # (present in the full source) is elided — confirm against full file.
    pref = locale.getpreferredencoding()
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()
        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object
        path_basename = lambda f: os.path.basename(fn).decode(encoding)
        # the same for os.path.dirname
        path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
        # (else branch — Python 3 or win32: the plain os.path helpers suffice)
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    # NamedTemporaryFile kwargs: create the temp file next to the target so
    # the final os.rename stays on one filesystem (atomic replace).
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):

    tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))

    if sys.platform == 'win32':
        # Need to remove existing file on Windows, else os.rename raises
        # WindowsError or FileExistsError.
    os.rename(tf.name, fn)
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val] """
        # ElementTree's [@attr='val'] predicate only supports plain names.
        assert re.match(r'^[a-zA-Z_-]+$', key)
        expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
        return node.find(expr)
    # (Python 2.6 fallback below: predicates unsupported, scan manually)
    def find_xpath_attr(node, xpath, key, val=None):
        for f in node.findall(compat_xpath(xpath)):
            if key not in f.attrib:
                # element lacks the attribute entirely — skipped
            if val is None or f.attrib.get(key) == val:
# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter


def xpath_with_ns(path, ns_map):
    """Expand 'ns:tag' steps of an xpath into ElementTree '{uri}tag' form
    using the prefix->URI mapping ns_map."""
    components = [c.split(':') for c in path.split('/')]
            # un-prefixed step: keep verbatim
            replaced.append(c[0])
            # prefixed step: substitute the mapped namespace URI
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Find the first element matching xpath.

    Returns `default` when supplied and nothing matches; raises
    ExtractorError when `fatal` and no default is given.
    """
    def _find_xpath(xpath):
        return node.find(compat_xpath(xpath))

    if isinstance(xpath, (str, compat_str)):
        n = _find_xpath(xpath)
        # (nothing matched:)
        if default is not NO_DEFAULT:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Like xpath_element() but return the matched element's text content,
    with the same default/fatal semantics."""
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        # (element missing or the default sentinel came back)
        if default is not NO_DEFAULT:
        name = xpath if name is None else name
        raise ExtractorError('Could not find XML element\'s text %s' % name)
def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    """Return attribute `key` of the element matching xpath, with the same
    default/fatal semantics as xpath_element()."""
    n = find_xpath_attr(node, xpath, key)
        if default is not NO_DEFAULT:
            name = '%s[@%s]' % (xpath, key) if name is None else name
            raise ExtractorError('Could not find XML attribute %s' % name)
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # An id lookup is simply an attribute lookup on the 'id' attribute.
    attribute = 'id'
    return get_element_by_attribute(attribute, id, html)
def get_element_by_class(class_name, html):
    """Return the content of the first tag whose class attribute contains class_name."""
    # Match class_name as a whole word anywhere inside the attribute value;
    # escape_value=False because the value below is itself a regex.
    value_re = r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name)
    return get_element_by_attribute('class', value_re, html, escape_value=False)
def get_element_by_attribute(attribute, value, html, escape_value=True):
    """Return the content of the tag with the specified attribute in the passed HTML document"""

    value = re.escape(value) if escape_value else value

    # Regex fragments below allow arbitrary other attributes before and
    # after the one being matched.
    m = re.search(r'''(?xs)
        (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
        (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
        ''' % (re.escape(attribute), value), html)

    res = m.group('content')

    if res.startswith('"') or res.startswith("'"):
        # (strip the quotes captured together with the value)
    return unescapeHTML(res)
class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""
        # (inside __init__; self.attrs initialisation elided in this excerpt)
        compat_HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        # attrs is a list of (name, value) pairs; value is None for bare attrs
        self.attrs = dict(attrs)
def extract_attributes(html_element):
    """Given a string for an HTML element such as
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&"
    Decode and return a dictionary of attributes:
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''

    NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
    but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
    """
    parser = HTMLAttributeParser()
    # Feeding the fragment fires handle_starttag(), which records the attrs.
    parser.feed(html_element)
def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.

    # Newlines in markup are not meaningful; real breaks come from <br>/<p>.
    html = html.replace('\n', ' ')
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Strip any remaining tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
        # ('-' means stdout; on Windows put stdout into binary mode first)
        if sys.platform == 'win32':
            import msvcrt
            msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        if err.errno in (errno.EACCES,):
            # permission problems will not be fixed by renaming — re-raised
        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            # sanitizing changed nothing, so retrying cannot help
            # An exception here should be caught in the caller
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    # (returns None when parsing fails — initialisation elided here)
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        # Per-character policy; returns the replacement string for `char`.
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        if char == '?' or ord(char) < 32 or ord(char) == 127:
            # control chars and '?' are always dropped
            return '' if restricted else '\''
            return '_-' if restricted else ' -'
        elif char in '\\/|*<>':
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
        if restricted and ord(char) > 127:

    # Keep timestamps like 12:34:56 readable by turning ':' into '_'.
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(map(replace_insane, s))
        # collapse runs of '_' introduced by the replacements above
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
def sanitize_path(s):
    """Sanitizes and normalizes path on Windows"""
    if sys.platform != 'win32':
    drive_or_unc, _ = os.path.splitdrive(s)
    if sys.version_info < (2, 7) and not drive_or_unc:
        # splitunc() was folded into splitdrive() in 2.7
        drive_or_unc, _ = os.path.splitunc(s)
    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
        # replace characters forbidden in Windows path components with '#'
        path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    return os.path.join(*sanitized_path)
506 # Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
507 # unwanted failures due to missing protocol
def sanitize_url(url):
    """Give scheme-relative URLs ('//host/...') an explicit 'http:' scheme;
    all other URLs are returned unchanged."""
    if url.startswith('//'):
        return 'http:%s' % url
    return url
def sanitized_Request(url, *args, **kwargs):
    """Build a urllib Request after normalizing scheme-less URLs."""
    sanitized = sanitize_url(url)
    return compat_urllib_request.Request(sanitized, *args, **kwargs)
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    # NOTE(review): implementation elided in this excerpt; presumably keeps
    # the first occurrence of each element, preserving order — confirm.
def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character."""
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon. For example,
    # '&Eacute;ric' should be decoded as 'Éric'.
    if entity_with_semicolon in compat_html_entities_html5:
        return compat_html_entities_html5[entity_with_semicolon]

    # Numeric character references: decimal '#NNN' or hex '#xNNN'
    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            # '0x...' form so int(..., 16) accepts it
            numstr = '0%s' % numstr
        # See https://github.com/rg3/youtube-dl/issues/7518
        return compat_chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity
559 assert type(s) == compat_str
562 r'&([^;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
def get_subprocess_encoding():
    """Return the byte encoding to use for subprocess arguments."""
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
        # (else branch: fall back to the filesystem encoding)
        encoding = sys.getfilesystemencoding()
def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):

    # Pass '' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:

    # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
    if sys.platform.startswith('java'):

    return s.encode(get_subprocess_encoding(), 'ignore')
def decodeFilename(b, for_subprocess=False):
    """Inverse of encodeFilename; a no-op on Python 3 / non-bytes input."""
    if sys.version_info >= (3, 0):

    if not isinstance(b, bytes):

    return b.decode(get_subprocess_encoding(), 'ignore')
def encodeArgument(s):
    """Encode a subprocess argument the same way a filename is encoded for a
    subprocess call."""
    if isinstance(s, compat_str):
        arg = s
    else:
        # Legacy code that uses byte strings; they must be pure ASCII.
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        arg = s.decode('ascii')
    return encodeFilename(arg, True)
def decodeArgument(b):
    """Inverse of encodeArgument: decode a subprocess argument."""
    return decodeFilename(b, for_subprocess=True)
def decodeOption(optval):
    """Decode a command-line option value to a unicode string if needed."""
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
def formatSeconds(secs):
    """Format a duration in seconds as H:MM:SS / M:SS (no leading zero on
    the first field)."""
        # (hours branch)
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
        # (minutes branch)
        return '%d:%02d' % (secs // 60, secs % 60)
def make_HTTPS_handler(params, **kwargs):
    """Build a YoutubeDLHTTPSHandler honouring the 'nocheckcertificate'
    option across the Python versions this file supports."""
    opts_no_check_certificate = params.get('nocheckcertificate', False)
    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if opts_no_check_certificate:
            # hostname checking must be off before disabling verification
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
            # (create_default_context present but HTTPSHandler has no context=)

    if sys.version_info < (3, 2):
        return YoutubeDLHTTPSHandler(params, **kwargs)
        # (Python 3.2/3.3: build a TLSv1 context by hand)
        context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
def bug_reports_message():
    """Build the standard 'please report this issue' suffix appended to
    unexpected error messages."""
    if ytdl_is_updateable():
        update_cmd = 'type youtube-dl -U to update'
        # (else branch: installed via other means)
        update_cmd = 'see https://yt-dl.org/update on how to update'
    msg = '; please report this issue on https://yt-dl.org/bug .'
    msg += ' Make sure you are using the latest version; %s.' % update_cmd
    msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """

        # Network-layer failures are always treated as "expected" (not bugs).
        if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):

        if video_id is not None:
            msg = video_id + ': ' + msg
            msg += ' (caused by %r)' % cause
            # unexpected errors get the bug-report boilerplate appended
            msg += bug_reports_message()
        super(ExtractorError, self).__init__(msg)

        self.exc_info = sys.exc_info()  # preserve original exception

        self.video_id = video_id

    def format_traceback(self):
        # Render the saved traceback; empty result when none was provided.
        if self.traceback is None:
        return ''.join(traceback.format_tb(self.traceback))
class UnsupportedError(ExtractorError):
    # Raised when no extractor recognises the given URL; always "expected".
    def __init__(self, url):
        super(UnsupportedError, self).__init__(
            'Unsupported URL: %s' % url, expected=True)
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        # NOTE(review): body elided in this excerpt; presumably stores msg
        # on the instance — confirm against full source.
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        # Both values are byte counts.
        self.downloaded = downloaded
        self.expected = expected
class XAttrMetadataError(Exception):
    # Raised when reading/writing extended file attributes fails; classifies
    # the failure into self.reason for callers.
    def __init__(self, code=None, msg='Unknown error'):
        super(XAttrMetadataError, self).__init__(msg)

        # Parsing code and msg
        # NOTE(review): 'Disk quota excedded' is misspelled (real OS message
        # is 'Disk quota exceeded'), so that substring match likely never
        # fires — candidate bug, but the string is runtime behavior here.
        if (self.code in (errno.ENOSPC, errno.EDQUOT) or
                'No space left' in self.msg or 'Disk quota excedded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
            # (fallback)
            self.reason = 'NOT_SUPPORTED'
799 class XAttrUnavailableError(Exception):
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
    # expected HTTP responses to meet HTTP/1.0 or later (see also
    # https://github.com/rg3/youtube-dl/issues/6727)
    if sys.version_info < (3, 0):
        kwargs[b'strict'] = True
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')
    if source_address is not None:
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
            # (Python 2.6 fallback: monkey-patch connect() to bind manually)
            def _hc_connect(self, *args, **kwargs):
                sock = compat_socket_create_connection(
                    (self.host, self.port), self.timeout, sa)
                    # wrap with TLS only for HTTPS connections
                    self.sock = ssl.wrap_socket(
                        sock, self.key_file, self.cert_file,
                        ssl_version=ssl.PROTOCOL_TLSv1)
            hc.connect = functools.partial(_hc_connect, hc)
def handle_youtubedl_headers(headers):
    """Strip the internal 'Youtubedl-no-compression' marker header.

    When the marker is present, every Accept-Encoding header is removed as
    well (that is what the marker requests) and a new dict is returned;
    otherwise the original mapping is returned untouched.
    """
    if 'Youtubedl-no-compression' not in headers:
        return headers
    filtered = {k: v for k, v in headers.items() if k.lower() != 'accept-encoding'}
    del filtered['Youtubedl-no-compression']
    return filtered
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        # Swap in a SOCKS-aware connection class when a proxy is requested
        # via the internal Ytdl-socks-proxy header.
        conn_class = compat_http_client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),

    # (staticmethod deflate: raw deflate first, zlib-wrapped as fallback)
            return zlib.decompress(data, -zlib.MAX_WBITS)
            return zlib.decompress(data)

    def addinfourl_wrapper(stream, headers, url, code):
        # Compatibility shim: older addinfourl lacks the code parameter.
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:

        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

    def http_response(self, req, resp):
        # Transparently decompress gzip/deflate bodies and re-escape
        # redirect Location headers.
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk add the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    # (retry with one more trailing byte stripped)
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/rg3/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                    location = location.decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    if sys.version_info < (3, 0):
                        location_escaped = location_escaped.encode('utf-8')
                    resp.headers['Location'] = location_escaped

    https_request = http_request
    https_response = http_response
def make_socks_conn_class(base_class, socks_proxy):
    """Derive a connection class from base_class that tunnels through the
    SOCKS proxy described by the socks_proxy URL."""
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    if url_components.scheme.lower() == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif url_components.scheme.lower() in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif url_components.scheme.lower() == 'socks4a':
        socks_type = ProxyType.SOCKS4A

    def unquote_if_non_empty(s):
        # empty/None credentials pass through untouched
        return compat_urllib_parse_unquote_plus(s)

        # (proxy_args tuple; 1080 is the conventional SOCKS default port)
        url_components.hostname, url_components.port or 1080,
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),

    class SocksConnection(base_class):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if type(self.timeout) in (int, float):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                    # (pre-SSLContext fallback)
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    # HTTPS counterpart of YoutubeDLHandler; allows a custom connection
    # class and forwards an SSLContext when the platform supports one.
    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        conn_class = self._https_conn_class

        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
            # (when set, swap in a SOCKS-aware connection class)
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, True),
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    # Cookie processor that also applies cookie handling to HTTPS traffic.
    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/rg3/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
def extract_timezone(date_str):
    """Split a trailing timezone designator off date_str; returns the tuple
    (utc_offset_timedelta, date_str_without_tz)."""
        r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
        # (no designator found: assume UTC)
        timezone = datetime.timedelta()
        date_str = date_str[:-len(m.group('tz'))]
        if not m.group('sign'):
            # bare 'Z' suffix: UTC
            timezone = datetime.timedelta()
            sign = 1 if m.group('sign') == '+' else -1
            timezone = datetime.timedelta(
                hours=sign * int(m.group('hours')),
                minutes=sign * int(m.group('minutes')))
    return timezone, date_str
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """

    if date_str is None:

    # fractional seconds are not representable below; drop them
    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

        date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
        dt = datetime.datetime.strptime(date_str, date_format) - timezone
        return calendar.timegm(dt.timetuple())
def date_formats(day_first=True):
    """Return the strptime format list matching the expected day/month order."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""

    if date_str is None:

    date_str = date_str.replace(',', ' ')
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
    _, date_str = extract_timezone(date_str)

    for expression in date_formats(day_first):
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
    if upload_date is None:
        # last resort: RFC 2822 style dates
        timetuple = email.utils.parsedate_tz(date_str)
            upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    if upload_date is not None:
        return compat_str(upload_date)
def unified_timestamp(date_str, day_first=True):
    """Parse a free-form date string into a UNIX timestamp (or None)."""
    if date_str is None:

    date_str = date_str.replace(',', ' ')

    # a 'PM' marker shifts the parsed hour by 12
    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    for expression in date_formats(day_first):
            dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
            return calendar.timegm(dt.timetuple())

    timetuple = email.utils.parsedate_tz(date_str)
        return calendar.timegm(timetuple) + pm_delta * 3600
def determine_ext(url, default_ext='unknown_video'):
    """Guess a media file extension from a URL, falling back to default_ext."""
    # portion after the last '.' of the path (query string stripped)
    guess = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    elif guess.rstrip('/') in KNOWN_EXTENSIONS:
        return guess.rstrip('/')
def subtitles_filename(filename, sub_lang, sub_format):
    """Derive a subtitle file name: swap the media extension for
    '<sub_lang>.<sub_format>'."""
    base = filename.rsplit('.', 1)[0]
    return '.'.join((base, sub_lang, sub_format))
def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        unit = match.group('unit')
        # A bad approximation?
        elif unit == 'year':
        # timedelta has no month/year units; they are approximated above
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, '%Y%m%d').date()
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is not None:
        return '-'.join(match.groups())
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        if start is not None:
            self.start = date_from_str(start)
            # (no start: open interval from the earliest representable date)
            self.start = datetime.datetime.min.date()
            self.end = date_from_str(end)
            # (no end: open interval up to the latest representable date)
            self.end = datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

        # (__str__: human-readable 'start - end' form)
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
def platform_name():
    """ Returns the platform name as a compat_str """
    res = platform.platform()
    if isinstance(res, bytes):
        # platform.platform() may return bytes on some Python 2 setups
        res = res.decode(preferredencoding())

    assert isinstance(res, compat_str)
1252 def _windows_write_string(s, out):
1253 """ Returns True if the string was written using special methods,
1254 False if it has yet to be written out."""
1255 # Adapted from http://stackoverflow.com/a/3259271/35070
1258 import ctypes.wintypes
1266 fileno = out.fileno()
1267 except AttributeError:
1268 # If the output stream doesn't have a fileno, it's virtual
1270 except io.UnsupportedOperation:
1271 # Some strange Windows pseudo files?
1273 if fileno not in WIN_OUTPUT_IDS:
1276 GetStdHandle = ctypes.WINFUNCTYPE(
1277 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
1278 (b'GetStdHandle', ctypes.windll.kernel32))
1279 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
1281 WriteConsoleW = ctypes.WINFUNCTYPE(
1282 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
1283 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
1284 ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
1285 written = ctypes.wintypes.DWORD(0)
1287 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
1288 FILE_TYPE_CHAR = 0x0002
1289 FILE_TYPE_REMOTE = 0x8000
1290 GetConsoleMode = ctypes.WINFUNCTYPE(
1291 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
1292 ctypes.POINTER(ctypes.wintypes.DWORD))(
1293 (b'GetConsoleMode', ctypes.windll.kernel32))
1294 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
1296 def not_a_console(handle):
1297 if handle == INVALID_HANDLE_VALUE or handle is None:
1299 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
1300 GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
1302 if not_a_console(h):
1305 def next_nonbmp_pos(s):
1307 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
1308 except StopIteration:
1312 count = min(next_nonbmp_pos(s), 1024)
1314 ret = WriteConsoleW(
1315 h, s, count if count else 2, ctypes.byref(written), None)
1317 raise OSError('Failed to write string')
1318 if not count: # We just wrote a non-BMP character
1319 assert written.value == 2
1322 assert written.value > 0
1323 s = s[written.value:]
1327 def write_string(s, out=None, encoding=None):
1330 assert type(s) == compat_str
1332 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
1333 if _windows_write_string(s, out):
1336 if ('b' in getattr(out, 'mode', '') or
1337 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
1338 byt = s.encode(encoding or preferredencoding(), 'ignore')
1340 elif hasattr(out, 'buffer'):
1341 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
1342 byt = s.encode(enc, 'ignore')
1343 out.buffer.write(byt)
1349 def bytes_to_intlist(bs):
1352 if isinstance(bs[0], int): # Python 3
1355 return [ord(c) for c in bs]
1358 def intlist_to_bytes(xs):
1361 return compat_struct_pack('%dB' % len(xs), *xs)
1364 # Cross-platform file locking
1365 if sys.platform == 'win32':
1366 import ctypes.wintypes
1369 class OVERLAPPED(ctypes.Structure):
1371 ('Internal', ctypes.wintypes.LPVOID),
1372 ('InternalHigh', ctypes.wintypes.LPVOID),
1373 ('Offset', ctypes.wintypes.DWORD),
1374 ('OffsetHigh', ctypes.wintypes.DWORD),
1375 ('hEvent', ctypes.wintypes.HANDLE),
1378 kernel32 = ctypes.windll.kernel32
1379 LockFileEx = kernel32.LockFileEx
1380 LockFileEx.argtypes = [
1381 ctypes.wintypes.HANDLE, # hFile
1382 ctypes.wintypes.DWORD, # dwFlags
1383 ctypes.wintypes.DWORD, # dwReserved
1384 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1385 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1386 ctypes.POINTER(OVERLAPPED) # Overlapped
1388 LockFileEx.restype = ctypes.wintypes.BOOL
1389 UnlockFileEx = kernel32.UnlockFileEx
1390 UnlockFileEx.argtypes = [
1391 ctypes.wintypes.HANDLE, # hFile
1392 ctypes.wintypes.DWORD, # dwReserved
1393 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1394 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1395 ctypes.POINTER(OVERLAPPED) # Overlapped
1397 UnlockFileEx.restype = ctypes.wintypes.BOOL
1398 whole_low = 0xffffffff
1399 whole_high = 0x7fffffff
1401 def _lock_file(f, exclusive):
1402 overlapped = OVERLAPPED()
1403 overlapped.Offset = 0
1404 overlapped.OffsetHigh = 0
1405 overlapped.hEvent = 0
1406 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
1407 handle = msvcrt.get_osfhandle(f.fileno())
1408 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
1409 whole_low, whole_high, f._lock_file_overlapped_p):
1410 raise OSError('Locking file failed: %r' % ctypes.FormatError())
1412 def _unlock_file(f):
1413 assert f._lock_file_overlapped_p
1414 handle = msvcrt.get_osfhandle(f.fileno())
1415 if not UnlockFileEx(handle, 0,
1416 whole_low, whole_high, f._lock_file_overlapped_p):
1417 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
# Some platforms, such as Jython, are missing fcntl
1424 def _lock_file(f, exclusive):
1425 fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
1427 def _unlock_file(f):
1428 fcntl.flock(f, fcntl.LOCK_UN)
1430 UNSUPPORTED_MSG = 'file locking is not supported on this platform'
1432 def _lock_file(f, exclusive):
1433 raise IOError(UNSUPPORTED_MSG)
1435 def _unlock_file(f):
1436 raise IOError(UNSUPPORTED_MSG)
1439 class locked_file(object):
1440 def __init__(self, filename, mode, encoding=None):
1441 assert mode in ['r', 'a', 'w']
1442 self.f = io.open(filename, mode, encoding=encoding)
1445 def __enter__(self):
1446 exclusive = self.mode != 'r'
1448 _lock_file(self.f, exclusive)
1454 def __exit__(self, etype, value, traceback):
1456 _unlock_file(self.f)
1463 def write(self, *args):
1464 return self.f.write(*args)
1466 def read(self, *args):
1467 return self.f.read(*args)
def get_filesystem_encoding():
    """Return the filesystem encoding, defaulting to 'utf-8' when unknown."""
    enc = sys.getfilesystemencoding()
    # sys.getfilesystemencoding() may return None on some Python 2 setups.
    if enc is None:
        enc = 'utf-8'
    return enc
1475 def shell_quote(args):
1477 encoding = get_filesystem_encoding()
1479 if isinstance(a, bytes):
1480 # We may get a filename encoded with 'encodeFilename'
1481 a = a.decode(encoding)
1482 quoted_args.append(pipes.quote(a))
1483 return ' '.join(quoted_args)
1486 def smuggle_url(url, data):
1487 """ Pass additional data in a URL for internal use. """
1489 url, idata = unsmuggle_url(url, {})
1491 sdata = compat_urllib_parse_urlencode(
1492 {'__youtubedl_smuggle': json.dumps(data)})
1493 return url + '#' + sdata
1496 def unsmuggle_url(smug_url, default=None):
1497 if '#__youtubedl_smuggle' not in smug_url:
1498 return smug_url, default
1499 url, _, sdata = smug_url.rpartition('#')
1500 jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
1501 data = json.loads(jsond)
1505 def format_bytes(bytes):
1508 if type(bytes) is str:
1509 bytes = float(bytes)
1513 exponent = int(math.log(bytes, 1024.0))
1514 suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
1515 converted = float(bytes) / float(1024 ** exponent)
1516 return '%.2f%s' % (converted, suffix)
1519 def lookup_unit_table(unit_table, s):
1520 units_re = '|'.join(re.escape(u) for u in unit_table)
1522 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
1525 num_str = m.group('num').replace(',', '.')
1526 mult = unit_table[m.group('unit')]
1527 return int(float(num_str) * mult)
1530 def parse_filesize(s):
1534 # The lower-case forms are of course incorrect and unofficial,
1535 # but we support those too
1552 'megabytes': 1000 ** 2,
1553 'mebibytes': 1024 ** 2,
1559 'gigabytes': 1000 ** 3,
1560 'gibibytes': 1024 ** 3,
1566 'terabytes': 1000 ** 4,
1567 'tebibytes': 1024 ** 4,
1573 'petabytes': 1000 ** 5,
1574 'pebibytes': 1024 ** 5,
1580 'exabytes': 1000 ** 6,
1581 'exbibytes': 1024 ** 6,
1587 'zettabytes': 1000 ** 7,
1588 'zebibytes': 1024 ** 7,
1594 'yottabytes': 1000 ** 8,
1595 'yobibytes': 1024 ** 8,
1598 return lookup_unit_table(_UNIT_TABLE, s)
1607 if re.match(r'^[\d,.]+$', s):
1608 return str_to_int(s)
1619 return lookup_unit_table(_UNIT_TABLE, s)
1622 def month_by_name(name, lang='en'):
1623 """ Return the number of a month by (locale-independently) English name """
1625 month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
1628 return month_names.index(name) + 1
1633 def month_by_abbreviation(abbrev):
1634 """ Return the number of a month by (locale-independently) English
1638 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
1643 def fix_xml_ampersands(xml_str):
1644 """Replace all the '&' by '&' in XML"""
1646 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
1651 def setproctitle(title):
1652 assert isinstance(title, compat_str)
1654 # ctypes in Jython is not complete
1655 # http://bugs.jython.org/issue2148
1656 if sys.platform.startswith('java'):
1660 libc = ctypes.cdll.LoadLibrary('libc.so.6')
1663 title_bytes = title.encode('utf-8')
1664 buf = ctypes.create_string_buffer(len(title_bytes))
1665 buf.value = title_bytes
1667 libc.prctl(15, buf, 0, 0, 0)
1668 except AttributeError:
1669 return # Strange libc, just skip this
def remove_start(s, start):
    """Strip *start* from the beginning of *s* if present (None-safe)."""
    if s is None or not s.startswith(start):
        return s
    return s[len(start):]
def remove_end(s, end):
    """Strip *end* from the end of *s* if present (None-safe).

    Bug fix: the original one-liner computed ``s[:-len(end)]``, which for
    an empty *end* evaluates to ``s[:0]`` — every string "ends with" '' so
    the whole string was silently discarded.  An empty suffix is now a
    no-op, mirroring remove_start's behavior with an empty prefix.
    """
    if s is None or not end or not s.endswith(end):
        return s
    return s[:-len(end)]
1680 def remove_quotes(s):
1681 if s is None or len(s) < 2:
1683 for quote in ('"', "'", ):
1684 if s[0] == quote and s[-1] == quote:
def url_basename(url):
    """Return the last path component of *url* ('' when the path is empty)."""
    parsed_path = compat_urlparse.urlparse(url).path
    segments = parsed_path.strip('/').split('/')
    return segments[-1]
1694 class HEADRequest(compat_urllib_request.Request):
1695 def get_method(self):
1699 class PUTRequest(compat_urllib_request.Request):
1700 def get_method(self):
1704 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
1707 v = getattr(v, get_attr, None)
1713 return int(v) * invscale // scale
def str_or_none(v, default=None):
    """Coerce *v* to compat_str, returning *default* when *v* is None."""
    if v is None:
        return default
    return compat_str(v)
1722 def str_to_int(int_str):
1723 """ A more relaxed version of int_or_none """
1726 int_str = re.sub(r'[,\.\+]', '', int_str)
1730 def float_or_none(v, scale=1, invscale=1, default=None):
1734 return float(v) * invscale / scale
def strip_or_none(v, default=None):
    """Return ``v.strip()``, or *default* when *v* is None.

    The *default* parameter is a backward-compatible generalization:
    existing callers that pass only *v* still get None for None input.
    """
    return default if v is None else v.strip()
1743 def parse_duration(s):
1744 if not isinstance(s, compat_basestring):
1749 days, hours, mins, secs, ms = [None] * 5
1750 m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?$', s)
1752 days, hours, mins, secs, ms = m.groups()
1757 (?P<days>[0-9]+)\s*d(?:ays?)?\s*
1760 (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
1763 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
1766 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
1769 days, hours, mins, secs, ms = m.groups()
1771 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)$', s)
1773 hours, mins = m.groups()
1779 duration += float(secs)
1781 duration += float(mins) * 60
1783 duration += float(hours) * 60 * 60
1785 duration += float(days) * 24 * 60 * 60
1787 duration += float(ms)
1791 def prepend_extension(filename, ext, expected_real_ext=None):
1792 name, real_ext = os.path.splitext(filename)
1794 '{0}.{1}{2}'.format(name, ext, real_ext)
1795 if not expected_real_ext or real_ext[1:] == expected_real_ext
1796 else '{0}.{1}'.format(filename, ext))
1799 def replace_extension(filename, ext, expected_real_ext=None):
1800 name, real_ext = os.path.splitext(filename)
1801 return '{0}.{1}'.format(
1802 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
1806 def check_executable(exe, args=[]):
1807 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1808 args can be a list of arguments for a short output (like -version) """
1810 subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
1816 def get_exe_version(exe, args=['--version'],
1817 version_re=None, unrecognized='present'):
1818 """ Returns the version of the specified executable,
1819 or False if the executable is not present """
1821 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
1822 # SIGTTOU if youtube-dl is run in the background.
1823 # See https://github.com/rg3/youtube-dl/issues/955#issuecomment-209789656
1824 out, _ = subprocess.Popen(
1825 [encodeArgument(exe)] + args,
1826 stdin=subprocess.PIPE,
1827 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
1830 if isinstance(out, bytes): # Python 2.x
1831 out = out.decode('ascii', 'ignore')
1832 return detect_exe_version(out, version_re, unrecognized)
1835 def detect_exe_version(output, version_re=None, unrecognized='present'):
1836 assert isinstance(output, compat_str)
1837 if version_re is None:
1838 version_re = r'version\s+([-0-9._a-zA-Z]+)'
1839 m = re.search(version_re, output)
1846 class PagedList(object):
1848 # This is only useful for tests
1849 return len(self.getslice())
1852 class OnDemandPagedList(PagedList):
1853 def __init__(self, pagefunc, pagesize, use_cache=False):
1854 self._pagefunc = pagefunc
1855 self._pagesize = pagesize
1856 self._use_cache = use_cache
1860 def getslice(self, start=0, end=None):
1862 for pagenum in itertools.count(start // self._pagesize):
1863 firstid = pagenum * self._pagesize
1864 nextfirstid = pagenum * self._pagesize + self._pagesize
1865 if start >= nextfirstid:
1870 page_results = self._cache.get(pagenum)
1871 if page_results is None:
1872 page_results = list(self._pagefunc(pagenum))
1874 self._cache[pagenum] = page_results
1877 start % self._pagesize
1878 if firstid <= start < nextfirstid
1882 ((end - 1) % self._pagesize) + 1
1883 if (end is not None and firstid <= end <= nextfirstid)
1886 if startv != 0 or endv is not None:
1887 page_results = page_results[startv:endv]
1888 res.extend(page_results)
1890 # A little optimization - if current page is not "full", ie. does
1891 # not contain page_size videos then we can assume that this page
1892 # is the last one - there are no more ids on further pages -
1893 # i.e. no need to query again.
1894 if len(page_results) + startv < self._pagesize:
1897 # If we got the whole page, but the next page is not interesting,
1898 # break out early as well
1899 if end == nextfirstid:
1904 class InAdvancePagedList(PagedList):
1905 def __init__(self, pagefunc, pagecount, pagesize):
1906 self._pagefunc = pagefunc
1907 self._pagecount = pagecount
1908 self._pagesize = pagesize
1910 def getslice(self, start=0, end=None):
1912 start_page = start // self._pagesize
1914 self._pagecount if end is None else (end // self._pagesize + 1))
1915 skip_elems = start - start_page * self._pagesize
1916 only_more = None if end is None else end - start
1917 for pagenum in range(start_page, end_page):
1918 page = list(self._pagefunc(pagenum))
1920 page = page[skip_elems:]
1922 if only_more is not None:
1923 if len(page) < only_more:
1924 only_more -= len(page)
1926 page = page[:only_more]
1933 def uppercase_escape(s):
1934 unicode_escape = codecs.getdecoder('unicode_escape')
1936 r'\\U[0-9a-fA-F]{8}',
1937 lambda m: unicode_escape(m.group(0))[0],
1941 def lowercase_escape(s):
1942 unicode_escape = codecs.getdecoder('unicode_escape')
1944 r'\\u[0-9a-fA-F]{4}',
1945 lambda m: unicode_escape(m.group(0))[0],
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # Python 2's quote() chokes on unicode input, so pre-encode there.
    if sys.version_info < (3, 0) and isinstance(s, compat_str):
        s = s.encode('utf-8')
    # Reserved/sub-delimiter characters that must survive unescaped.
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    return compat_urllib_parse.quote(s, safe_chars)
1956 def escape_url(url):
1957 """Escape URL as suggested by RFC 3986"""
1958 url_parsed = compat_urllib_parse_urlparse(url)
1959 return url_parsed._replace(
1960 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
1961 path=escape_rfc3986(url_parsed.path),
1962 params=escape_rfc3986(url_parsed.params),
1963 query=escape_rfc3986(url_parsed.query),
1964 fragment=escape_rfc3986(url_parsed.fragment)
1968 def read_batch_urls(batch_fd):
1970 if not isinstance(url, compat_str):
1971 url = url.decode('utf-8', 'replace')
1972 BOM_UTF8 = '\xef\xbb\xbf'
1973 if url.startswith(BOM_UTF8):
1974 url = url[len(BOM_UTF8):]
1976 if url.startswith(('#', ';', ']')):
1980 with contextlib.closing(batch_fd) as fd:
1981 return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as ASCII bytes."""
    encoded = compat_urllib_parse_urlencode(*args, **kargs)
    return encoded.encode('ascii')
1988 def update_url_query(url, query):
1991 parsed_url = compat_urlparse.urlparse(url)
1992 qs = compat_parse_qs(parsed_url.query)
1994 return compat_urlparse.urlunparse(parsed_url._replace(
1995 query=compat_urllib_parse_urlencode(qs, True)))
1998 def update_Request(req, url=None, data=None, headers={}, query={}):
1999 req_headers = req.headers.copy()
2000 req_headers.update(headers)
2001 req_data = data or req.data
2002 req_url = update_url_query(url or req.get_full_url(), query)
2003 req_get_method = req.get_method()
2004 if req_get_method == 'HEAD':
2005 req_type = HEADRequest
2006 elif req_get_method == 'PUT':
2007 req_type = PUTRequest
2009 req_type = compat_urllib_request.Request
2011 req_url, data=req_data, headers=req_headers,
2012 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
2013 if hasattr(req, 'timeout'):
2014 new_req.timeout = req.timeout
2018 def dict_get(d, key_or_keys, default=None, skip_false_values=True):
2019 if isinstance(key_or_keys, (list, tuple)):
2020 for key in key_or_keys:
2021 if key not in d or d[key] is None or skip_false_values and not d[key]:
2025 return d.get(key_or_keys, default)
2028 def try_get(src, getter, expected_type=None):
2031 except (AttributeError, KeyError, TypeError, IndexError):
2034 if expected_type is None or isinstance(v, expected_type):
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Decode *string* into a compat_str unless it already is one."""
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
2051 TV_PARENTAL_GUIDELINES = {
2061 def parse_age_limit(s):
2063 return s if 0 <= s <= 21 else None
2064 if not isinstance(s, compat_basestring):
2066 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
2068 return int(m.group('age'))
2070 return US_RATINGS[s]
2071 return TV_PARENTAL_GUIDELINES.get(s)
2074 def strip_jsonp(code):
2076 r'(?s)^[a-zA-Z0-9_.$]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
2079 def js_to_json(code):
2082 if v in ('true', 'false', 'null'):
2084 elif v.startswith('/*') or v == ',':
2087 if v[0] in ("'", '"'):
2088 v = re.sub(r'(?s)\\.|"', lambda m: {
2093 }.get(m.group(0), m.group(0)), v[1:-1])
2096 (r'^(0[xX][0-9a-fA-F]+)\s*:?$', 16),
2097 (r'^(0+[0-7]+)\s*:?$', 8),
2100 for regex, base in INTEGER_TABLE:
2101 im = re.match(regex, v)
2103 i = int(im.group(1), base)
2104 return '"%d":' % i if v.endswith(':') else '%d' % i
2108 return re.sub(r'''(?sx)
2109 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
2110 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
2111 /\*.*?\*/|,(?=\s*[\]}])|
2112 [a-zA-Z_][.a-zA-Z_0-9]*|
2113 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:\s*:)?|
2118 def qualities(quality_ids):
2119 """ Get a numeric quality value out of a list of possible values """
2122 return quality_ids.index(qid)
2128 DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
2131 def limit_length(s, length):
2132 """ Add ellipses to overly long strings """
2137 return s[:length - len(ELLIPSES)] + ELLIPSES
def version_tuple(v):
    """Split a dotted/hyphenated version string into a tuple of ints."""
    parts = re.split(r'[-.]', v)
    return tuple(int(part) for part in parts)
2145 def is_outdated_version(version, limit, assume_new=True):
2147 return not assume_new
2149 return version_tuple(version) < version_tuple(limit)
2151 return not assume_new
2154 def ytdl_is_updateable():
2155 """ Returns if youtube-dl can be updated with -U """
2156 from zipimport import zipimporter
2158 return isinstance(globals().get('__loader__'), zipimporter) or hasattr(sys, 'frozen')
def args_to_str(args):
    # Get a short string representation for a subprocess command
    quoted = [compat_shlex_quote(a) for a in args]
    return ' '.join(quoted)
2166 def error_to_compat_str(err):
2168 # On python 2 error byte string must be decoded with proper
2169 # encoding rather than ascii
2170 if sys.version_info[0] < 3:
2171 err_str = err_str.decode(preferredencoding())
2175 def mimetype2ext(mt):
2181 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
2182 # it's the most popular one
2183 'audio/mpeg': 'mp3',
2188 _, _, res = mt.rpartition('/')
2189 res = res.split(';')[0].strip().lower()
2193 'smptett+xml': 'tt',
2199 'x-mp4-fragmented': 'mp4',
2202 'x-mpegurl': 'm3u8',
2203 'vnd.apple.mpegurl': 'm3u8',
2208 'vnd.ms-sstr+xml': 'ism',
2213 def parse_codecs(codecs_str):
2214 # http://tools.ietf.org/html/rfc6381
2217 splited_codecs = list(filter(None, map(
2218 lambda str: str.strip(), codecs_str.strip().strip(',').split(','))))
2219 vcodec, acodec = None, None
2220 for full_codec in splited_codecs:
2221 codec = full_codec.split('.')[0]
2222 if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v'):
2225 elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3'):
2229 write_string('WARNING: Unknown codec %s' % full_codec, sys.stderr)
2230 if not vcodec and not acodec:
2231 if len(splited_codecs) == 2:
2236 elif len(splited_codecs) == 1:
2243 'vcodec': vcodec or 'none',
2244 'acodec': acodec or 'none',
2249 def urlhandle_detect_ext(url_handle):
2250 getheader = url_handle.headers.get
2252 cd = getheader('Content-Disposition')
2254 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
2256 e = determine_ext(m.group('filename'), default_ext=None)
2260 return mimetype2ext(getheader('Content-Type'))
def encode_data_uri(data, mime_type):
    """Build a base64 ``data:`` URI for *data* with the given MIME type."""
    b64 = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, b64)
2267 def age_restricted(content_limit, age_limit):
2268 """ Returns True iff the content should be blocked """
2270 if age_limit is None: # No limit set
2272 if content_limit is None:
2273 return False # Content available for everyone
2274 return age_limit < content_limit
2277 def is_html(first_bytes):
2278 """ Detect whether a file contains HTML by examining its first bytes. """
2281 (b'\xef\xbb\xbf', 'utf-8'),
2282 (b'\x00\x00\xfe\xff', 'utf-32-be'),
2283 (b'\xff\xfe\x00\x00', 'utf-32-le'),
2284 (b'\xff\xfe', 'utf-16-le'),
2285 (b'\xfe\xff', 'utf-16-be'),
2287 for bom, enc in BOMS:
2288 if first_bytes.startswith(bom):
2289 s = first_bytes[len(bom):].decode(enc, 'replace')
2292 s = first_bytes.decode('utf-8', 'replace')
2294 return re.match(r'^\s*<', s)
2297 def determine_protocol(info_dict):
2298 protocol = info_dict.get('protocol')
2299 if protocol is not None:
2302 url = info_dict['url']
2303 if url.startswith('rtmp'):
2305 elif url.startswith('mms'):
2307 elif url.startswith('rtsp'):
2310 ext = determine_ext(url)
2316 return compat_urllib_parse_urlparse(url).scheme
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    table = [header_row] + data
    # The widest cell in each column determines that column's width.
    widths = []
    for column in zip(*table):
        widths.append(max(len(compat_str(cell)) for cell in column))
    # Left-align every column but the last, padded one wider than the data;
    # the final column is emitted as-is (no trailing padding).
    col_formats = ['%-' + compat_str(w + 1) + 's' for w in widths[:-1]]
    format_str = ' '.join(col_formats) + '%s'
    rendered_rows = [format_str % tuple(row) for row in table]
    return '\n'.join(rendered_rows)
2327 def _match_one(filter_part, dct):
2328 COMPARISON_OPERATORS = {
2336 operator_rex = re.compile(r'''(?x)\s*
2338 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
2340 (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
2341 (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
2344 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
2345 m = operator_rex.search(filter_part)
2347 op = COMPARISON_OPERATORS[m.group('op')]
2348 if m.group('strval') is not None:
2349 if m.group('op') not in ('=', '!='):
2351 'Operator %s does not support string values!' % m.group('op'))
2352 comparison_value = m.group('strval')
2355 comparison_value = int(m.group('intval'))
2357 comparison_value = parse_filesize(m.group('intval'))
2358 if comparison_value is None:
2359 comparison_value = parse_filesize(m.group('intval') + 'B')
2360 if comparison_value is None:
2362 'Invalid integer value %r in filter part %r' % (
2363 m.group('intval'), filter_part))
2364 actual_value = dct.get(m.group('key'))
2365 if actual_value is None:
2366 return m.group('none_inclusive')
2367 return op(actual_value, comparison_value)
2370 '': lambda v: v is not None,
2371 '!': lambda v: v is None,
2373 operator_rex = re.compile(r'''(?x)\s*
2374 (?P<op>%s)\s*(?P<key>[a-z_]+)
2376 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
2377 m = operator_rex.search(filter_part)
2379 op = UNARY_OPERATORS[m.group('op')]
2380 actual_value = dct.get(m.group('key'))
2381 return op(actual_value)
2383 raise ValueError('Invalid filter part %r' % filter_part)
2386 def match_str(filter_str, dct):
2387 """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
2390 _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
2393 def match_filter_func(filter_str):
2394 def _match_func(info_dict):
2395 if match_str(filter_str, info_dict):
2398 video_title = info_dict.get('title', info_dict.get('id', 'video'))
2399 return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
2403 def parse_dfxp_time_expr(time_expr):
2407 mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
2409 return float(mobj.group('time_offset'))
2411 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
2413 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
def srt_subtitles_timecode(seconds):
    """Format a duration in seconds as an SRT timecode (HH:MM:SS,mmm)."""
    hrs = seconds / 3600
    mins = (seconds % 3600) / 60
    secs = seconds % 60
    # %d truncates the fractional parts, so only the millisecond field
    # carries the sub-second remainder.
    millis = (seconds % 1) * 1000
    return '%02d:%02d:%02d,%03d' % (hrs, mins, secs, millis)
2420 def dfxp2srt(dfxp_data):
2421 _x = functools.partial(xpath_with_ns, ns_map={
2422 'ttml': 'http://www.w3.org/ns/ttml',
2423 'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
2424 'ttaf1_0604': 'http://www.w3.org/2006/04/ttaf1',
2427 class TTMLPElementParser(object):
2430 def start(self, tag, attrib):
2431 if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
2437 def data(self, data):
2441 return self.out.strip()
2443 def parse_node(node):
2444 target = TTMLPElementParser()
2445 parser = xml.etree.ElementTree.XMLParser(target=target)
2446 parser.feed(xml.etree.ElementTree.tostring(node))
2447 return parser.close()
2449 dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
2451 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall(_x('.//ttaf1_0604:p')) or dfxp.findall('.//p')
2454 raise ValueError('Invalid dfxp/TTML subtitle')
2456 for para, index in zip(paras, itertools.count(1)):
2457 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
2458 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
2459 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
2460 if begin_time is None:
2465 end_time = begin_time + dur
2466 out.append('%d\n%s --> %s\n%s\n\n' % (
2468 srt_subtitles_timecode(begin_time),
2469 srt_subtitles_timecode(end_time),
2475 def cli_option(params, command_option, param):
2476 param = params.get(param)
2478 param = compat_str(param)
2479 return [command_option, param] if param is not None else []
2482 def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
2483 param = params.get(param)
2484 assert isinstance(param, bool)
2486 return [command_option + separator + (true_value if param else false_value)]
2487 return [command_option, true_value if param else false_value]
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Emit *command_option* (a bare flag, no value) iff ``params[param]``
    equals *expected_value*; otherwise emit nothing."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
2495 def cli_configuration_args(params, param, default=[]):
2496 ex_args = params.get(param)
2499 assert isinstance(ex_args, list)
2503 class ISO639Utils(object):
2504 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        # dict.get(): unknown codes yield None.  Inputs longer than two
        # characters (e.g. 'en-GB') are truncated to the 639-1 prefix
        # before the lookup.
        return cls._lang_map.get(code[:2])
2698 def long2short(cls, code):
2699 """Convert language code from ISO 639-2/T to ISO 639-1"""
2700 for short_name, long_name in cls._lang_map.items():
2701 if long_name == code:
2705 class ISO3166Utils(object):
2706 # From http://data.okfn.org/data/core/country-list
2708 'AF': 'Afghanistan',
2709 'AX': 'Åland Islands',
2712 'AS': 'American Samoa',
2717 'AG': 'Antigua and Barbuda',
2734 'BO': 'Bolivia, Plurinational State of',
2735 'BQ': 'Bonaire, Sint Eustatius and Saba',
2736 'BA': 'Bosnia and Herzegovina',
2738 'BV': 'Bouvet Island',
2740 'IO': 'British Indian Ocean Territory',
2741 'BN': 'Brunei Darussalam',
2743 'BF': 'Burkina Faso',
2749 'KY': 'Cayman Islands',
2750 'CF': 'Central African Republic',
2754 'CX': 'Christmas Island',
2755 'CC': 'Cocos (Keeling) Islands',
2759 'CD': 'Congo, the Democratic Republic of the',
2760 'CK': 'Cook Islands',
2762 'CI': 'Côte d\'Ivoire',
2767 'CZ': 'Czech Republic',
2771 'DO': 'Dominican Republic',
2774 'SV': 'El Salvador',
2775 'GQ': 'Equatorial Guinea',
2779 'FK': 'Falkland Islands (Malvinas)',
2780 'FO': 'Faroe Islands',
2784 'GF': 'French Guiana',
2785 'PF': 'French Polynesia',
2786 'TF': 'French Southern Territories',
2801 'GW': 'Guinea-Bissau',
2804 'HM': 'Heard Island and McDonald Islands',
2805 'VA': 'Holy See (Vatican City State)',
2812 'IR': 'Iran, Islamic Republic of',
2815 'IM': 'Isle of Man',
2825 'KP': 'Korea, Democratic People\'s Republic of',
2826 'KR': 'Korea, Republic of',
2829 'LA': 'Lao People\'s Democratic Republic',
2835 'LI': 'Liechtenstein',
2839 'MK': 'Macedonia, the Former Yugoslav Republic of',
2846 'MH': 'Marshall Islands',
2852 'FM': 'Micronesia, Federated States of',
2853 'MD': 'Moldova, Republic of',
2864 'NL': 'Netherlands',
2865 'NC': 'New Caledonia',
2866 'NZ': 'New Zealand',
2871 'NF': 'Norfolk Island',
2872 'MP': 'Northern Mariana Islands',
2877 'PS': 'Palestine, State of',
2879 'PG': 'Papua New Guinea',
2882 'PH': 'Philippines',
2886 'PR': 'Puerto Rico',
2890 'RU': 'Russian Federation',
2892 'BL': 'Saint Barthélemy',
2893 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
2894 'KN': 'Saint Kitts and Nevis',
2895 'LC': 'Saint Lucia',
2896 'MF': 'Saint Martin (French part)',
2897 'PM': 'Saint Pierre and Miquelon',
2898 'VC': 'Saint Vincent and the Grenadines',
2901 'ST': 'Sao Tome and Principe',
2902 'SA': 'Saudi Arabia',
2906 'SL': 'Sierra Leone',
2908 'SX': 'Sint Maarten (Dutch part)',
2911 'SB': 'Solomon Islands',
2913 'ZA': 'South Africa',
2914 'GS': 'South Georgia and the South Sandwich Islands',
2915 'SS': 'South Sudan',
2920 'SJ': 'Svalbard and Jan Mayen',
2923 'CH': 'Switzerland',
2924 'SY': 'Syrian Arab Republic',
2925 'TW': 'Taiwan, Province of China',
2927 'TZ': 'Tanzania, United Republic of',
2929 'TL': 'Timor-Leste',
2933 'TT': 'Trinidad and Tobago',
2936 'TM': 'Turkmenistan',
2937 'TC': 'Turks and Caicos Islands',
2941 'AE': 'United Arab Emirates',
2942 'GB': 'United Kingdom',
2943 'US': 'United States',
2944 'UM': 'United States Minor Outlying Islands',
2948 'VE': 'Venezuela, Bolivarian Republic of',
2950 'VG': 'Virgin Islands, British',
2951 'VI': 'Virgin Islands, U.S.',
2952 'WF': 'Wallis and Futuna',
2953 'EH': 'Western Sahara',
    def short2full(cls, code):
        """Convert an ISO 3166-2 country code to the corresponding full name"""
        # Upper-casing makes the lookup case-insensitive; unknown codes
        # yield None via dict.get().
        return cls._country_map.get(code.upper())
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler that honours a per-request 'Ytdl-request-proxy' header.

    The header value overrides the handler-level proxy for that single
    request; the special value '__noproxy__' disables proxying entirely.
    """

    def __init__(self, proxies=None):
        # Install http/https openers unconditionally so proxy_open is
        # consulted even when no default proxy is configured for a scheme.
        for type in ('http', 'https'):
            setattr(self, '%s_open' % type,
                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
                        meth(r, proxy, type))
        return compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            # Per-request proxy overrides the default; strip the internal
            # header so it is not sent over the wire.
            proxy = req_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # youtube-dl's http/https handlers do the wrapping of the
            # socket with socks themselves, so stop here.
            return None
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
def ohdave_rsa_encrypt(data, exponent, modulus):
    """
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    """
    # The bytes are reversed before conversion (little-endian payload),
    # matching OHDave's JavaScript implementation.
    payload = int(binascii.hexlify(data[::-1]), 16)
    encrypted = pow(payload, exponent, modulus)
    return '%x' % encrypted
def encode_base_n(num, n, table=None):
    """Encode the non-negative integer num in base n.

    table supplies the digit alphabet; when omitted, the first n characters
    of 0-9a-zA-Z are used (so n may be at most 62 in that case).
    Raises ValueError when n exceeds the table length.
    """
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num == 0:
        return table[0]

    ret = ''
    while num:
        # Emit least-significant digit first, prepending so the result
        # reads most-significant digit first.
        ret = table[num % n] + ret
        num = num // n
    return ret
def decode_packed_codes(code):
    """Unpack JavaScript obfuscated with Dean Edwards' p.a.c.k.e.r.

    The packed payload replaces every identifier with its index encoded in
    base `base`; `symbols` is the '|'-separated table used to restore them.
    """
    mobj = re.search(PACKED_CODES_RE, code)
    obfuscated_code, base, count, symbols = mobj.groups()
    base = int(base)
    count = int(count)
    symbols = symbols.split('|')
    symbol_table = {}

    while count:
        count -= 1
        base_n_count = encode_base_n(count, base)
        # An empty table entry means the token stands for itself.
        symbol_table[base_n_count] = symbols[count] or base_n_count

    return re.sub(
        r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
        obfuscated_code)
def parse_m3u8_attributes(attrib):
    """Parse an M3U8/HLS attribute list string into a dict.

    Values may be double-quoted (commas inside quotes are preserved);
    the surrounding quotes are stripped from the returned values.
    """
    info = {}
    for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
        if val.startswith('"'):
            val = val[1:-1]
        info[key] = val
    return info
def urshift(val, n):
    """Logical (unsigned) right shift of val by n bits, 32-bit semantics."""
    # Negative values are first mapped to their 32-bit two's-complement
    # representation so the shift fills with zero bits, not sign bits.
    if val < 0:
        val += 0x100000000
    return val >> n
# Based on png2str() written by @gdkchan and improved by @yokrysty
# Originally posted at https://github.com/rg3/youtube-dl/issues/9706
def decode_png(png_data):
    """Decode a (24-bit RGB, non-interlaced) PNG into raw pixel bytes.

    Returns (width, height, pixels) where pixels is a list of rows, each a
    flat list of byte values (3 per pixel). Raises IOError on invalid input.
    """
    # Reference: https://www.w3.org/TR/PNG/
    header = png_data[8:]

    if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
        raise IOError('Not a valid PNG file.')

    int_map = {1: '>B', 2: '>H', 4: '>I'}
    unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]

    chunks = []

    # Walk the chunk stream: 4-byte length, 4-byte type, data, 4-byte CRC.
    while header:
        length = unpack_integer(header[:4])
        header = header[4:]

        chunk_type = header[:4]
        header = header[4:]

        chunk_data = header[:length]
        header = header[length:]

        header = header[4:]  # Skip CRC

        chunks.append({
            'type': chunk_type,
            'length': length,
            'data': chunk_data,
        })

    # IHDR is required to be the first chunk.
    ihdr = chunks[0]['data']

    width = unpack_integer(ihdr[:4])
    height = unpack_integer(ihdr[4:8])

    # Image data may be split across several IDAT chunks; concatenate them
    # before decompressing.
    idat = b''

    for chunk in chunks:
        if chunk['type'] == b'IDAT':
            idat += chunk['data']

    if not idat:
        raise IOError('Unable to read PNG data.')

    decompressed_data = bytearray(zlib.decompress(idat))

    stride = width * 3  # 3 bytes (RGB) per pixel per scanline
    pixels = []

    def _get_pixel(idx):
        # Fetch an already-decoded byte by flat index into the pixel grid.
        x = idx % stride
        y = idx // stride
        return pixels[y][x]

    for y in range(height):
        # Each scanline is prefixed with a 1-byte filter type.
        basePos = y * (1 + stride)
        filter_type = decompressed_data[basePos]

        current_row = []

        pixels.append(current_row)

        for x in range(stride):
            color = decompressed_data[1 + basePos + x]
            basex = y * stride + x
            left = 0
            up = 0

            if x > 2:
                left = _get_pixel(basex - 3)
            if y > 0:
                up = _get_pixel(basex - stride)

            if filter_type == 1:  # Sub
                color = (color + left) & 0xff
            elif filter_type == 2:  # Up
                color = (color + up) & 0xff
            elif filter_type == 3:  # Average
                color = (color + ((left + up) >> 1)) & 0xff
            elif filter_type == 4:  # Paeth
                a = left
                b = up
                c = 0

                if x > 2 and y > 0:
                    c = _get_pixel(basex - stride - 3)

                p = a + b - c

                pa = abs(p - a)
                pb = abs(p - b)
                pc = abs(p - c)

                if pa <= pb and pa <= pc:
                    color = (color + a) & 0xff
                elif pb <= pc:
                    color = (color + b) & 0xff
                else:
                    color = (color + c) & 0xff

            current_row.append(color)

    return width, height, pixels
def write_xattr(path, key, value):
    """Set the extended attribute key on path to value (bytes).

    Tries, in order: the pyxattr/xattr Python modules, NTFS Alternate Data
    Streams on Windows, and the setfattr/xattr command-line tools.
    Raises XAttrUnavailableError when no backend exists and
    XAttrMetadataError when a backend fails.
    """
    # This mess below finds the best xattr tool for the job
    try:
        # try the pyxattr module...
        import xattr

        if hasattr(xattr, 'set'):  # pyxattr
            # Unicode arguments are not supported in python-pyxattr until
            # version 0.5.0
            # See https://github.com/rg3/youtube-dl/issues/5498
            pyxattr_required_version = '0.5.0'
            if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
                # TODO: fallback to CLI tools
                raise XAttrUnavailableError(
                    'python-pyxattr is detected but is too old. '
                    'youtube-dl requires %s or above while your version is %s. '
                    'Falling back to other xattr implementations' % (
                        pyxattr_required_version, xattr.__version__))

            setxattr = xattr.set
        else:  # xattr module
            setxattr = xattr.setxattr

        try:
            setxattr(path, key, value)
        except EnvironmentError as e:
            raise XAttrMetadataError(e.errno, e.strerror)

    except ImportError:
        if compat_os_name == 'nt':
            # Write xattrs to NTFS Alternate Data Streams:
            # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
            assert ':' not in key
            assert os.path.exists(path)

            ads_fn = path + ':' + key
            try:
                with open(ads_fn, 'wb') as f:
                    f.write(value)
            except EnvironmentError as e:
                raise XAttrMetadataError(e.errno, e.strerror)
        else:
            user_has_setfattr = check_executable('setfattr', ['--version'])
            user_has_xattr = check_executable('xattr', ['-h'])

            if user_has_setfattr or user_has_xattr:

                # CLI tools take the value as a text argument.
                value = value.decode('utf-8')
                if user_has_setfattr:
                    executable = 'setfattr'
                    opts = ['-n', key, '-v', value]
                elif user_has_xattr:
                    executable = 'xattr'
                    opts = ['-w', key, value]

                cmd = ([encodeFilename(executable, True)] +
                       [encodeArgument(o) for o in opts] +
                       [encodeFilename(path, True)])

                try:
                    p = subprocess.Popen(
                        cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
                except EnvironmentError as e:
                    raise XAttrMetadataError(e.errno, e.strerror)
                stdout, stderr = p.communicate()
                stderr = stderr.decode('utf-8', 'replace')
                if p.returncode != 0:
                    raise XAttrMetadataError(p.returncode, stderr)

            else:
                # On Unix, and can't find pyxattr, setfattr, or xattr.
                if sys.platform.startswith('linux'):
                    raise XAttrUnavailableError(
                        "Couldn't find a tool to set the xattrs. "
                        "Install either the python 'pyxattr' or 'xattr' "
                        "modules, or the GNU 'attr' package "
                        "(which contains the 'setfattr' tool).")
                else:
                    raise XAttrUnavailableError(
                        "Couldn't find a tool to set the xattrs. "
                        "Install either the python 'xattr' module, "
                        "or the 'xattr' binary.")