2 # -*- coding: utf-8 -*-
4 from __future__ import unicode_literals
33 import xml.etree.ElementTree
40 compat_etree_fromstring,
42 compat_html_entities_html5,
48 compat_socket_create_connection,
54 compat_urllib_parse_urlencode,
55 compat_urllib_parse_urlparse,
56 compat_urllib_parse_unquote_plus,
57 compat_urllib_request,
def register_socks_protocols():
    """Teach urlparse that SOCKS schemes carry a netloc.

    In Python < 2.6.5, urlsplit() suffers from bug
    https://bugs.python.org/issue7904: URLs whose scheme is not listed in
    urlparse.uses_netloc are not handled correctly.
    """
    known_schemes = compat_urlparse.uses_netloc
    for socks_scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if socks_scheme not in known_schemes:
            known_schemes.append(socks_scheme)
# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))

# Default HTTP headers attached to every outgoing request.
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/47.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

# Month-name tables keyed by language code; used by the date parsers below.
    'en': ENGLISH_MONTH_NAMES,
    'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
    'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],

# Media container/codec extensions (KNOWN_EXTENSIONS is consulted by
# determine_ext() below).
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'f4f', 'f4m', 'm3u8', 'smil')

# needed for sanitizing filenames in restricted mode
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy')))

# strptime() patterns tried in order when parsing free-form dates.
    '%b %d %Y at %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S.%f',

# Variants preferring day-first interpretation of ambiguous dates.
DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([

# Variants preferring month-first interpretation of ambiguous dates.
DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    pref = locale.getpreferredencoding()
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()
        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object
        path_basename = lambda f: os.path.basename(fn).decode(encoding)
        # the same for os.path.dirname
        path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
        path_basename = os.path.basename
        path_dirname = os.path.dirname

        # Temp file is created next to the target so the final rename stays
        # on the same filesystem.
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):

    tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))

    if sys.platform == 'win32':
        # Need to remove existing file on Windows, else os.rename raises
        # WindowsError or FileExistsError.
    os.rename(tf.name, fn)
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val] """
        assert re.match(r'^[a-zA-Z_-]+$', key)
        expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
        return node.find(expr)
    # Fallback for older Pythons: emulate the attribute predicate by hand.
    def find_xpath_attr(node, xpath, key, val=None):
        for f in node.findall(compat_xpath(xpath)):
            if key not in f.attrib:
            if val is None or f.attrib.get(key) == val:
# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter

def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' components of *path* to '{uri}tag' via ns_map."""
    components = [c.split(':') for c in path.split('/')]
        # Component without a namespace prefix: keep as-is.
        replaced.append(c[0])
        replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Find a subelement; honor *default* / raise ExtractorError when fatal."""
    def _find_xpath(xpath):
        return node.find(compat_xpath(xpath))

    if isinstance(xpath, (str, compat_str)):
        n = _find_xpath(xpath)
    if default is not NO_DEFAULT:
    name = xpath if name is None else name
    raise ExtractorError('Could not find XML element %s' % name)
def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Like xpath_element(), but yield the matched element's text."""
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
    if default is not NO_DEFAULT:
    name = xpath if name is None else name
    raise ExtractorError('Could not find XML element\'s text %s' % name)
def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    """Return attribute *key* of the element matched by xpath[@key]."""
    n = find_xpath_attr(node, xpath, key)
    if default is not NO_DEFAULT:
    name = '%s[@%s]' % (xpath, key) if name is None else name
    raise ExtractorError('Could not find XML attribute %s' % name)
def get_element_by_id(id, html):
    """Return the inner content of the element whose id attribute equals *id*."""
    attribute, value = 'id', id
    return get_element_by_attribute(attribute, value, html)
def get_element_by_class(class_name, html):
    """Return the content of the first tag carrying the given CSS class."""
    # Match the class token anywhere inside the (possibly multi-valued)
    # class attribute; the pattern is already escaped, so don't re-escape.
    class_pattern = r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name)
    return get_element_by_attribute('class', class_pattern, html, escape_value=False)
def get_element_by_attribute(attribute, value, html, escape_value=True):
    """Return the content of the tag with the specified attribute in the passed HTML document"""

    value = re.escape(value) if escape_value else value

    m = re.search(r'''(?xs)
        (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
        (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
        ''' % (re.escape(attribute), value), html)

    res = m.group('content')

    if res.startswith('"') or res.startswith("'"):

    # Decode HTML entities in the extracted content before returning.
    return unescapeHTML(res)
class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""
        compat_HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        # Record the attributes of the start tag that was fed in.
        self.attrs = dict(attrs)
def extract_attributes(html_element):
    """Given a string for an HTML element such as
        a="foo" B="bar" c="&98;az" d=boz
        empty= noval entity="&amp;"

    Decode and return a dictionary of attributes, e.g.:
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''

    NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
    but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
    """
    parser = HTMLAttributeParser()
    parser.feed(html_element)
def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.

    # Normalize line breaks: literal newlines become spaces, <br> and
    # paragraph boundaries become newlines.
    html = html.replace('\n', ' ')
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Strip the remaining tags.
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    if sys.platform == 'win32':
        # Force binary stdout on Windows so byte output is not mangled.
        msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
    stream = open(encodeFilename(filename), open_mode)
    return (stream, filename)
    except (IOError, OSError) as err:
        if err.errno in (errno.EACCES,):
        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
        # An exception here should be caught in the caller
        stream = open(encodeFilename(alt_filename), open_mode)
        return (stream, alt_filename)
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""

    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        # Transliterate accented characters in restricted mode.
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        if char == '?' or ord(char) < 32 or ord(char) == 127:
            return '' if restricted else '\''
            return '_-' if restricted else ' -'
        elif char in '\\/|*<>':
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
        if restricted and ord(char) > 127:

    # Keep timestamps like 12:34:56 readable by mapping ':' to '_'.
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(map(replace_insane, s))
        # Collapse runs of '_' produced by the substitutions above.
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
def sanitize_path(s):
    """Sanitizes and normalizes path on Windows"""
    if sys.platform != 'win32':
    drive_or_unc, _ = os.path.splitdrive(s)
    if sys.version_info < (2, 7) and not drive_or_unc:
        drive_or_unc, _ = os.path.splitunc(s)
    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    # Replace characters Windows forbids in path components; keep '.'/'..'.
    path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|[\s.]$)', '#', path_part)
    for path_part in norm_path]
    sanitized_path.insert(0, drive_or_unc + os.path.sep)
    return os.path.join(*sanitized_path)
# Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
# unwanted failures due to missing protocol
def sanitize_url(url):
    """Give protocol-relative URLs (//host/...) an explicit http: scheme."""
    if url.startswith('//'):
        return 'http:%s' % url
    return url
def sanitized_Request(url, *args, **kwargs):
    """Build a urllib Request after normalizing a protocol-relative URL."""
    safe_url = sanitize_url(url)
    return compat_urllib_request.Request(safe_url, *args, **kwargs)
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable (order-preserving) """
def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character."""
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon. For example,
    # '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in compat_html_entities_html5:
        return compat_html_entities_html5[entity_with_semicolon]

    # Numeric entity: decimal '#NNN' or hexadecimal '#xNNN'.
    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            numstr = '0%s' % numstr
        # See https://github.com/rg3/youtube-dl/issues/7518
        return compat_chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity
    assert type(s) == compat_str

    # Decode every '&...;' entity via _htmlentity_transform.
        r'&([^;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
def get_subprocess_encoding():
    """Pick the byte encoding used when talking to subprocesses."""
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
        encoding = sys.getfilesystemencoding()
def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    @param for_subprocess Whether the name is destined for a subprocess call
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):

    # Pass '' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:

    # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
    if sys.platform.startswith('java'):

    return s.encode(get_subprocess_encoding(), 'ignore')
def decodeFilename(b, for_subprocess=False):
    """Decode a filename back to text using the subprocess encoding."""
    if sys.version_info >= (3, 0):

    if not isinstance(b, bytes):

    return b.decode(get_subprocess_encoding(), 'ignore')
def encodeArgument(s):
    """Encode a subprocess argument the same way file names are encoded."""
    if isinstance(s, compat_str):
        arg = s
    else:
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        arg = s.decode('ascii')
    return encodeFilename(arg, True)
def decodeArgument(b):
    """Decode a subprocess argument using the subprocess encoding."""
    return decodeFilename(b, for_subprocess=True)
def decodeOption(optval):
    # Normalize a command-line option value to text.
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
def formatSeconds(secs):
    """Format a duration in seconds as H:MM:SS or M:SS."""
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
        return '%d:%02d' % (secs // 60, secs % 60)
def make_HTTPS_handler(params, **kwargs):
    """Build an HTTPS handler honoring the 'nocheckcertificate' option."""
    opts_no_check_certificate = params.get('nocheckcertificate', False)
    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if opts_no_check_certificate:
            # Disable both hostname and certificate verification.
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
    # (create_default_context present but HTTPSHandler has no context=)

    if sys.version_info < (3, 2):
        return YoutubeDLHTTPSHandler(params, **kwargs)
    context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
    context.verify_mode = (ssl.CERT_NONE
                           if opts_no_check_certificate
                           else ssl.CERT_REQUIRED)
    context.set_default_verify_paths()
    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
def bug_reports_message():
    """Build the standard "please report this issue" suffix for error messages."""
    if ytdl_is_updateable():
        update_cmd = 'type youtube-dl -U to update'
        update_cmd = 'see https://yt-dl.org/update on how to update'
    msg = '; please report this issue on https://yt-dl.org/bug .'
    msg += ' Make sure you are using the latest version; %s.' % update_cmd
    msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """
        # Network-level failures are treated as "expected" (not bugs).
        if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
        if video_id is not None:
            msg = video_id + ': ' + msg
            msg += ' (caused by %r)' % cause
            msg += bug_reports_message()
        super(ExtractorError, self).__init__(msg)
        self.exc_info = sys.exc_info()  # preserve original exception
        self.video_id = video_id

    def format_traceback(self):
        # Render the saved traceback, if any, as a printable string.
        if self.traceback is None:
        return ''.join(traceback.format_tb(self.traceback))
class UnsupportedError(ExtractorError):
    """Raised for URLs that no extractor knows how to handle."""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super(UnsupportedError, self).__init__(message, expected=True)
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        # Byte counts kept for diagnostics by the caller.
        self.downloaded = downloaded
        self.expected = expected
class XAttrMetadataError(Exception):
    """Raised when writing extended attributes fails; classifies the cause in self.reason."""

    def __init__(self, code=None, msg='Unknown error'):
        super(XAttrMetadataError, self).__init__(msg)

        # Parsing code and msg
        if (self.code in (errno.ENOSPC, errno.EDQUOT) or
                'No space left' in self.msg or 'Disk quota excedded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
            self.reason = 'NOT_SUPPORTED'
class XAttrUnavailableError(Exception):
    """Raised when extended attributes cannot be used at all
    (presumably missing platform/tool support — TODO confirm against callers)."""
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
    # expected HTTP responses to meet HTTP/1.0 or later (see also
    # https://github.com/rg3/youtube-dl/issues/6727)
    if sys.version_info < (3, 0):
        kwargs[b'strict'] = True
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')
    if source_address is not None:
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
            # Fallback: patch connect() to bind the source address by hand.
            def _hc_connect(self, *args, **kwargs):
                sock = compat_socket_create_connection(
                    (self.host, self.port), self.timeout, sa)
                    self.sock = ssl.wrap_socket(
                        sock, self.key_file, self.cert_file,
                        ssl_version=ssl.PROTOCOL_TLSv1)
            hc.connect = functools.partial(_hc_connect, hc)
def handle_youtubedl_headers(headers):
    """Strip the internal 'Youtubedl-no-compression' marker header.

    When the marker is present, return a copy of *headers* without the
    marker and without any Accept-Encoding header (case-insensitive);
    otherwise return *headers* unchanged.
    """
    if 'Youtubedl-no-compression' not in headers:
        return headers
    filtered = dict(
        (name, value) for name, value in headers.items()
        if name.lower() != 'accept-encoding')
    filtered.pop('Youtubedl-no-compression')
    return filtered
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:
    http://techknack.net/python-urllib2-handlers/
    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        conn_class = compat_http_client.HTTPConnection

        # Route through a SOCKS proxy when one was attached to the request.
        socks_proxy = req.headers.get('Ytdl-socks-proxy')
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),

        # Try a raw deflate stream first, then a zlib-wrapped one.
        return zlib.decompress(data, -zlib.MAX_WBITS)
            return zlib.decompress(data)

    def addinfourl_wrapper(stream, headers, url, code):
        # Compatibility shim: older urllibs lack the code argument/getcode().
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:

        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

    def http_response(self, req, resp):
        # Transparently decompress gzip-encoded bodies.
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk add the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Transparently decompress deflate-encoded bodies.
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/rg3/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                    location = location.decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    if sys.version_info < (3, 0):
                        location_escaped = location_escaped.encode('utf-8')
                    resp.headers['Location'] = location_escaped

    https_request = http_request
    https_response = http_response
def make_socks_conn_class(base_class, socks_proxy):
    """Derive a connection class that tunnels through the given SOCKS proxy URL."""
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    if url_components.scheme.lower() == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif url_components.scheme.lower() in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif url_components.scheme.lower() == 'socks4a':
        socks_type = ProxyType.SOCKS4A

    def unquote_if_non_empty(s):
        return compat_urllib_parse_unquote_plus(s)

    # Arguments later splatted into sockssocket.setproxy(); port defaults to 1080.
        url_components.hostname, url_components.port or 1080,
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),

    class SocksConnection(base_class):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if type(self.timeout) in (int, float):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS handler with configurable connection class and SOCKS support."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        conn_class = self._https_conn_class

        # Forward SSL context / hostname checking when this handler has them.
        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, True),
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """Cookie processor that also mirrors cookie handling onto HTTPS."""

    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/rg3/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
def extract_timezone(date_str):
    """Extract a trailing timezone designator (Z or [+-]HH:MM) from date_str.

    Returns (timedelta, date_str_without_timezone)."""
        r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
        timezone = datetime.timedelta()
        date_str = date_str[:-len(m.group('tz'))]
        if not m.group('sign'):
            # Bare 'Z' designator: UTC, zero offset.
            timezone = datetime.timedelta()
            sign = 1 if m.group('sign') == '+' else -1
            timezone = datetime.timedelta(
                hours=sign * int(m.group('hours')),
                minutes=sign * int(m.group('minutes')))
    return timezone, date_str
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """

    if date_str is None:

    # Strip fractional seconds before parsing.
    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

        date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
        dt = datetime.datetime.strptime(date_str, date_format) - timezone
        return calendar.timegm(dt.timetuple())
def date_formats(day_first=True):
    """Return the strptime pattern list matching the requested day/month order."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""

    if date_str is None:

    # Commas are noise for the strptime patterns below.
    date_str = date_str.replace(',', ' ')
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
    _, date_str = extract_timezone(date_str)

    for expression in date_formats(day_first):
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
    if upload_date is None:
        # Fallback to the RFC 2822 parser.
        timetuple = email.utils.parsedate_tz(date_str)
                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    if upload_date is not None:
        return compat_str(upload_date)
def unified_timestamp(date_str, day_first=True):
    """Parse a free-form date string into a UNIX timestamp."""
    if date_str is None:

    date_str = date_str.replace(',', ' ')

    # Remember a 12-hour PM shift before the marker is stripped below.
    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    for expression in date_formats(day_first):
            dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
            return calendar.timegm(dt.timetuple())

    timetuple = email.utils.parsedate_tz(date_str)
        return calendar.timegm(timetuple) + pm_delta * 3600
def determine_ext(url, default_ext='unknown_video'):
    """Guess a media extension from the last path component of *url*."""
    guess = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    elif guess.rstrip('/') in KNOWN_EXTENSIONS:
        return guess.rstrip('/')
def subtitles_filename(filename, sub_lang, sub_format):
    """Derive the subtitle file name <base>.<lang>.<format> from the media file name."""
    base = filename.rsplit('.', 1)[0]
    return '%s.%s.%s' % (base, sub_lang, sub_format)
def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        unit = match.group('unit')
        # A bad approximation?
        elif unit == 'year':
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, '%Y%m%d').date()
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is not None:
        return '-'.join(match.groups())
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        if start is not None:
            self.start = date_from_str(start)
            # Open-ended bounds default to the minimum/maximum representable dates.
            self.start = datetime.datetime.min.date()
            self.end = date_from_str(end)
            self.end = datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
def platform_name():
    """ Returns the platform name as a compat_str """
    res = platform.platform()
    if isinstance(res, bytes):
        res = res.decode(preferredencoding())

    assert isinstance(res, compat_str)
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # Map C-level file descriptors to the Win32 standard handle IDs
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b'GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b'GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # A handle is "not a console" when it is invalid, remote, or
        # GetConsoleMode fails on it
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character outside the Basic Multilingual Plane
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        count = min(next_nonbmp_pos(s), 1024)
        # count == 0 means the next char is non-BMP: write its surrogate
        # pair (2 UTF-16 code units) in one call
        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret is None:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            s = s[written.value:]
    return True
def write_string(s, out=None, encoding=None):
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        if _windows_write_string(s, out):
            return

    if ('b' in getattr(out, 'mode', '') or
            sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
        byt = s.encode(encoding or preferredencoding(), 'ignore')
        out.write(byt)
    elif hasattr(out, 'buffer'):
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        byt = s.encode(enc, 'ignore')
        out.buffer.write(byt)
    else:
        out.write(s)
    out.flush()
def bytes_to_intlist(bs):
    """Convert a bytes/str object into a list of byte values."""
    if not bs:
        return []
    if isinstance(bs[0], int):  # Python 3
        return list(bs)
    else:
        return [ord(c) for c in bs]
def intlist_to_bytes(xs):
    """Pack a list of byte values (0-255) back into a bytes object."""
    if not xs:
        return b''
    return compat_struct_pack('%dB' % len(xs), *xs)
1361 # Cross-platform file locking
1362 if sys.platform == 'win32':
1363 import ctypes.wintypes
1366 class OVERLAPPED(ctypes.Structure):
1368 ('Internal', ctypes.wintypes.LPVOID),
1369 ('InternalHigh', ctypes.wintypes.LPVOID),
1370 ('Offset', ctypes.wintypes.DWORD),
1371 ('OffsetHigh', ctypes.wintypes.DWORD),
1372 ('hEvent', ctypes.wintypes.HANDLE),
1375 kernel32 = ctypes.windll.kernel32
1376 LockFileEx = kernel32.LockFileEx
1377 LockFileEx.argtypes = [
1378 ctypes.wintypes.HANDLE, # hFile
1379 ctypes.wintypes.DWORD, # dwFlags
1380 ctypes.wintypes.DWORD, # dwReserved
1381 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1382 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1383 ctypes.POINTER(OVERLAPPED) # Overlapped
1385 LockFileEx.restype = ctypes.wintypes.BOOL
1386 UnlockFileEx = kernel32.UnlockFileEx
1387 UnlockFileEx.argtypes = [
1388 ctypes.wintypes.HANDLE, # hFile
1389 ctypes.wintypes.DWORD, # dwReserved
1390 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1391 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1392 ctypes.POINTER(OVERLAPPED) # Overlapped
1394 UnlockFileEx.restype = ctypes.wintypes.BOOL
1395 whole_low = 0xffffffff
1396 whole_high = 0x7fffffff
1398 def _lock_file(f, exclusive):
1399 overlapped = OVERLAPPED()
1400 overlapped.Offset = 0
1401 overlapped.OffsetHigh = 0
1402 overlapped.hEvent = 0
1403 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
1404 handle = msvcrt.get_osfhandle(f.fileno())
1405 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
1406 whole_low, whole_high, f._lock_file_overlapped_p):
1407 raise OSError('Locking file failed: %r' % ctypes.FormatError())
1409 def _unlock_file(f):
1410 assert f._lock_file_overlapped_p
1411 handle = msvcrt.get_osfhandle(f.fileno())
1412 if not UnlockFileEx(handle, 0,
1413 whole_low, whole_high, f._lock_file_overlapped_p):
1414 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1417 # Some platforms, such as Jython, is missing fcntl
1421 def _lock_file(f, exclusive):
1422 fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
1424 def _unlock_file(f):
1425 fcntl.flock(f, fcntl.LOCK_UN)
1427 UNSUPPORTED_MSG = 'file locking is not supported on this platform'
1429 def _lock_file(f, exclusive):
1430 raise IOError(UNSUPPORTED_MSG)
1432 def _unlock_file(f):
1433 raise IOError(UNSUPPORTED_MSG)
class locked_file(object):
    """File wrapper that holds an advisory lock for the with-block."""

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        # Readers share the lock; writers need it exclusively
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except IOError:
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), defaulting to 'utf-8' when unset."""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
def shell_quote(args):
    """Join *args* into a single shell-safe command-line string."""
    quoted_args = []
    encoding = get_filesystem_encoding()
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(pipes.quote(a))
    return ' '.join(quoted_args)
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """

    # Merge with any data already smuggled into the URL
    url, idata = unsmuggle_url(url, {})
    data.update(idata)
    sdata = compat_urllib_parse_urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return url + '#' + sdata
def unsmuggle_url(smug_url, default=None):
    """Extract (url, data) smuggled by smuggle_url; data falls back to *default*."""
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
    data = json.loads(jsond)
    return url, data
def format_bytes(bytes):
    """Format a byte count as a human-readable string, e.g. '1.00KiB'."""
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    if bytes == 0.0:
        # log(0) is undefined
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
    suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
def lookup_unit_table(unit_table, s):
    """Parse '<number> <unit>' using *unit_table* multipliers; None when unparsable."""
    units_re = '|'.join(re.escape(u) for u in unit_table)
    m = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
    if not m:
        return None
    # Accept ',' as a decimal separator too
    num_str = m.group('num').replace(',', '.')
    mult = unit_table[m.group('unit')]
    return int(float(num_str) * mult)
def parse_filesize(s):
    """Parse a human file size like '5.4 MiB' into a number of bytes."""
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    _UNIT_TABLE = {
        'B': 1,
        'b': 1,
        'bytes': 1,
        'KiB': 1024,
        'KB': 1000,
        'kB': 1024,
        'Kb': 1000,
        'kb': 1000,
        'kilobytes': 1000,
        'kibibytes': 1024,
        'MiB': 1024 ** 2,
        'MB': 1000 ** 2,
        'mB': 1024 ** 2,
        'Mb': 1000 ** 2,
        'mb': 1000 ** 2,
        'megabytes': 1000 ** 2,
        'mebibytes': 1024 ** 2,
        'GiB': 1024 ** 3,
        'GB': 1000 ** 3,
        'gB': 1024 ** 3,
        'Gb': 1000 ** 3,
        'gb': 1000 ** 3,
        'gigabytes': 1000 ** 3,
        'gibibytes': 1024 ** 3,
        'TiB': 1024 ** 4,
        'TB': 1000 ** 4,
        'tB': 1024 ** 4,
        'Tb': 1000 ** 4,
        'tb': 1000 ** 4,
        'terabytes': 1000 ** 4,
        'tebibytes': 1024 ** 4,
        'PiB': 1024 ** 5,
        'PB': 1000 ** 5,
        'pB': 1024 ** 5,
        'Pb': 1000 ** 5,
        'pb': 1000 ** 5,
        'petabytes': 1000 ** 5,
        'pebibytes': 1024 ** 5,
        'EiB': 1024 ** 6,
        'EB': 1000 ** 6,
        'eB': 1024 ** 6,
        'Eb': 1000 ** 6,
        'eb': 1000 ** 6,
        'exabytes': 1000 ** 6,
        'exbibytes': 1024 ** 6,
        'ZiB': 1024 ** 7,
        'ZB': 1000 ** 7,
        'zB': 1024 ** 7,
        'Zb': 1000 ** 7,
        'zb': 1000 ** 7,
        'zettabytes': 1000 ** 7,
        'zebibytes': 1024 ** 7,
        'YiB': 1024 ** 8,
        'YB': 1000 ** 8,
        'yB': 1024 ** 8,
        'Yb': 1000 ** 8,
        'yb': 1000 ** 8,
        'yottabytes': 1000 ** 8,
        'yobibytes': 1024 ** 8,
    }

    return lookup_unit_table(_UNIT_TABLE, s)
def parse_count(s):
    """Parse a view-count-like string ('1.2M', '1,000') into an int."""
    if s is None:
        return None

    s = s.strip()

    # Plain number (possibly with thousands separators)
    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)

    _UNIT_TABLE = {
        'k': 1000,
        'K': 1000,
        'm': 1000 ** 2,
        'M': 1000 ** 2,
        'kk': 1000 ** 2,
        'KK': 1000 ** 2,
    }

    return lookup_unit_table(_UNIT_TABLE, s)
def month_by_name(name, lang='en'):
    """ Return the number of a month by (locale-independently) English name """

    month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])

    try:
        return month_names.index(name) + 1
    except ValueError:
        return None
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviations """

    try:
        return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
    except ValueError:
        return None
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML"""
    # Leave already-valid entities and character references untouched
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
        '&amp;',
        xml_str)
def setproctitle(title):
    """Best-effort: set the process title via prctl(PR_SET_NAME) on Linux."""
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    buf = ctypes.create_string_buffer(len(title_bytes))
    buf.value = title_bytes
    try:
        # 15 == PR_SET_NAME
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
def remove_start(s, start):
    """Strip *start* from the beginning of *s* when present; tolerate None."""
    if s is not None and s.startswith(start):
        return s[len(start):]
    return s
def remove_end(s, end):
    """Strip *end* from the end of *s* when present; tolerate None.

    Guard against an empty *end*: every string "endswith" '' and
    s[:-0] would wrongly truncate s to ''.
    """
    if s is not None and end and s.endswith(end):
        return s[:-len(end)]
    return s
def remove_quotes(s):
    """Strip one matching pair of surrounding single or double quotes."""
    if s is None or len(s) < 2:
        return s
    for quote in ('"', "'", ):
        if s[0] == quote and s[-1] == quote:
            return s[1:-1]
    return s
def url_basename(url):
    """Return the last path component of *url* ('' when the path is empty)."""
    parsed_path = compat_urlparse.urlparse(url).path
    components = parsed_path.strip('/').split('/')
    return components[-1]
class HEADRequest(compat_urllib_request.Request):
    # Request subclass that always issues HTTP HEAD
    def get_method(self):
        return 'HEAD'
class PUTRequest(compat_urllib_request.Request):
    # Request subclass that always issues HTTP PUT
    def get_method(self):
        return 'PUT'
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Coerce *v* to int (optionally via attribute *get_attr* and scaling); *default* on failure."""
    if get_attr:
        if v is not None:
            v = getattr(v, get_attr, None)
    if v == '':
        v = None
    if v is None:
        return default
    try:
        return int(v) * invscale // scale
    except (ValueError, TypeError):
        return default
def str_or_none(v, default=None):
    """Stringify *v*, returning *default* when it is None."""
    if v is None:
        return default
    return compat_str(v)
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if int_str is None:
        return None
    # Drop thousands separators and a leading '+'
    int_str = re.sub(r'[,\.\+]', '', int_str)
    return int(int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Coerce *v* to float with optional scaling; *default* on failure/None."""
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    except (ValueError, TypeError):
        return default
def strip_or_none(v):
    """Return v.strip(), passing None through unchanged."""
    if v is None:
        return None
    return v.strip()
def parse_duration(s):
    """Parse a duration string ('1:23:45', '2h 3min', '90s', ...) into seconds."""
    if not isinstance(s, compat_basestring):
        return None

    s = s.strip()

    days, hours, mins, secs, ms = [None] * 5
    # Colon-separated form: [[[DD:]HH:]MM:]SS[.ms]
    m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?$', s)
    if m:
        days, hours, mins, secs, ms = m.groups()
    else:
        # Verbose / ISO-8601-like form: '1d 2h 3min 4.5s', 'PT1H2M3S', ...
        m = re.match(
            r'''(?ix)(?:P?T)?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?\s*
                )?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?$''', s)
        if m:
            days, hours, mins, secs, ms = m.groups()
        else:
            # Fractional hours/minutes: '2.5 hours', '10.5 minutes'
            m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)$', s)
            if m:
                hours, mins = m.groups()
            else:
                return None

    duration = 0
    if secs:
        duration += float(secs)
    if mins:
        duration += float(mins) * 60
    if hours:
        duration += float(hours) * 60 * 60
    if days:
        duration += float(days) * 24 * 60 * 60
    if ms:
        duration += float(ms)
    return duration
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert *ext* before the file extension, e.g. 'a.mp4' -> 'a.temp.mp4'.

    When *expected_real_ext* is given and the real extension differs,
    *ext* is appended after the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if not expected_real_ext or real_ext[1:] == expected_real_ext:
        return '{0}.{1}{2}'.format(name, ext, real_ext)
    return '{0}.{1}'.format(filename, ext)
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the file extension with *ext*, e.g. 'a.mp4' -> 'a.webm'.

    When *expected_real_ext* is given and the real extension differs,
    *ext* is appended to the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if not expected_real_ext or real_ext[1:] == expected_real_ext:
        base = name
    else:
        base = filename
    return '{0}.{1}'.format(base, ext)
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    try:
        subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    except OSError:
        return False
    return exe
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    try:
        out, _ = subprocess.Popen(
            [encodeArgument(exe)] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
    except OSError:
        return False
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from *output*; *unrecognized* when none matches."""
    assert isinstance(output, compat_str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    m = re.search(version_re, output)
    if m:
        return m.group(1)
    else:
        return unrecognized
class PagedList(object):
    # Abstract base; subclasses implement getslice(start, end)
    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())
class OnDemandPagedList(PagedList):
    """Paged list that fetches pages lazily via *pagefunc*."""

    def __init__(self, pagefunc, pagesize, use_cache=False):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._use_cache = use_cache
        if use_cache:
            self._cache = {}

    def getslice(self, start=0, end=None):
        res = []
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = None
            if self._use_cache:
                page_results = self._cache.get(pagenum)
            if page_results is None:
                page_results = list(self._pagefunc(pagenum))
            if self._use_cache:
                self._cache[pagenum] = page_results

            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
class InAdvancePagedList(PagedList):
    """Paged list whose total page count is known up front."""

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        res = []
        start_page = start // self._pagesize
        end_page = (
            self._pagecount if end is None else (end // self._pagesize + 1))
        skip_elems = start - start_page * self._pagesize
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                # Only the first fetched page needs the offset applied
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    page = page[:only_more]
                    res.extend(page)
                    break
            res.extend(page)
        return res
def uppercase_escape(s):
    """Decode literal '\\UXXXXXXXX' escapes in *s* into real characters."""
    unicode_escape = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda m: unicode_escape(m.group(0))[0],
        s)
def lowercase_escape(s):
    """Decode literal '\\uXXXX' escapes in *s* into real characters."""
    unicode_escape = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\u[0-9a-fA-F]{4}',
        lambda m: unicode_escape(m.group(0))[0],
        s)
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    if sys.version_info < (3, 0) and isinstance(s, compat_str):
        s = s.encode('utf-8')
    # Keep RFC 3986 reserved/sub-delim characters intact
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    url_parsed = compat_urllib_parse_urlparse(url)
    return url_parsed._replace(
        # Hostnames are encoded with IDNA, the rest with percent-escapes
        netloc=url_parsed.netloc.encode('idna').decode('ascii'),
        path=escape_rfc3986(url_parsed.path),
        params=escape_rfc3986(url_parsed.params),
        query=escape_rfc3986(url_parsed.query),
        fragment=escape_rfc3986(url_parsed.fragment)
    ).geturl()
def read_batch_urls(batch_fd):
    """Read URLs from a batch file object, skipping comments and blanks."""
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        BOM_UTF8 = '\xef\xbb\xbf'
        if url.startswith(BOM_UTF8):
            url = url[len(BOM_UTF8):]
        url = url.strip()
        # '#', ';' and ']' start comment lines
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode the given arguments and return the result as ASCII bytes."""
    encoded = compat_urllib_parse_urlencode(*args, **kargs)
    return encoded.encode('ascii')
def update_url_query(url, query):
    """Merge the *query* dict into the query string of *url*."""
    if not query:
        return url
    parsed_url = compat_urlparse.urlparse(url)
    qs = compat_parse_qs(parsed_url.query)
    qs.update(query)
    return compat_urlparse.urlunparse(parsed_url._replace(
        query=compat_urllib_parse_urlencode(qs, True)))
def update_Request(req, url=None, data=None, headers={}, query={}):
    """Build a new Request from *req*, optionally overriding url/data/headers/query."""
    req_headers = req.headers.copy()
    req_headers.update(headers)
    req_data = data or req.data
    req_url = update_url_query(url or req.get_full_url(), query)
    req_get_method = req.get_method()
    # Preserve the HTTP verb of the original request
    if req_get_method == 'HEAD':
        req_type = HEADRequest
    elif req_get_method == 'PUT':
        req_type = PUTRequest
    else:
        req_type = compat_urllib_request.Request
    new_req = req_type(
        req_url, data=req_data, headers=req_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Get the first usable value for one key or a sequence of keys."""
    if isinstance(key_or_keys, (list, tuple)):
        for key in key_or_keys:
            if key not in d or d[key] is None or skip_false_values and not d[key]:
                continue
            return d[key]
        return default
    return d.get(key_or_keys, default)
def try_get(src, getter, expected_type=None):
    """Apply *getter* to *src*, swallowing common lookup errors; None on failure
    or when the result is not of *expected_type*."""
    try:
        v = getter(src)
    except (AttributeError, KeyError, TypeError, IndexError):
        pass
    else:
        if expected_type is None or isinstance(v, expected_type):
            return v
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return *string* as compat_str, decoding bytes with *encoding* if needed."""
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
2044 TV_PARENTAL_GUIDELINES = {
def parse_age_limit(s):
    """Normalize an age limit (int, 'NN+', MPAA or TV rating) to an int or None."""
    if type(s) == int:
        return s if 0 <= s <= 21 else None
    if not isinstance(s, compat_basestring):
        return None
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if m:
        return int(m.group('age'))
    if s in US_RATINGS:
        return US_RATINGS[s]
    return TV_PARENTAL_GUIDELINES.get(s)
def strip_jsonp(code):
    """Strip a JSONP callback wrapper, returning the bare JSON payload."""
    jsonp_rex = r'(?s)^[a-zA-Z0-9_.$]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$'
    return re.sub(jsonp_rex, r'\1', code)
def js_to_json(code):
    """Convert a JavaScript object literal into valid JSON text."""
    def fix_kv(m):
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        elif v.startswith('/*') or v == ',':
            # Comments and trailing commas are dropped
            return ""

        if v[0] in ("'", '"'):
            # Normalize string escapes to JSON-compatible ones
            v = re.sub(r'(?s)\\.|"', lambda m: {
                '"': '\\"',
                "\\'": "'",
                '\\\n': '',
                '\\x': '\\u00',
            }.get(m.group(0), m.group(0)), v[1:-1])

        # Hex/octal integers (possibly used as object keys)
        INTEGER_TABLE = (
            (r'^(0[xX][0-9a-fA-F]+)\s*:?$', 16),
            (r'^(0+[0-7]+)\s*:?$', 8),
        )

        for regex, base in INTEGER_TABLE:
            im = re.match(regex, v)
            if im:
                i = int(im.group(1), base)
                return '"%d":' % i if v.endswith(':') else '%d' % i

        return '"%s"' % v

    return re.sub(r'''(?sx)
        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
        /\*.*?\*/|,(?=\s*[\]}])|
        [a-zA-Z_][.a-zA-Z_0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:\s*:)?|
        [0-9]+(?=\s*:)
        ''', fix_kv, code)
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        try:
            return quality_ids.index(qid)
        except ValueError:
            # Unknown quality sorts below all known ones
            return -1
    return q
2121 DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) > length:
        return s[:length - len(ELLIPSES)] + ELLIPSES
    return s
def version_tuple(v):
    """Split a version string like '2016.08.12-1' into a tuple of ints."""
    parts = re.split(r'[-.]', v)
    return tuple(map(int, parts))
def is_outdated_version(version, limit, assume_new=True):
    """True when *version* is older than *limit*; *assume_new* decides unparsable cases."""
    if not version:
        return not assume_new
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return not assume_new
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    running_from_zip = isinstance(globals().get('__loader__'), zipimporter)
    frozen = hasattr(sys, 'frozen')
    return running_from_zip or frozen
def args_to_str(args):
    # Get a short string representation for a subprocess command
    quoted = [compat_shlex_quote(a) for a in args]
    return ' '.join(quoted)
def error_to_compat_str(err):
    """Stringify an exception, decoding Python 2 byte messages properly."""
    err_str = str(err)
    # On python 2 error byte string must be decoded with proper
    # encoding rather than ascii
    if sys.version_info[0] < 3:
        err_str = err_str.decode(preferredencoding())
    return err_str
def mimetype2ext(mt):
    """Map a MIME type to a file extension; falls back to the subtype itself."""
    if mt is None:
        return None

    ext = {
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
        # it's the most popular one
        'audio/mpeg': 'mp3',
    }.get(mt)
    if ext is not None:
        return ext

    _, _, res = mt.rpartition('/')
    res = res.split(';')[0].strip().lower()

    return {
        '3gpp': '3gp',
        'smptett+xml': 'tt',
        'srt': 'srt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'vtt': 'vtt',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-wmv': 'wmv',
        'mpegurl': 'm3u8',
        'x-mpegurl': 'm3u8',
        'vnd.apple.mpegurl': 'm3u8',
        'dash+xml': 'mpd',
        'f4m': 'f4m',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.ms-sstr+xml': 'ism',
        'quicktime': 'mov',
    }.get(res, res)
def parse_codecs(codecs_str):
    # http://tools.ietf.org/html/rfc6381
    if not codecs_str:
        return {}
    splited_codecs = list(filter(None, map(
        lambda str: str.strip(), codecs_str.strip().strip(',').split(','))))
    vcodec, acodec = None, None
    for full_codec in splited_codecs:
        codec = full_codec.split('.')[0]
        if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v'):
            if not vcodec:
                vcodec = full_codec
        elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3'):
            if not acodec:
                acodec = full_codec
        else:
            write_string('WARNING: Unknown codec %s' % full_codec, sys.stderr)
    if not vcodec and not acodec:
        # Both unknown: with exactly two codecs assume video+audio order
        if len(splited_codecs) == 2:
            return {
                'vcodec': splited_codecs[0],
                'acodec': splited_codecs[1],
            }
        elif len(splited_codecs) == 1:
            return {
                'vcodec': 'none',
                'acodec': splited_codecs[0],
            }
    else:
        return {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
        }
    return {}
def urlhandle_detect_ext(url_handle):
    """Guess a file extension from a response's headers."""
    getheader = url_handle.headers.get

    cd = getheader('Content-Disposition')
    if cd:
        m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
        if m:
            e = determine_ext(m.group('filename'), default_ext=None)
            if e:
                return e

    return mimetype2ext(getheader('Content-Type'))
def encode_data_uri(data, mime_type):
    """Build an RFC 2397 'data:' URI from raw bytes and a MIME type."""
    b64 = base64.b64encode(data).decode('ascii')
    return 'data:{0};base64,{1}'.format(mime_type, b64)
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """

    if age_limit is None:  # No limit set
        return False
    if content_limit is None:
        return False  # Content available for everyone
    return age_limit < content_limit
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """

    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            s = first_bytes[len(bom):].decode(enc, 'replace')
            break
    else:
        # No BOM found: assume UTF-8
        s = first_bytes.decode('utf-8', 'replace')

    return re.match(r'^\s*<', s)
def determine_protocol(info_dict):
    """Determine the download protocol for a format dict."""
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = info_dict['url']
    if url.startswith('rtmp'):
        return 'rtmp'
    elif url.startswith('mms'):
        return 'mms'
    elif url.startswith('rtsp'):
        return 'rtsp'

    ext = determine_ext(url)
    if ext == 'm3u8':
        return 'm3u8'
    elif ext == 'f4m':
        return 'f4m'

    return compat_urllib_parse_urlparse(url).scheme
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    table = [header_row] + data
    max_lens = []
    for col in zip(*table):
        max_lens.append(max(len(compat_str(v)) for v in col))
    fmt_parts = ['%-' + compat_str(width + 1) + 's' for width in max_lens[:-1]]
    format_str = ' '.join(fmt_parts) + '%s'
    rendered = [format_str % tuple(row) for row in table]
    return '\n'.join(rendered)
2320 def _match_one(filter_part, dct):
2321 COMPARISON_OPERATORS = {
2329 operator_rex = re.compile(r'''(?x)\s*
2331 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
2333 (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
2334 (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
2337 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
2338 m = operator_rex.search(filter_part)
2340 op = COMPARISON_OPERATORS[m.group('op')]
2341 if m.group('strval') is not None:
2342 if m.group('op') not in ('=', '!='):
2344 'Operator %s does not support string values!' % m.group('op'))
2345 comparison_value = m.group('strval')
2348 comparison_value = int(m.group('intval'))
2350 comparison_value = parse_filesize(m.group('intval'))
2351 if comparison_value is None:
2352 comparison_value = parse_filesize(m.group('intval') + 'B')
2353 if comparison_value is None:
2355 'Invalid integer value %r in filter part %r' % (
2356 m.group('intval'), filter_part))
2357 actual_value = dct.get(m.group('key'))
2358 if actual_value is None:
2359 return m.group('none_inclusive')
2360 return op(actual_value, comparison_value)
2363 '': lambda v: v is not None,
2364 '!': lambda v: v is None,
2366 operator_rex = re.compile(r'''(?x)\s*
2367 (?P<op>%s)\s*(?P<key>[a-z_]+)
2369 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
2370 m = operator_rex.search(filter_part)
2372 op = UNARY_OPERATORS[m.group('op')]
2373 actual_value = dct.get(m.group('key'))
2374 return op(actual_value)
2376 raise ValueError('Invalid filter part %r' % filter_part)
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """

    # '&' separates sub-filters; all must pass
    return all(
        _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
def match_filter_func(filter_str):
    """Build a --match-filter callback: None to accept, a skip message to reject."""
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            return None
        else:
            video_title = info_dict.get('title', info_dict.get('id', 'video'))
            return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
def parse_dfxp_time_expr(time_expr):
    """Parse a TTML/DFXP time expression ('12.3s' or 'HH:MM:SS[.f]') into seconds."""
    if not time_expr:
        return

    mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if mobj:
        return float(mobj.group('time_offset'))

    mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if mobj:
        # HH:MM:SS:FF is treated like HH:MM:SS.FF
        return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
def srt_subtitles_timecode(seconds):
    """Format a number of seconds as an SRT timecode: HH:MM:SS,mmm."""
    hours = seconds / 3600
    minutes = (seconds % 3600) / 60
    secs = seconds % 60
    millis = (seconds % 1) * 1000
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, millis)
def dfxp2srt(dfxp_data):
    """Convert DFXP/TTML subtitle markup into SRT text."""
    _x = functools.partial(xpath_with_ns, ns_map={
        'ttml': 'http://www.w3.org/ns/ttml',
        'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
        'ttaf1_0604': 'http://www.w3.org/2006/04/ttaf1',
    })

    class TTMLPElementParser(object):
        # Accumulates text content of a <p>, turning <br/> into newlines
        out = ''

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
                self.out += '\n'

        def end(self, tag):
            pass

        def data(self, data):
            self.out += data

        def close(self):
            return self.out.strip()

    def parse_node(node):
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
    out = []
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall(_x('.//ttaf1_0604:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            if not dur:
                continue
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
def cli_option(params, command_option, param):
    """Return [command_option, value] when params[param] is set, else []."""
    param = params.get(param)
    if param:
        param = compat_str(param)
    return [command_option, param] if param is not None else []
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Render a boolean param as CLI arguments, optionally joined by *separator*."""
    param = params.get(param)
    assert isinstance(param, bool)
    if separator:
        return [command_option + separator + (true_value if param else false_value)]
    return [command_option, true_value if param else false_value]
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Return [command_option] when params[param] equals *expected_value*, else []."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
def cli_configuration_args(params, param, default=[]):
    """Return the list of extra CLI args stored in params[param], or *default*."""
    ex_args = params.get(param)
    if ex_args is None:
        return default
    assert isinstance(ex_args, list)
    return ex_args
2496 class ISO639Utils(object):
2497 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
2686 def short2long(cls, code):
2687 """Convert language code from ISO 639-1 to ISO 639-2/T"""
2688 return cls._lang_map.get(code[:2])
2691 def long2short(cls, code):
2692 """Convert language code from ISO 639-2/T to ISO 639-1"""
2693 for short_name, long_name in cls._lang_map.items():
2694 if long_name == code:
2698 class ISO3166Utils(object):
2699 # From http://data.okfn.org/data/core/country-list
2701 'AF': 'Afghanistan',
2702 'AX': 'Åland Islands',
2705 'AS': 'American Samoa',
2710 'AG': 'Antigua and Barbuda',
2727 'BO': 'Bolivia, Plurinational State of',
2728 'BQ': 'Bonaire, Sint Eustatius and Saba',
2729 'BA': 'Bosnia and Herzegovina',
2731 'BV': 'Bouvet Island',
2733 'IO': 'British Indian Ocean Territory',
2734 'BN': 'Brunei Darussalam',
2736 'BF': 'Burkina Faso',
2742 'KY': 'Cayman Islands',
2743 'CF': 'Central African Republic',
2747 'CX': 'Christmas Island',
2748 'CC': 'Cocos (Keeling) Islands',
2752 'CD': 'Congo, the Democratic Republic of the',
2753 'CK': 'Cook Islands',
2755 'CI': 'Côte d\'Ivoire',
2760 'CZ': 'Czech Republic',
2764 'DO': 'Dominican Republic',
2767 'SV': 'El Salvador',
2768 'GQ': 'Equatorial Guinea',
2772 'FK': 'Falkland Islands (Malvinas)',
2773 'FO': 'Faroe Islands',
2777 'GF': 'French Guiana',
2778 'PF': 'French Polynesia',
2779 'TF': 'French Southern Territories',
2794 'GW': 'Guinea-Bissau',
2797 'HM': 'Heard Island and McDonald Islands',
2798 'VA': 'Holy See (Vatican City State)',
2805 'IR': 'Iran, Islamic Republic of',
2808 'IM': 'Isle of Man',
2818 'KP': 'Korea, Democratic People\'s Republic of',
2819 'KR': 'Korea, Republic of',
2822 'LA': 'Lao People\'s Democratic Republic',
2828 'LI': 'Liechtenstein',
2832 'MK': 'Macedonia, the Former Yugoslav Republic of',
2839 'MH': 'Marshall Islands',
2845 'FM': 'Micronesia, Federated States of',
2846 'MD': 'Moldova, Republic of',
2857 'NL': 'Netherlands',
2858 'NC': 'New Caledonia',
2859 'NZ': 'New Zealand',
2864 'NF': 'Norfolk Island',
2865 'MP': 'Northern Mariana Islands',
2870 'PS': 'Palestine, State of',
2872 'PG': 'Papua New Guinea',
2875 'PH': 'Philippines',
2879 'PR': 'Puerto Rico',
2883 'RU': 'Russian Federation',
2885 'BL': 'Saint Barthélemy',
2886 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
2887 'KN': 'Saint Kitts and Nevis',
2888 'LC': 'Saint Lucia',
2889 'MF': 'Saint Martin (French part)',
2890 'PM': 'Saint Pierre and Miquelon',
2891 'VC': 'Saint Vincent and the Grenadines',
2894 'ST': 'Sao Tome and Principe',
2895 'SA': 'Saudi Arabia',
2899 'SL': 'Sierra Leone',
2901 'SX': 'Sint Maarten (Dutch part)',
2904 'SB': 'Solomon Islands',
2906 'ZA': 'South Africa',
2907 'GS': 'South Georgia and the South Sandwich Islands',
2908 'SS': 'South Sudan',
2913 'SJ': 'Svalbard and Jan Mayen',
2916 'CH': 'Switzerland',
2917 'SY': 'Syrian Arab Republic',
2918 'TW': 'Taiwan, Province of China',
2920 'TZ': 'Tanzania, United Republic of',
2922 'TL': 'Timor-Leste',
2926 'TT': 'Trinidad and Tobago',
2929 'TM': 'Turkmenistan',
2930 'TC': 'Turks and Caicos Islands',
2934 'AE': 'United Arab Emirates',
2935 'GB': 'United Kingdom',
2936 'US': 'United States',
2937 'UM': 'United States Minor Outlying Islands',
2941 'VE': 'Venezuela, Bolivarian Republic of',
2943 'VG': 'Virgin Islands, British',
2944 'VI': 'Virgin Islands, U.S.',
2945 'WF': 'Wallis and Futuna',
2946 'EH': 'Western Sahara',
2953 def short2full(cls, code):
2954 """Convert an ISO 3166-2 country code to the corresponding full name"""
2955 return cls._country_map.get(code.upper())
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler honoring a per-request 'Ytdl-request-proxy' header."""

    def __init__(self, proxies=None):
        # Set default handlers
        for type in ('http', 'https'):
            setattr(self, '%s_open' % type,
                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
                        meth(r, proxy, type))
        return compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            proxy = req_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # youtube-dl's http/https handlers do wrapping the socket with socks
            return None
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''

    # OHDave's scheme interprets the reversed bytes as a big-endian integer
    reversed_hex = binascii.hexlify(data[::-1])
    payload = int(reversed_hex, 16)
    encrypted = pow(payload, exponent, modulus)
    return '%x' % encrypted
def encode_base_n(num, n, table=None):
    """Encode the non-negative integer num in base n.

    table optionally supplies the digit characters; by default the first n
    characters of 0-9a-zA-Z are used, so n may be at most 62 without a
    custom table.  Raises ValueError when n exceeds the table length.
    """
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    # Zero has no digits in the loop below; return it explicitly.
    if num == 0:
        return table[0]

    # Standard repeated-division conversion, most significant digit first.
    ret = ''
    while num:
        ret = table[num % n] + ret
        num = num // n
    return ret
def decode_packed_codes(code):
    # Decode JavaScript obfuscated with Dean Edwards' p.a.c.k.e.r: the
    # payload embeds the packed source, a numeric base, a symbol count and
    # a '|'-separated symbol table.
    # NOTE(review): several statements of this function are not visible in
    # this chunk; comments describe the visible lines only.
    # Captures: (packed source, base, count, '|'-joined symbol list).
        r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)",
    obfucasted_code, base, count, symbols = mobj.groups()
    symbols = symbols.split('|')
    # Map each base-n encoded index to its symbol; an empty table slot
    # falls back to the encoded index itself.
        base_n_count = encode_base_n(count, base)
        symbol_table[base_n_count] = symbols[count] or base_n_count
    # Substitute every word token in the packed source via the symbol table.
        r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
def parse_m3u8_attributes(attrib):
    """Parse an M3U8/HLS attribute list into a dict.

    attrib: attribute string such as 'BANDWIDTH=1280000,CODECS="mp4a.40.2"'
    Returns a dict mapping attribute names to their string values, with
    surrounding double quotes stripped.
    """
    info = {}
    # A value is either a double-quoted string (which may itself contain
    # commas) or a bare token running up to the next comma / end of string.
    for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
        if val.startswith('"'):
            val = val[1:-1]  # strip surrounding quotes
        info[key] = val
    return info
def urshift(val, n):
    """Unsigned (logical) right shift of the 32-bit value val by n bits,
    as performed by JavaScript's >>> operator."""
    if val < 0:
        # Reinterpret the negative number as its unsigned 32-bit pattern.
        val += 0x100000000
    return val >> n
3051 # Based on png2str() written by @gdkchan and improved by @yokrysty
3052 # Originally posted at https://github.com/rg3/youtube-dl/issues/9706
def decode_png(png_data):
    # Reference: https://www.w3.org/TR/PNG/
    # NOTE(review): several statements of this function are not visible in
    # this chunk; comments describe the visible lines only.
    header = png_data[8:]

    # A PNG file starts with an 8-byte signature, immediately followed by
    # the IHDR chunk (its 4-byte type tag sits at bytes 4-8 of the first
    # chunk: length(4) | type(4) | ...).
    if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
        raise IOError('Not a valid PNG file.')

    # PNG integers are big-endian; pick the struct format by byte width.
    int_map = {1: '>B', 2: '>H', 4: '>I'}
    unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]

    # --- Chunk parsing: each chunk is length(4) | type(4) | data | CRC(4) ---
        length = unpack_integer(header[:4])

        chunk_type = header[:4]

        chunk_data = header[:length]
        header = header[length:]

        header = header[4:]  # Skip CRC

    # IHDR is always the first chunk; its first 8 bytes hold width, height.
    ihdr = chunks[0]['data']

    width = unpack_integer(ihdr[:4])
    height = unpack_integer(ihdr[4:8])

    # Image data may be split across several IDAT chunks; concatenate them.
    for chunk in chunks:
        if chunk['type'] == b'IDAT':
            idat += chunk['data']

        raise IOError('Unable to read PNG data.')

    # The concatenated IDAT payload is a single zlib-compressed stream.
    decompressed_data = bytearray(zlib.decompress(idat))

    # Look up an already-decoded byte by absolute byte index.
    def _get_pixel(idx):

    # Undo the per-scanline filtering (PNG filter types 0-4); each scanline
    # is prefixed with a single filter-type byte.
    for y in range(height):
        basePos = y * (1 + stride)
        filter_type = decompressed_data[basePos]

        pixels.append(current_row)

        for x in range(stride):
            color = decompressed_data[1 + basePos + x]
            basex = y * stride + x

            # Filter neighbours: 'left' is 3 bytes back (same channel of the
            # previous pixel, i.e. 3 bytes per pixel), 'up' is one scanline
            # back.
                left = _get_pixel(basex - 3)
                up = _get_pixel(basex - stride)

            if filter_type == 1:  # Sub
                color = (color + left) & 0xff
            elif filter_type == 2:  # Up
                color = (color + up) & 0xff
            elif filter_type == 3:  # Average
                color = (color + ((left + up) >> 1)) & 0xff
            elif filter_type == 4:  # Paeth
                # Paeth predictor: pick whichever of left / up / up-left is
                # closest to p = left + up - up_left.
                    c = _get_pixel(basex - stride - 3)

                if pa <= pb and pa <= pc:
                    color = (color + a) & 0xff
                    color = (color + b) & 0xff
                    color = (color + c) & 0xff

            current_row.append(color)

    # Returns the image dimensions and the rows of decoded byte values.
    return width, height, pixels
def write_xattr(path, key, value):
    """Set the extended attribute `key` to `value` (bytes) on the file at
    `path`, preferring the pyxattr/xattr Python modules and falling back to
    NTFS Alternate Data Streams on Windows or the setfattr/xattr CLI tools
    elsewhere.

    Raises XAttrMetadataError when a backend exists but fails, and
    XAttrUnavailableError when no usable backend can be found.

    NOTE(review): the try/except scaffolding of this function is not fully
    visible in this chunk; comments describe the visible lines only.
    """
    # This mess below finds the best xattr tool for the job
        # try the pyxattr module...
        # Unicode arguments are not supported in python-pyxattr until
        # version 0.5.0
        # See https://github.com/rg3/youtube-dl/issues/5498
        pyxattr_required_version = '0.5.0'
        if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
            # TODO: fallback to CLI tools
            raise XAttrUnavailableError(
                'python-pyxattr is detected but is too old. '
                'youtube-dl requires %s or above while your version is %s. '
                'Falling back to other xattr implementations' % (
                    pyxattr_required_version, xattr.__version__))

            # pyxattr does the actual syscall; OS-level failures surface as
            # EnvironmentError and are re-raised as the project's error type.
            xattr.set(path, key, value)
        except EnvironmentError as e:
            raise XAttrMetadataError(e.errno, e.strerror)

        if compat_os_name == 'nt':
            # Write xattrs to NTFS Alternate Data Streams:
            # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
            # ':' separates filename from stream name, so it cannot appear in
            # the key itself.
            assert ':' not in key
            assert os.path.exists(path)

            ads_fn = path + ':' + key
                with open(ads_fn, 'wb') as f:
            except EnvironmentError as e:
                raise XAttrMetadataError(e.errno, e.strerror)

            user_has_setfattr = check_executable('setfattr', ['--version'])
            user_has_xattr = check_executable('xattr', ['-h'])

            if user_has_setfattr or user_has_xattr:

                # CLI tools take the value as a text argument.
                value = value.decode('utf-8')
                if user_has_setfattr:
                    executable = 'setfattr'
                    opts = ['-n', key, '-v', value]
                elif user_has_xattr:
                    executable = 'xattr'
                    opts = ['-w', key, value]

                # Encode executable, options and path for the platform before
                # spawning the subprocess.
                cmd = ([encodeFilename(executable, True)] +
                       [encodeArgument(o) for o in opts] +
                       [encodeFilename(path, True)])

                    p = subprocess.Popen(
                        cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
                except EnvironmentError as e:
                    raise XAttrMetadataError(e.errno, e.strerror)
                # Non-zero exit from the tool is reported with its stderr.
                stdout, stderr = p.communicate()
                stderr = stderr.decode('utf-8', 'replace')
                if p.returncode != 0:
                    raise XAttrMetadataError(p.returncode, stderr)

                # On Unix, and can't find pyxattr, setfattr, or xattr.
                if sys.platform.startswith('linux'):
                    raise XAttrUnavailableError(
                        "Couldn't find a tool to set the xattrs. "
                        "Install either the python 'pyxattr' or 'xattr' "
                        "modules, or the GNU 'attr' package "
                        "(which contains the 'setfattr' tool).")
                    raise XAttrUnavailableError(
                        "Couldn't find a tool to set the xattrs. "
                        "Install either the python 'xattr' module, "
                        "or the 'xattr' binary.")